update hotword finetuning (#1452)
* update hotword finetuning
* update scripts
| New file |
| | |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
# method1, finetune from model hub
#
# Finetunes the contextual paraformer model fetched by name from the model hub.
# CUDA_VISIBLE_DEVICES, data_dir and output_dir may be overridden from the
# environment; when unset, the defaults below are used.

# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1}"
# one training process per visible GPU: count the comma-separated device ids
gpu_num=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')

# data dir, which contains: train.jsonl, val.jsonl
data_dir="${data_dir:-/Users/zhifu/funasr1.0/data/list}"

## generate jsonl from wav.scp and text.txt
#python -m funasr.datasets.audio_datasets.scp2jsonl \
#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
#++data_type_list='["source", "target"]' \
#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl

train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"


# exp output dir
output_dir="${output_dir:-/Users/zhifu/exp}"
log_file="${output_dir}/log.txt"


mkdir -p "${output_dir}"
echo "log_file: ${log_file}"

# NOTE(review): funasr/bin/train.py is resolved relative to the current
# directory, so this presumably has to be launched from the repository root;
# confirm against the intended working directory.
# stdout and stderr are both written to ${log_file} (portable form of `&>`).
torchrun \
--nnodes 1 \
--nproc_per_node "${gpu_num}" \
funasr/bin/train.py \
++model="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" \
++model_revision="v2.0.5" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=32 \
++dataset_conf.batch_type="example" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=20 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" > "${log_file}" 2>&1
| New file |
| | |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
# method2, finetune from local model
#
# Clones the contextual paraformer model repository next to this script's
# working directory, builds train/val jsonl lists from wav.scp + text files,
# and finetunes from the cloned config and checkpoint.
# CUDA_VISIBLE_DEVICES, data_dir and output_dir may be overridden from the
# environment; when unset, the defaults below are used.

workspace=$(pwd)

echo "current path: ${workspace}" # /xxxx/funasr/examples/industrial_data_pretraining/paraformer

# download model
local_path_root="${workspace}/modelscope_models"
mkdir -p "${local_path_root}"
# NOTE(review): the directory name does not mention "contextual" although the
# contextual model is cloned into it; kept unchanged so existing clones are reused.
local_path="${local_path_root}/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
# `git clone` refuses to write into an existing non-empty directory, so only
# clone when no checkout is present yet (keeps re-runs quiet).
if [ ! -d "${local_path}/.git" ]; then
  git clone https://www.modelscope.cn/iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404.git "${local_path}"
fi


# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1}"
# one training process per visible GPU: count the comma-separated device ids
gpu_num=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')

# data dir, which contains: train_wav.scp, train_text.txt, val_wav.scp, val_text.txt
# (train.jsonl and val.jsonl are generated into it below)
data_dir="${data_dir:-../../../data/list}"

train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"


# generate train.jsonl and val.jsonl from wav.scp and text.txt
python -m funasr.datasets.audio_datasets.scp2jsonl \
++scp_file_list="[\"${data_dir}/train_wav.scp\", \"${data_dir}/train_text.txt\"]" \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"

python -m funasr.datasets.audio_datasets.scp2jsonl \
++scp_file_list="[\"${data_dir}/val_wav.scp\", \"${data_dir}/val_text.txt\"]" \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"


# token list and CMVN statistics shipped inside the cloned model directory
tokens="${local_path}/tokens.json"
cmvn_file="${local_path}/am.mvn"

# output dir
output_dir="${output_dir:-./outputs}"
log_file="${output_dir}/log.txt"

config_name="config.yaml"

# checkpoint used to initialise the model before finetuning
init_param="${local_path}/model.pt"

mkdir -p "${output_dir}"
echo "log_file: ${log_file}"

# stdout and stderr are both written to ${log_file} (portable form of `&>`).
torchrun \
--nnodes 1 \
--nproc_per_node "${gpu_num}" \
../../../funasr/bin/train.py \
--config-path "${local_path}" \
--config-name "${config_name}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++dataset_conf.batch_size=32 \
++dataset_conf.batch_type="example" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=20 \
++optim_conf.lr=0.0002 \
++train_conf.log_interval=1 \
++init_param="${init_param}" \
++output_dir="${output_dir}" > "${log_file}" 2>&1
| New file |
| | |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
# method1, finetune from model hub
#
# Finetunes the seaco paraformer model fetched by name from the model hub.
# CUDA_VISIBLE_DEVICES, data_dir and output_dir may be overridden from the
# environment; when unset, the defaults below are used.

# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1}"
# one training process per visible GPU: count the comma-separated device ids
gpu_num=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')

# data dir, which contains: train.jsonl, val.jsonl
data_dir="${data_dir:-/Users/zhifu/funasr1.0/data/list}"

## generate jsonl from wav.scp and text.txt
#python -m funasr.datasets.audio_datasets.scp2jsonl \
#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
#++data_type_list='["source", "target"]' \
#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl

train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"


# exp output dir
output_dir="${output_dir:-/Users/zhifu/exp}"
log_file="${output_dir}/log.txt"


mkdir -p "${output_dir}"
echo "log_file: ${log_file}"

# NOTE(review): funasr/bin/train.py is resolved relative to the current
# directory, so this presumably has to be launched from the repository root;
# confirm against the intended working directory.
# stdout and stderr are both written to ${log_file} (portable form of `&>`).
torchrun \
--nnodes 1 \
--nproc_per_node "${gpu_num}" \
funasr/bin/train.py \
++model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
++model_revision="v2.0.6" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=32 \
++dataset_conf.batch_type="example" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=20 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" > "${log_file}" 2>&1
| New file |
| | |
| | | # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. |
| | | # MIT License (https://opensource.org/licenses/MIT) |
| | | |
# method2, finetune from local model
#
# Clones the seaco paraformer model repository next to this script's working
# directory, builds train/val jsonl lists from wav.scp + text files, and
# finetunes from the cloned config and checkpoint.
# CUDA_VISIBLE_DEVICES, data_dir and output_dir may be overridden from the
# environment; when unset, the defaults below are used.

workspace=$(pwd)

echo "current path: ${workspace}" # /xxxx/funasr/examples/industrial_data_pretraining/paraformer

# download model
local_path_root="${workspace}/modelscope_models"
mkdir -p "${local_path_root}"
# NOTE(review): the directory name does not mention "seaco" although the seaco
# model is cloned into it; kept unchanged so existing clones are reused.
local_path="${local_path_root}/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
# `git clone` refuses to write into an existing non-empty directory, so only
# clone when no checkout is present yet (keeps re-runs quiet).
if [ ! -d "${local_path}/.git" ]; then
  git clone https://www.modelscope.cn/iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git "${local_path}"
fi


# which gpu to train or finetune
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1}"
# one training process per visible GPU: count the comma-separated device ids
gpu_num=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')

# data dir, which contains: train_wav.scp, train_text.txt, val_wav.scp, val_text.txt
# (train.jsonl and val.jsonl are generated into it below)
data_dir="${data_dir:-../../../data/list}"

train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"


# generate train.jsonl and val.jsonl from wav.scp and text.txt
python -m funasr.datasets.audio_datasets.scp2jsonl \
++scp_file_list="[\"${data_dir}/train_wav.scp\", \"${data_dir}/train_text.txt\"]" \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"

python -m funasr.datasets.audio_datasets.scp2jsonl \
++scp_file_list="[\"${data_dir}/val_wav.scp\", \"${data_dir}/val_text.txt\"]" \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"


# token list and CMVN statistics shipped inside the cloned model directory
tokens="${local_path}/tokens.json"
cmvn_file="${local_path}/am.mvn"

# output dir
output_dir="${output_dir:-./outputs}"
log_file="${output_dir}/log.txt"

config_name="config.yaml"

# checkpoint used to initialise the model before finetuning
init_param="${local_path}/model.pt"

mkdir -p "${output_dir}"
echo "log_file: ${log_file}"

# stdout and stderr are both written to ${log_file} (portable form of `&>`).
torchrun \
--nnodes 1 \
--nproc_per_node "${gpu_num}" \
../../../funasr/bin/train.py \
--config-path "${local_path}" \
--config-name "${config_name}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++dataset_conf.batch_size=32 \
++dataset_conf.batch_type="example" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=20 \
++optim_conf.lr=0.0002 \
++train_conf.log_interval=1 \
++init_param="${init_param}" \
++output_dir="${output_dir}" > "${log_file}" 2>&1
| | |
| | | import torch |
| | | import random |
| | | |
| | | from funasr.register import tables |
| | | from funasr.utils.load_utils import extract_fbank, load_audio_text_image_video |
| | |
| | | outputs[key] = torch.nn.utils.rnn.pad_sequence(data_list, batch_first=True, padding_value=pad_value) |
| | | return outputs |
| | | |
| | | |
@tables.register("dataset_classes", "AudioDatasetHotword")
class AudioDatasetHotword(AudioDataset):
    """Audio dataset that also samples random hotword spans from each transcript.

    Intended for finetuning contextual_paraformer and seaco_paraformer.
    ``__getitem__`` picks zero, one or two token spans of the target text as
    hotwords; ``collator`` turns those spans into the ``hotword_pad`` /
    ``hotword_lengths`` batch entries and, when ``seaco_id`` is truthy, into
    ``seaco_label_pad``.
    """

    def __init__(
        self,
        *args,
        seaco_id: int = 0,
        **kwargs,
    ):
        """Build the dataset.

        Args:
            *args: Forwarded unchanged to ``AudioDataset.__init__``.
            seaco_id: Integer label that ``collator`` writes into
                ``seaco_label_pad`` at every valid text position outside a
                hotword span. A falsy value (the default ``0``) disables
                ``seaco_label_pad`` entirely.
            **kwargs: Forwarded unchanged to ``AudioDataset.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.seaco_id = seaco_id

    @staticmethod
    def _generate_index(
        length,
        hotword_min_length=2,
        hotword_max_length=8,
        sample_rate=0.75,
        double_rate=0.1,
        pre_prob=0.0,
        pre_index=None,
        pre_hwlist=None,
    ):
        """Randomly choose hotword span(s) inside a sequence of ``length`` tokens.

        Args:
            length: Number of tokens in the transcript.
            hotword_min_length: Minimum span length in tokens.
            hotword_max_length: Maximum span length in tokens.
            sample_rate: Probability of sampling any hotword at all.
            double_rate: Probability of attempting two hotwords; only applies
                when ``length > hotword_max_length + hotword_min_length + 2``.
            pre_prob: Probability of returning ``pre_index`` instead of
                sampling (only when ``pre_index`` is not ``None``).
            pre_index: Pre-computed index list returned as is when selected.
            pre_hwlist: Unused; kept so the signature stays unchanged.

        Returns:
            ``[-1]`` when no hotword is sampled, ``[start, end]`` for a single
            hotword, or ``[start1, end1, start2, end2]`` for two hotwords.
            ``end`` indices are inclusive (``collator`` slices ``start:end+1``).
        """
        if length < hotword_min_length:
            return [-1]
        if random.random() >= sample_rate:
            return [-1]
        if pre_prob > 0 and random.random() < pre_prob and pre_index is not None:
            return pre_index
        if length == hotword_min_length:
            # The whole (minimum-length) sequence is the only possible span.
            return [0, length - 1]
        if random.random() < double_rate and length > hotword_max_length + hotword_min_length + 2:
            # sample two hotwords in a sentence
            _max_hw_length = min(hotword_max_length, length // 2)
            # first hotword
            start1 = random.randint(0, length // 3)
            end1 = random.randint(start1 + hotword_min_length - 1, start1 + _max_hw_length - 1)
            # second hotword, starting strictly after the first one
            start2 = random.randint(end1 + 1, length - hotword_min_length)
            end2 = random.randint(
                min(length - 1, start2 + hotword_min_length - 1),
                min(length - 1, start2 + hotword_max_length - 1),
            )
            return [start1, end1, start2, end2]
        # single hotword; both bounds are clamped to the last token
        start = random.randint(0, length - hotword_min_length)
        end = random.randint(
            min(length - 1, start + hotword_min_length - 1),
            min(length - 1, start + hotword_max_length - 1),
        )
        return [start, end]

    def __getitem__(self, index):
        """Load one example and sample its hotword span(s).

        Args:
            index: Position of the example in ``self.index_ds``.

        Returns:
            A dict with keys ``speech``, ``speech_lengths``, ``text``,
            ``text_lengths``, ``hotword_indx`` (see ``_generate_index``) and
            ``seaco_id``.
        """
        item = self.index_ds[index]

        source = item["source"]
        data_src = load_audio_text_image_video(source, fs=self.fs)
        if self.preprocessor_speech:
            data_src = self.preprocessor_speech(data_src, fs=self.fs)
        speech, speech_lengths = extract_fbank(
            data_src, data_type=self.data_type, frontend=self.frontend, is_final=True
        )  # speech: [b, T, d]

        target = item["target"]
        if self.preprocessor_text:
            target = self.preprocessor_text(target)
        if self.tokenizer:
            ids = self.tokenizer.encode(target)
            text = torch.tensor(ids, dtype=torch.int64)
        else:
            # NOTE(review): without a tokenizer ``text`` is the raw target and
            # not a tensor; ``collator`` presumably cannot build hotwords from
            # it -- confirm whether this path is ever used with this class.
            ids = target
            text = ids
        ids_lengths = len(ids)
        text_lengths = torch.tensor([ids_lengths], dtype=torch.int32)

        # Pass the plain Python int (not the tensor element) so the sampler
        # works on, and returns, ordinary integers.
        hotword_indx = self._generate_index(ids_lengths)
        return {
            "speech": speech[0, :, :],
            "speech_lengths": speech_lengths,
            "text": text,
            "text_lengths": text_lengths,
            "hotword_indx": hotword_indx,
            "seaco_id": self.seaco_id,
        }

    def collator(self, samples: list = None):
        """Batch the examples and build the hotword tensors.

        Tensor fields are padded with ``self.int_pad_value`` (int64 tensors) or
        ``self.float_pad_value`` (all other tensors).

        Args:
            samples: List of dicts as returned by ``__getitem__``.

        Returns:
            The padded batch dict, extended with:
            ``hotword_pad`` -- all sampled hotword token spans of the batch
            plus one trailing ``[1]`` entry, zero-padded;
            ``hotword_lengths`` -- int32 length of each ``hotword_pad`` row;
            ``seaco_label_pad`` -- only when ``seaco_id`` is truthy: ``-1`` at
            padding, ``seaco_id`` at valid text positions, and the text tokens
            themselves inside hotword spans.
        """
        outputs = {}
        hotword_indxs = []
        seaco_id = samples[0]["seaco_id"]
        for sample in samples:
            for key, value in sample.items():
                if key == "seaco_id":
                    continue
                if key == "hotword_indx":
                    hotword_indxs.append(value)
                else:
                    outputs.setdefault(key, []).append(value)

        for key, data_list in outputs.items():
            if isinstance(data_list[0], torch.Tensor):
                if data_list[0].dtype == torch.int64:
                    pad_value = self.int_pad_value
                else:
                    pad_value = self.float_pad_value
                outputs[key] = torch.nn.utils.rnn.pad_sequence(
                    data_list, batch_first=True, padding_value=pad_value
                )

        hotword_list, hotword_lengths = [], []
        text = outputs["text"]
        seaco_label_pad = torch.full_like(text, -1) if seaco_id else None
        for b, (hotword_indx, one_text, length) in enumerate(
            zip(hotword_indxs, text, outputs["text_lengths"])
        ):
            length = int(length[0])
            if seaco_label_pad is not None:
                seaco_label_pad[b][:length] = seaco_id

            # Collect the (start, end) pairs sampled for this utterance.
            spans = []
            if hotword_indx[0] != -1:
                spans.append((int(hotword_indx[0]), int(hotword_indx[1])))
                if len(hotword_indx) == 4 and hotword_indx[2] != -1:
                    # the second hotword if exist
                    spans.append((int(hotword_indx[2]), int(hotword_indx[3])))

            for start, end in spans:
                hotword = one_text[start: end + 1]
                hotword_list.append(hotword)
                hotword_lengths.append(end - start + 1)
                if seaco_label_pad is not None:
                    seaco_label_pad[b][start: end + 1] = hotword

        # One extra single-token entry is always appended after the sampled
        # hotwords, so the list is never empty.
        hotword_list.append(torch.tensor([1]))
        hotword_lengths.append(1)
        hotword_pad = torch.nn.utils.rnn.pad_sequence(
            hotword_list, batch_first=True, padding_value=0
        )
        outputs["hotword_pad"] = hotword_pad
        outputs["hotword_lengths"] = torch.tensor(hotword_lengths, dtype=torch.int32)
        if seaco_label_pad is not None:
            outputs["seaco_label_pad"] = seaco_label_pad
        return outputs
| | |
| | | |
| | | hotword_pad = kwargs.get("hotword_pad") |
| | | hotword_lengths = kwargs.get("hotword_lengths") |
| | | dha_pad = kwargs.get("dha_pad") |
| | | # dha_pad = kwargs.get("dha_pad") |
| | | |
| | | # 1. Encoder |
| | | encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) |
| | |
| | | |
| | | hotword_pad = kwargs.get("hotword_pad") |
| | | hotword_lengths = kwargs.get("hotword_lengths") |
| | | dha_pad = kwargs.get("dha_pad") |
| | | seaco_label_pad = kwargs.get("seaco_label_pad") |
| | | |
| | | batch_size = speech.shape[0] |
| | | # for data-parallel |
| | |
| | | ys_lengths, |
| | | hotword_pad, |
| | | hotword_lengths, |
| | | dha_pad, |
| | | seaco_label_pad, |
| | | ) |
| | | if self.train_decoder: |
| | | loss_att, acc_att = self._calc_att_loss( |
| | |
| | | ys_lengths: torch.Tensor, |
| | | hotword_pad: torch.Tensor, |
| | | hotword_lengths: torch.Tensor, |
| | | dha_pad: torch.Tensor, |
| | | seaco_label_pad: torch.Tensor, |
| | | ): |
| | | # predictor forward |
| | | encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to( |
| | |
| | | dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_out, ys_lengths) |
| | | merged = self._merge(cif_attended, dec_attended) |
| | | dha_output = self.hotword_output_layer(merged[:, :-1]) # remove the last token in loss calculation |
| | | loss_att = self.criterion_seaco(dha_output, dha_pad) |
| | | loss_att = self.criterion_seaco(dha_output, seaco_label_pad) |
| | | return loss_att |
| | | |
| | | def _seaco_decode_with_ASF(self, |