Shi Xian
2024-03-09 8cd896d984d2627efb233795978c9a86ff543133
update hotword finetuning (#1452)

* update hotword finetuning

* update scripts
3 files modified
4 files added
367 ■■■■■ files changed
examples/industrial_data_pretraining/contextual_paraformer/finetune.sh 44 ●●●●●
examples/industrial_data_pretraining/contextual_paraformer/finetune_from_local.sh 71 ●●●●●
examples/industrial_data_pretraining/seaco_paraformer/finetune.sh 44 ●●●●●
examples/industrial_data_pretraining/seaco_paraformer/finetune_from_local.sh 71 ●●●●●
funasr/datasets/audio_datasets/datasets.py 127 ●●●●●
funasr/models/contextual_paraformer/model.py 2 ●●●
funasr/models/seaco_paraformer/model.py 8 ●●●●
examples/industrial_data_pretraining/contextual_paraformer/finetune.sh
New file
@@ -0,0 +1,44 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
#  MIT License  (https://opensource.org/licenses/MIT)
# method1, finetune from model hub
# which GPU(s) to use for training or finetuning
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# data dir, which contains: train.jsonl, val.jsonl
data_dir="/Users/zhifu/funasr1.0/data/list"
## generate jsonl from wav.scp and text.txt
#python -m funasr.datasets.audio_datasets.scp2jsonl \
#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
#++data_type_list='["source", "target"]' \
#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
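# Kaldi-style inputs expected by scp2jsonl (utterance IDs and paths are illustrative):
#   wav.scp : one "<utt_id> <wav_path>" per line
#   text.txt: one "<utt_id> <transcript>" per line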
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# exp output dir
output_dir="/Users/zhifu/exp"
log_file="${output_dir}/log.txt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
torchrun \
--nnodes 1 \
--nproc_per_node ${gpu_num} \
funasr/bin/train.py \
++model="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" \
++model_revision="v2.0.5" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=32 \
++dataset_conf.batch_type="example" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=20 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
examples/industrial_data_pretraining/contextual_paraformer/finetune_from_local.sh
New file
@@ -0,0 +1,71 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
#  MIT License  (https://opensource.org/licenses/MIT)
# method2, finetune from local model
workspace=$(pwd)
echo "current path: ${workspace}" # /xxxx/funasr/examples/industrial_data_pretraining/contextual_paraformer
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404
git clone https://www.modelscope.cn/iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404.git ${local_path}
# which GPU(s) to use for training or finetuning
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# data dir, which contains: train.jsonl, val.jsonl (generated below from wav.scp/text.txt)
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
python -m funasr.datasets.audio_datasets.scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
python -m funasr.datasets.audio_datasets.scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
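# each emitted jsonl line is a JSON object; the dataset reads its "source"
# (wav path) and "target" (transcript) fields, e.g. (illustrative):
#   {"key": "ID0001", "source": "/data/wav/ID0001.wav", "target": "今天天气怎么样"}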
tokens="${local_path}/tokens.json"
cmvn_file="${local_path}/am.mvn"
# output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
config_name="config.yaml"
init_param="${local_path}/model.pt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
torchrun \
--nnodes 1 \
--nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py \
--config-path "${local_path}" \
--config-name "${config_name}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++dataset_conf.batch_size=32 \
++dataset_conf.batch_type="example" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=20 \
++optim_conf.lr=0.0002 \
++train_conf.log_interval=1 \
++init_param="${init_param}" \
++output_dir="${output_dir}" &> ${log_file}
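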
examples/industrial_data_pretraining/seaco_paraformer/finetune.sh
New file
@@ -0,0 +1,44 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
#  MIT License  (https://opensource.org/licenses/MIT)
# method1, finetune from model hub
# which GPU(s) to use for training or finetuning
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# data dir, which contains: train.jsonl, val.jsonl
data_dir="/Users/zhifu/funasr1.0/data/list"
## generate jsonl from wav.scp and text.txt
#python -m funasr.datasets.audio_datasets.scp2jsonl \
#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
#++data_type_list='["source", "target"]' \
#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# exp output dir
output_dir="/Users/zhifu/exp"
log_file="${output_dir}/log.txt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
torchrun \
--nnodes 1 \
--nproc_per_node ${gpu_num} \
funasr/bin/train.py \
++model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
++model_revision="v2.0.6" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++dataset_conf.batch_size=32 \
++dataset_conf.batch_type="example" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=20 \
++optim_conf.lr=0.0002 \
++output_dir="${output_dir}" &> ${log_file}
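After finetuning, the checkpoint can be tried out for hotword-biased inference. A minimal sketch, assuming the standard funasr AutoModel API; the wav path and hotword strings are illustrative:

from funasr import AutoModel

# load the finetuned model from the output_dir used above
model = AutoModel(model="/Users/zhifu/exp")
# hotwords are passed as a space-separated string (values are illustrative)
res = model.generate(input="example.wav", hotword="魔搭 达摩院")
print(res)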
examples/industrial_data_pretraining/seaco_paraformer/finetune_from_local.sh
New file
@@ -0,0 +1,71 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
#  MIT License  (https://opensource.org/licenses/MIT)
# method2, finetune from local model
workspace=$(pwd)
echo "current path: ${workspace}" # /xxxx/funasr/examples/industrial_data_pretraining/seaco_paraformer
# download model
local_path_root=${workspace}/modelscope_models
mkdir -p ${local_path_root}
local_path=${local_path_root}/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
git clone https://www.modelscope.cn/iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path}
# which GPU(s) to use for training or finetuning
export CUDA_VISIBLE_DEVICES="0,1"
gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
# data dir, which contains: train.jsonl, val.jsonl (generated below from wav.scp/text.txt)
data_dir="../../../data/list"
train_data="${data_dir}/train.jsonl"
val_data="${data_dir}/val.jsonl"
# generate train.jsonl and val.jsonl from wav.scp and text.txt
python -m funasr.datasets.audio_datasets.scp2jsonl \
++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${train_data}"
python -m funasr.datasets.audio_datasets.scp2jsonl \
++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_out="${val_data}"
tokens="${local_path}/tokens.json"
cmvn_file="${local_path}/am.mvn"
# output dir
output_dir="./outputs"
log_file="${output_dir}/log.txt"
config_name="config.yaml"
init_param="${local_path}/model.pt"
mkdir -p ${output_dir}
echo "log_file: ${log_file}"
torchrun \
--nnodes 1 \
--nproc_per_node ${gpu_num} \
../../../funasr/bin/train.py \
--config-path "${local_path}" \
--config-name "${config_name}" \
++train_data_set_list="${train_data}" \
++valid_data_set_list="${val_data}" \
++tokenizer_conf.token_list="${tokens}" \
++frontend_conf.cmvn_file="${cmvn_file}" \
++dataset_conf.batch_size=32 \
++dataset_conf.batch_type="example" \
++dataset_conf.num_workers=4 \
++train_conf.max_epoch=20 \
++optim_conf.lr=0.0002 \
++train_conf.log_interval=1 \
++init_param="${init_param}" \
++output_dir="${output_dir}" &> ${log_file}
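# stdout/stderr are redirected to ${log_file}; monitor progress with:
#   tail -f ./outputs/log.txt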
funasr/datasets/audio_datasets/datasets.py
@@ -1,4 +1,5 @@
import torch
import random
from funasr.register import tables
from funasr.utils.load_utils import extract_fbank, load_audio_text_image_video
@@ -98,3 +99,129 @@
                outputs[key] = torch.nn.utils.rnn.pad_sequence(data_list, batch_first=True, padding_value=pad_value)
        return outputs
@tables.register("dataset_classes", "AudioDatasetHotword")
class AudioDatasetHotword(AudioDataset):
    # for finetuning contextual_paraformer and seaco_paraformer
    def __init__(
        self,
        *args,
        seaco_id: int = 0,  # label id for non-hotword positions; falsy (0) disables seaco labels
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.seaco_id = seaco_id
    def __getitem__(self, index):
        item = self.index_ds[index]
        source = item["source"]
        data_src = load_audio_text_image_video(source, fs=self.fs)
        if self.preprocessor_speech:
            data_src = self.preprocessor_speech(data_src, fs=self.fs)
        speech, speech_lengths = extract_fbank(data_src, data_type=self.data_type, frontend=self.frontend, is_final=True) # speech: [b, T, d]
        target = item["target"]
        if self.preprocessor_text:
            target = self.preprocessor_text(target)
        if self.tokenizer:
            ids = self.tokenizer.encode(target)
            text = torch.tensor(ids, dtype=torch.int64)
        else:
            ids = target
            text = ids
        ids_lengths = len(ids)
        text_lengths = torch.tensor([ids_lengths], dtype=torch.int32)
        def generate_index(length,
                           hotword_min_length=2,
                           hotword_max_length=8,
                           sample_rate=0.75,
                           double_rate=0.1,
                           pre_prob=0.0,
                           pre_index=None,
                           pre_hwlist=None):
            # Randomly sample hotword span boundaries from a transcript of
            # `length` tokens: with prob. sample_rate pick one span (or, with
            # prob. double_rate, two non-overlapping spans when the text is
            # long enough); otherwise return [-1] for "no hotword".
            if length < hotword_min_length:
                return [-1]
            if random.random() < sample_rate:
                if pre_prob > 0 and random.random() < pre_prob and pre_index is not None:
                    return pre_index
                if length == hotword_min_length:
                    return [0, length-1]
                elif random.random() < double_rate and length > hotword_max_length + hotword_min_length + 2:
                    # sample two hotwords in a sentence
                    _max_hw_length = min(hotword_max_length, length // 2)
                    # first hotword
                    start1 = random.randint(0, length // 3)
                    end1 = random.randint(start1 + hotword_min_length - 1, start1 + _max_hw_length - 1)
                    # second hotword
                    start2 = random.randint(end1 + 1, length - hotword_min_length)
                    end2 = random.randint(min(length-1, start2+hotword_min_length-1), min(length-1, start2+hotword_max_length-1))
                    return [start1, end1, start2, end2]
                else:  # single hotword
                    start = random.randint(0, length - hotword_min_length)
                    end = random.randint(min(length-1, start+hotword_min_length-1), min(length-1, start+hotword_max_length-1))
                    return [start, end]
            else:
                return [-1]
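        # illustrative return values for a 20-token transcript:
        #   [-1]           -> no hotword sampled
        #   [3, 7]         -> one hotword spanning tokens 3..7
        #   [0, 4, 9, 14]  -> two hotwords (possible only when length > 12)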
        hotword_indx = generate_index(text_lengths[0])
        return {"speech": speech[0, :, :],
                "speech_lengths": speech_lengths,
                "text": text,
                "text_lengths": text_lengths,
                "hotword_indx": hotword_indx,
                "seaco_id": self.seaco_id,
                }
    def collator(self, samples: list=None):
        outputs = {}
        hotword_indxs = []
        seaco_id = samples[0]['seaco_id']
        for sample in samples:
            for key in sample.keys():
                if key == 'seaco_id':
                    continue
                elif key == 'hotword_indx':
                    hotword_indxs.append(sample[key])
                else:
                    if key not in outputs:
                        outputs[key] = []
                    outputs[key].append(sample[key])
        for key, data_list in outputs.items():
            if isinstance(data_list[0], torch.Tensor):
                if data_list[0].dtype == torch.int64:
                    pad_value = self.int_pad_value
                else:
                    pad_value = self.float_pad_value
                outputs[key] = torch.nn.utils.rnn.pad_sequence(data_list, batch_first=True, padding_value=pad_value)
        hotword_list, hotword_lengths = [], []
        text = outputs['text']
        # seaco labels: -1 on padding (ignored by the loss), seaco_id on ordinary
        # positions, and the original token ids inside sampled hotword spans
        seaco_label_pad = torch.ones_like(text) * -1 if seaco_id else None
        for b, (hotword_indx, one_text, length) in enumerate(zip(hotword_indxs, text, outputs['text_lengths'])):
            length = length[0]
            if seaco_label_pad is not None:
                seaco_label_pad[b][:length] = seaco_id
            if hotword_indx[0] != -1:
                start, end = int(hotword_indx[0]), int(hotword_indx[1])
                hotword = one_text[start: end+1]
                hotword_list.append(hotword)
                hotword_lengths.append(end-start+1)
                if seaco_label_pad is not None:
                    seaco_label_pad[b][start: end+1] = one_text[start: end+1]
                if len(hotword_indx) == 4 and hotword_indx[2] != -1:
                    # the second hotword, if present
                    start, end = int(hotword_indx[2]), int(hotword_indx[3])
                    hotword_list.append(one_text[start: end+1])
                    hotword_lengths.append(end-start+1)
                    if seaco_label_pad is not None:
                        seaco_label_pad[b][start: end+1] = one_text[start: end+1]
        # always append a dummy hotword so every batch has at least one entry
        hotword_list.append(torch.tensor([1]))
        hotword_lengths.append(1)
        hotword_pad = torch.nn.utils.rnn.pad_sequence(hotword_list,
                                                      batch_first=True,
                                                      padding_value=0)
        outputs["hotword_pad"] = hotword_pad
        outputs["hotword_lengths"] = torch.tensor(hotword_lengths, dtype=torch.int32)
        if seaco_label_pad is not None:
            outputs['seaco_label_pad'] = seaco_label_pad
        return outputs
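To make the collator's hotword outputs concrete, here is a toy, self-contained sketch (not repo code) that mimics the padding logic above for a two-utterance batch:

import torch
from torch.nn.utils.rnn import pad_sequence

# two transcripts; the first has a sampled hotword span (1, 3), the second none
text = [torch.tensor([11, 22, 33, 44, 55]), torch.tensor([66, 77])]
hotword_indxs = [[1, 3], [-1]]

hotword_list, hotword_lengths = [], []
for ids, indx in zip(text, hotword_indxs):
    if indx[0] != -1:
        start, end = indx
        hotword_list.append(ids[start:end + 1])
        hotword_lengths.append(end - start + 1)
hotword_list.append(torch.tensor([1]))  # dummy entry, as in the collator above
hotword_lengths.append(1)

hotword_pad = pad_sequence(hotword_list, batch_first=True, padding_value=0)
print(hotword_pad)      # tensor([[22, 33, 44], [ 1,  0,  0]])
print(hotword_lengths)  # [3, 1]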
funasr/models/contextual_paraformer/model.py
@@ -107,7 +107,7 @@
        hotword_pad = kwargs.get("hotword_pad")
        hotword_lengths = kwargs.get("hotword_lengths")
-        dha_pad = kwargs.get("dha_pad")
+        # dha_pad = kwargs.get("dha_pad")
        # 1. Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
funasr/models/seaco_paraformer/model.py
@@ -128,7 +128,7 @@
    
        hotword_pad = kwargs.get("hotword_pad")
        hotword_lengths = kwargs.get("hotword_lengths")
-        dha_pad = kwargs.get("dha_pad")
+        seaco_label_pad = kwargs.get("seaco_label_pad")
        
        batch_size = speech.shape[0]
        # for data-parallel
@@ -148,7 +148,7 @@
                                        ys_lengths, 
                                        hotword_pad, 
                                        hotword_lengths, 
-                                        dha_pad,
+                                        seaco_label_pad,
                                        )
        if self.train_decoder:
            loss_att, acc_att = self._calc_att_loss(
@@ -185,7 +185,7 @@
            ys_lengths: torch.Tensor,
            hotword_pad: torch.Tensor,
            hotword_lengths: torch.Tensor,
-            dha_pad: torch.Tensor,
+            seaco_label_pad: torch.Tensor,
    ):  
        # predictor forward
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
@@ -204,7 +204,7 @@
        dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_out, ys_lengths)
        merged = self._merge(cif_attended, dec_attended)
        dha_output = self.hotword_output_layer(merged[:, :-1])  # remove the last token in loss calculation
-        loss_att = self.criterion_seaco(dha_output, dha_pad)
+        loss_att = self.criterion_seaco(dha_output, seaco_label_pad)
        return loss_att
    def _seaco_decode_with_ASF(self,