Changed files:

- egs/aishell2/paraformer/run.sh
- egs/aishell2/paraformerbert/conf/train_asr_paraformerbert_conformer_20e_6d_1280_320.yaml
- egs/aishell2/paraformerbert/local/extract_embeds.sh
- egs/aishell2/paraformerbert/run.sh
- egs/aishell2/transformer/run.sh
- funasr/utils/prepare_data.py
egs/aishell2/paraformer/run.sh

```diff
@@ -103,8 +103,6 @@
     utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
         | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
     echo "<unk>" >> ${token_list}
-    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/${train_set}
-    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}
 fi

 # Training Stage
```
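This hunk stops pre-creating the asr_stats_fbank_zh_char directories (the stats stage presumably creates its own output directories) and leaves the character token-list construction unchanged. For readers less fluent in the shell pipeline, here is a minimal Python sketch of what it produces; the Kaldi-style "utt-id transcript" line format and the helper name are assumptions, and the special tokens written to ${token_list} before this append are omitted.

```python
# Minimal sketch of the token-list pipeline above: split training
# transcripts into single characters, dedupe, sort, and append "<unk>".
# Illustration only, not the project's text2token.py.
def build_char_token_list(text_path, token_list_path):
    chars = set()
    with open(text_path, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                chars.update(ch for ch in parts[1] if not ch.isspace())
    # Open in append mode to mirror the ">>" redirections in run.sh.
    with open(token_list_path, "a", encoding="utf-8") as f:
        for ch in sorted(chars):
            f.write(ch + "\n")
        f.write("<unk>\n")
```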
egs/aishell2/paraformerbert/conf/train_asr_paraformerbert_conformer_20e_6d_1280_320.yaml

```diff
@@ -47,7 +47,7 @@
     lsm_weight: 0.1     # label smoothing option
     length_normalized_loss: false
     predictor_weight: 1.0
-    glat_context_p: 0.4
+    sampling_ratio: 0.4
     embeds_id: 3
     embed_dims: 768
     embeds_loss_weight: 2.0
@@ -89,7 +89,7 @@
     - 40
     num_time_mask: 2

-predictor: cif_predictor_sanm
+predictor: cif_predictor
 predictor_conf:
     idim: 320
     threshold: 1.0
```
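This config swaps the glat_context_p knob for sampling_ratio and the SANM-based CIF predictor for the plain cif_predictor. The threshold: 1.0 entry is the firing threshold of the continuous integrate-and-fire (CIF) mechanism Paraformer uses to locate token boundaries. Below is a minimal sketch of that rule, assuming per-frame weights alphas have already been predicted; it is illustrative only, not FunASR's cif_predictor implementation.

```python
# Illustrative continuous integrate-and-fire (CIF): accumulate per-frame
# weights and emit a token boundary each time the running sum crosses the
# threshold (the "threshold: 1.0" in predictor_conf above). Not FunASR's code.
def cif_fire_points(alphas, threshold=1.0):
    acc = 0.0
    fire_frames = []
    for t, a in enumerate(alphas):
        acc += a
        if acc >= threshold:
            fire_frames.append(t)   # frame t closes one output token
            acc -= threshold        # carry the remainder forward
    return fire_frames

# Three threshold crossings -> three predicted tokens.
print(cif_fire_points([0.3, 0.4, 0.5, 0.6, 0.2, 0.5, 0.7]))  # [2, 4, 6]
```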
egs/aishell2/paraformerbert/local/extract_embeds.sh

```diff
@@ -11,7 +11,7 @@
 nj=32

-for data_set in train dev_ios test;do
+for data_set in train dev_ios;do
     scp=$raw_dataset_path/data/${data_set}/text
     local_scp_dir_raw=${raw_dataset_path}/data/embeds/${data_set}
     local_scp_dir=$local_scp_dir_raw/split$nj
```
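The loop now extracts embeddings only for the train and dev_ios splits; the test split is dropped. The split$nj directory name suggests the text scp is sharded into nj pieces so extraction can run as parallel jobs. A hedged sketch of that sharding step, with the file layout inferred from the variable names above:

```python
import os

# Hedged sketch of the split-for-parallelism step implied by split$nj:
# shard the text scp into nj files so nj extraction jobs can run at once.
# The output naming (text.1 .. text.nj) is an assumption.
def split_scp(scp_path, out_dir, nj=32):
    with open(scp_path, encoding="utf-8") as f:
        lines = f.readlines()
    os.makedirs(out_dir, exist_ok=True)
    for i in range(nj):
        with open(os.path.join(out_dir, "text.{}".format(i + 1)), "w",
                  encoding="utf-8") as out:
            out.writelines(lines[i::nj])  # round-robin keeps shards balanced
```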
egs/aishell2/paraformerbert/run.sh

The same mkdir -p removal as in egs/aishell2/paraformer/run.sh:

```diff
@@ -106,8 +106,6 @@
     utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
         | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
     echo "<unk>" >> ${token_list}
-    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/${train_set}
-    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}
 fi

 # Training Stage
```
egs/aishell2/transformer/run.sh

The same removal again, this time in the transformer recipe:

```diff
@@ -103,8 +103,6 @@
     utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \
         | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
     echo "<unk>" >> ${token_list}
-    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/${train_set}
-    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}
 fi

 # Training Stage
```
funasr/utils/prepare_data.py

```diff
@@ -189,5 +189,20 @@
     else:
         args.train_data_file = os.path.join(args.data_dir, args.train_set, "data.list")
         args.valid_data_file = os.path.join(args.data_dir, args.valid_set, "data.list")
+    if args.embed_path is not None:
+        for d in [args.train_set, args.valid_set]:
+            file = os.path.join(args.data_dir, d, "data.list")
+            with open(file) as f:
+                lines = f.readlines()
+            out_file = os.path.join(args.data_dir, d, "data_with_embed.list")
+            with open(out_file, "w") as out_f:
+                for line in lines:
+                    parts = line.strip().split()
+                    idx = parts[0].split("/")[-2]
+                    embed_file = os.path.join(args.embed_path, "embeds", args.valid_set, "ark", "embeds.{}.ark".format(idx))
+                    out_f.write(parts[0] + " " + parts[1] + " " + embed_file + "\n")
+        args.train_data_file = os.path.join(args.data_dir, args.train_set, "data_with_embed.list")
+        args.valid_data_file = os.path.join(args.data_dir, args.valid_set, "data_with_embed.list")
+
     if distributed:
         dist.barrier()
```
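When embed_path is set, prepare_data.py now rewrites each split's data.list into data_with_embed.list, appending a third column that points at the embedding archive whose shard index is taken from the entry's parent directory name. Note that embed_file is built from args.valid_set even while the loop iterates over both the train and valid splits, which looks unintended. The standalone sketch below parameterizes the path by the split name instead; that is an assumption about the intent, not part of the patch, as is the "path text" layout of data.list lines.

```python
import os

# Standalone sketch of the data_with_embed.list rewrite above, with the
# embedding path parameterized by the split being processed (the patch
# itself hard-codes args.valid_set there). Assumes data.list lines are
# "path text" pairs whose parent directory name is the shard index.
def add_embed_column(data_dir, split, embed_path):
    in_file = os.path.join(data_dir, split, "data.list")
    out_file = os.path.join(data_dir, split, "data_with_embed.list")
    with open(in_file) as f, open(out_file, "w") as out_f:
        for line in f:
            first, second = line.strip().split()[:2]
            idx = first.split("/")[-2]  # shard index from the parent dir
            embed_file = os.path.join(embed_path, "embeds", split, "ark",
                                      "embeds.{}.ark".format(idx))
            out_f.write("{} {} {}\n".format(first, second, embed_file))
    return out_file
```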