From 3a4281f4959534b1bf5d01acf0085f4f8e6f2ec8 Mon Sep 17 00:00:00 2001
From: wuhongsheng <664116298@qq.com>
Date: Fri, 05 Jul 2024 00:55:32 +0800
Subject: [PATCH] 优化speakid和语句匹配逻辑,部分解决speakid不从0递增问题 (#1870)

---
 examples/wenetspeech/conformer/run.sh |    7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/wenetspeech/conformer/run.sh b/examples/wenetspeech/conformer/run.sh
index 202ca66..6ae995a 100755
--- a/examples/wenetspeech/conformer/run.sh
+++ b/examples/wenetspeech/conformer/run.sh
@@ -42,8 +42,8 @@
 valid_set=dev
 test_sets="dev test_net test_meeting"
 
-asr_config=conf/conformer_12e_6d_2048_512.yaml
-model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"
+config=conformer_12e_6d_2048_512.yaml
+model_dir="baseline_$(basename "${config}" .yaml)_${lang}_${token_type}_${tag}"
 
 
 
@@ -76,6 +76,7 @@
     python ../../../funasr/bin/compute_audio_cmvn.py \
     --config-path "${workspace}/conf" \
     --config-name "${config}" \
+    ++scale=0.1 \
     ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
     ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json" \
 
@@ -91,7 +92,7 @@
     echo "<blank>" > ${token_list}
     echo "<s>" >> ${token_list}
     echo "</s>" >> ${token_list}
-    utils/text2token.py -s 1 -n 1 --space "" --text_format "jsonl" ${feats_dir}/data/$train_set/audio_datasets.jsonl | cut -f 2- -d" " | tr " " "\n" \
+    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
         | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
     echo "<unk>" >> ${token_list}
 fi

--
Gitblit v1.9.1