From f9fed09e96f43e7eab88378fc444c4987933badb Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: 星期五, 09 十二月 2022 23:57:51 +0800
Subject: [PATCH] Merge pull request #10 from alibaba-damo-academy/dev

---
 egs/aishell/conformer/run.sh |   92 +++++++++++++++++++++++++++++++++-------------
 1 files changed, 66 insertions(+), 26 deletions(-)

diff --git a/egs/aishell/conformer/run.sh b/egs/aishell/conformer/run.sh
index 16ebc67..d865982 100755
--- a/egs/aishell/conformer/run.sh
+++ b/egs/aishell/conformer/run.sh
@@ -10,9 +10,10 @@
 # for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
 njob=8
 train_cmd=utils/run.pl
+infer_cmd=utils/run.pl
 
 # general configuration
-feats_dir=".." #feature output dictionary, for large data
+feats_dir="../DATA" #feature output dictionary, for large data
 exp_dir="."
 lang=zh
 dumpdir=dump/fbank
@@ -59,8 +60,10 @@
 
 if ${gpu_inference}; then
     inference_nj=$[${ngpu}*${njob}]
+    _ngpu=1
 else
     inference_nj=$njob
+    _ngpu=0
 fi
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
@@ -83,18 +86,18 @@
     echo "stage 1: Feature Generation"
     # compute fbank features
     fbankdir=${feats_dir}/fbank
-    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --speed_perturb ${speed_perturb} \
+    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} --speed_perturb ${speed_perturb} \
         ${feats_dir}/data/train ${exp_dir}/exp/make_fbank/train ${fbankdir}/train
     utils/fix_data_feat.sh ${fbankdir}/train
-    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj \
+    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \
         ${feats_dir}/data/dev ${exp_dir}/exp/make_fbank/dev ${fbankdir}/dev
     utils/fix_data_feat.sh ${fbankdir}/dev
-    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj \
+    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \
         ${feats_dir}/data/test ${exp_dir}/exp/make_fbank/test ${fbankdir}/test
     utils/fix_data_feat.sh ${fbankdir}/test
      
     # compute global cmvn
-    utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj \
+    utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} \
         ${fbankdir}/train ${exp_dir}/exp/make_fbank/train
 
     # apply cmvn 
@@ -112,6 +115,10 @@
     utils/fix_data_feat.sh ${feat_train_dir}
     utils/fix_data_feat.sh ${feat_dev_dir}
     utils/fix_data_feat.sh ${feat_test_dir}
+
+    #generate ark list 
+    utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_train_dir} ${fbankdir}/train ${feat_train_dir}
+    utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_dev_dir} ${fbankdir}/dev ${feat_dev_dir}
 fi
 
 token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
@@ -140,9 +147,10 @@
 # Training Stage
 world_size=$gpu_num  # run on one machine
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: Training"
     mkdir -p ${exp_dir}/exp/${model_dir}
     mkdir -p ${exp_dir}/exp/${model_dir}/log
-    INIT_FILE=$exp_dir/ddp_init
+    INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
     if [ -f $INIT_FILE ];then
         rm -f $INIT_FILE
     fi 
@@ -184,25 +192,57 @@
 
 # Testing Stage
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    utils/easy_asr_infer.sh \
-        --lang zh \
-        --datadir ${feats_dir} \
-        --feats_type ${feats_type} \
-        --feats_dim ${feats_dim} \
-        --token_type ${token_type} \
-        --gpu_inference ${gpu_inference} \
-        --inference_config "${inference_config}" \
-        --test_sets "${test_sets}" \
-        --token_list $token_list \
-        --asr_exp ${exp_dir}/${model_dir} \
-        --stage 12 \
-        --stop_stage 12 \
-        --scp $scp \
-        --text text \
-        --inference_nj $inference_nj \
-        --njob $njob \
-        --inference_asr_model $inference_asr_model \
-        --gpuid_list $gpuid_list \
-        --mode asr
+    echo "stage 4: Inference"
+    for dset in ${test_sets}; do
+        asr_exp=${exp_dir}/exp/${model_dir}
+        inference_tag="$(basename "${inference_config}" .yaml)"
+        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
+        _logdir="${_dir}/logdir"
+        if [ -d ${_dir} ]; then
+            echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
+            exit 0
+        fi
+        mkdir -p "${_logdir}"
+        _data="${feats_dir}/${dumpdir}/${dset}"
+        key_file=${_data}/${scp}
+        num_scp_file="$(<${key_file} wc -l)"
+        _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
+        split_scps=
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/keys.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+        _opts=
+        if [ -n "${inference_config}" ]; then
+            _opts+="--config ${inference_config} "
+        fi
+        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
+            python -m funasr.bin.asr_inference_launch \
+                --batch_size 1 \
+                --ngpu "${_ngpu}" \
+                --njob ${njob} \
+                --gpuid_list ${gpuid_list} \
+                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
+                --key_file "${_logdir}"/keys.JOB.scp \
+                --asr_train_config "${asr_exp}"/config.yaml \
+                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
+                --output_dir "${_logdir}"/output.JOB \
+                --mode asr \
+                ${_opts}
+
+        for f in token token_int score text; do
+            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/1best_recog/${f}"
+                done | sort -k1 >"${_dir}/${f}"
+            fi
+        done
+        python utils/proce_text.py ${_dir}/text ${_dir}/text.proc
+        python utils/proce_text.py ${_data}/text ${_data}/text.proc
+        python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer
+        tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
+        cat ${_dir}/text.cer.txt
+    done
 fi
 

--
Gitblit v1.9.1