From 6f7e27eb7c2d0a7649ec8f14d167c8da8e29f906 Mon Sep 17 00:00:00 2001
From: jmwang66 <wangjiaming.wjm@alibaba-inc.com>
Date: Tue, 16 May 2023 15:07:20 +0800
Subject: [PATCH] Merge pull request #518 from alibaba-damo-academy/dev_wjm2

---
 egs/aishell2/paraformerbert/local/extract_embeds.sh |   31 ++++++++-----------------------
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/egs/aishell2/paraformerbert/local/extract_embeds.sh b/egs/aishell2/paraformerbert/local/extract_embeds.sh
index 5f45ff3..ee89924 100755
--- a/egs/aishell2/paraformerbert/local/extract_embeds.sh
+++ b/egs/aishell2/paraformerbert/local/extract_embeds.sh
@@ -3,20 +3,16 @@
 stage=1
 stop_stage=3
 
-bert_model_root="../../huggingface_models"
 bert_model_name="bert-base-chinese"
-#bert_model_name="chinese-roberta-wwm-ext"
-#bert_model_name="mengzi-bert-base"
 raw_dataset_path="../DATA"
-model_path=${bert_model_root}/${bert_model_name}
+nj=64
+model_path=${bert_model_name}
 
 . utils/parse_options.sh || exit 1;
 
-nj=100
-
-for data_set in train dev_ios test_ios;do
-    scp=$raw_dataset_path/dump/fbank/${data_set}/text
-    local_scp_dir_raw=$raw_dataset_path/embeds/$bert_model_name/${data_set}
+for data_set in train dev_ios;do
+    scp=$raw_dataset_path/data/${data_set}/text
+    local_scp_dir_raw=${raw_dataset_path}/data/embeds/${data_set}
     local_scp_dir=$local_scp_dir_raw/split$nj
     local_records_dir=$local_scp_dir_raw/ark
 
@@ -31,7 +27,7 @@
     utils/split_scp.pl $scp ${split_scps}
 
 
-    for num in {0..24};do
+    for num in {0..7};do
         tmp=`expr $num \* 4`
 
         if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@@ -41,20 +37,9 @@
                 {
                     beg=0
                     gpu=`expr $beg + $idx`
-                    echo $local_scp_dir_raw/log/log.${JOB}
-                    python tools/extract_embeds.py $local_scp_dir/text.$JOB.txt ${local_records_dir}/embeds.${JOB}.ark ${local_records_dir}/embeds.${JOB}.scp ${local_records_dir}/embeds.${JOB}.shape ${gpu} ${model_path} &> $local_scp_dir_raw/log/log.${JOB}
+                    echo ${local_scp_dir}/log.${JOB}
+                    python utils/extract_embeds.py $local_scp_dir/data.$JOB.text ${local_records_dir}/embeds.${JOB}.ark ${local_records_dir}/embeds.${JOB}.scp ${local_records_dir}/embeds.${JOB}.shape ${gpu} ${model_path} &> ${local_scp_dir}/log.${JOB}
             } &
-            done
-            wait
-        fi
-
-        if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-            for idx in {1..4}; do
-                JOB=`expr $tmp + $idx`
-                echo "upload jobid=$JOB"
-                {
-                    hadoop  fs -put -f ${local_records_dir}/embeds.${JOB}.ark ${odps_des_feature_dir}/embeds.${JOB}.ark
-                } &
             done
             wait
         fi

--
Gitblit v1.9.1