zhifu gao
2023-05-18 97a689d65da434345a641a909f13b78e5690c86b
egs/aishell2/paraformerbert/local/extract_embeds.sh
@@ -3,20 +3,16 @@
stage=1
stop_stage=3
bert_model_root="../../huggingface_models"
bert_model_name="bert-base-chinese"
#bert_model_name="chinese-roberta-wwm-ext"
#bert_model_name="mengzi-bert-base"
raw_dataset_path="../DATA"
model_path=${bert_model_root}/${bert_model_name}
nj=64
model_path=${bert_model_name}
. utils/parse_options.sh || exit 1;
nj=100
for data_set in train dev_ios test_ios;do
    scp=$raw_dataset_path/dump/fbank/${data_set}/text
    local_scp_dir_raw=$raw_dataset_path/embeds/$bert_model_name/${data_set}
for data_set in train dev_ios;do
    scp=$raw_dataset_path/data/${data_set}/text
    local_scp_dir_raw=${raw_dataset_path}/data/embeds/${data_set}
    local_scp_dir=$local_scp_dir_raw/split$nj
    local_records_dir=$local_scp_dir_raw/ark
@@ -31,7 +27,7 @@
    utils/split_scp.pl $scp ${split_scps}
    for num in {0..24};do
    for num in {0..7};do
        tmp=`expr $num \* 4`
        if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@@ -41,20 +37,9 @@
                {
                    beg=0
                    gpu=`expr $beg + $idx`
                    echo $local_scp_dir_raw/log/log.${JOB}
                    python tools/extract_embeds.py $local_scp_dir/text.$JOB.txt ${local_records_dir}/embeds.${JOB}.ark ${local_records_dir}/embeds.${JOB}.scp ${local_records_dir}/embeds.${JOB}.shape ${gpu} ${model_path} &> $local_scp_dir_raw/log/log.${JOB}
                    echo ${local_scp_dir}/log.${JOB}
                    python utils/extract_embeds.py $local_scp_dir/data.$JOB.text ${local_records_dir}/embeds.${JOB}.ark ${local_records_dir}/embeds.${JOB}.scp ${local_records_dir}/embeds.${JOB}.shape ${gpu} ${model_path} &> ${local_scp_dir}/log.${JOB}
            } &
            done
            wait
        fi
        if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
            for idx in {1..4}; do
                JOB=`expr $tmp + $idx`
                echo "upload jobid=$JOB"
                {
                    hadoop  fs -put -f ${local_records_dir}/embeds.${JOB}.ark ${odps_des_feature_dir}/embeds.${JOB}.ark
                } &
            done
            wait
        fi
@@ -69,6 +54,8 @@
            cat ${local_records_dir}/embeds.${JOB}.shape || exit 1;
        done > ${local_scp_dir_raw}/embeds.shape
    fi
    cp ${local_scp_dir_raw}/embeds.scp  ${raw_dataset_path}/data/${data_set}/embeds.scp
done
echo "embeds is in: ${local_scp_dir_raw}"