| | |
| | | stage=1 |
| | | stop_stage=3 |
| | | |
| | | bert_model_root="../../huggingface_models" |
| | | bert_model_name="bert-base-chinese" |
| | | #bert_model_name="chinese-roberta-wwm-ext" |
| | | #bert_model_name="mengzi-bert-base" |
| | | raw_dataset_path="../DATA" |
| | | model_path=${bert_model_root}/${bert_model_name} |
| | | nj=64 |
| | | model_path=${bert_model_name} |
| | | |
| | | . utils/parse_options.sh || exit 1; |
| | | |
| | | nj=100 |
| | | |
| | | for data_set in train dev_ios test_ios;do |
| | | scp=$raw_dataset_path/dump/fbank/${data_set}/text |
| | | local_scp_dir_raw=$raw_dataset_path/embeds/$bert_model_name/${data_set} |
| | | for data_set in train dev_ios;do |
| | | scp=$raw_dataset_path/data/${data_set}/text |
| | | local_scp_dir_raw=${raw_dataset_path}/data/embeds/${data_set} |
| | | local_scp_dir=$local_scp_dir_raw/split$nj |
| | | local_records_dir=$local_scp_dir_raw/ark |
| | | |
| | |
| | | utils/split_scp.pl $scp ${split_scps} |
| | | |
| | | |
| | | for num in {0..24};do |
| | | for num in {0..7};do |
| | | tmp=`expr $num \* 4` |
| | | |
| | | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then |
| | |
| | | { |
| | | beg=0 |
| | | gpu=`expr $beg + $idx` |
| | | echo $local_scp_dir_raw/log/log.${JOB} |
| | | python tools/extract_embeds.py $local_scp_dir/text.$JOB.txt ${local_records_dir}/embeds.${JOB}.ark ${local_records_dir}/embeds.${JOB}.scp ${local_records_dir}/embeds.${JOB}.shape ${gpu} ${model_path} &> $local_scp_dir_raw/log/log.${JOB} |
| | | echo ${local_scp_dir}/log.${JOB} |
| | | python utils/extract_embeds.py $local_scp_dir/data.$JOB.text ${local_records_dir}/embeds.${JOB}.ark ${local_records_dir}/embeds.${JOB}.scp ${local_records_dir}/embeds.${JOB}.shape ${gpu} ${model_path} &> ${local_scp_dir}/log.${JOB} |
| | | } & |
| | | done |
| | | wait |
| | | fi |
| | | |
| | | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then |
| | | for idx in {1..4}; do |
| | | JOB=`expr $tmp + $idx` |
| | | echo "upload jobid=$JOB" |
| | | { |
| | | hadoop fs -put -f ${local_records_dir}/embeds.${JOB}.ark ${odps_des_feature_dir}/embeds.${JOB}.ark |
| | | } & |
| | | done |
| | | wait |
| | | fi |
| | |
| | | cat ${local_records_dir}/embeds.${JOB}.shape || exit 1; |
| | | done > ${local_scp_dir_raw}/embeds.shape |
| | | fi |
| | | |
| | | cp ${local_scp_dir_raw}/embeds.scp ${raw_dataset_path}/data/${data_set}/embeds.scp |
| | | done |
| | | |
| | | echo "embeds is in: ${local_scp_dir_raw}" |