#!/usr/bin/env bash
#
# Staged ASR recipe: data layout, CMVN statistics and dictionary preparation.
# Every variable assigned before utils/parse_options.sh is sourced is a
# default for the run; parse_options.sh is sourced right after them.
# Stages are selected with the stage / stop_stage variables.

. ./path.sh || exit 1;

# machines configuration
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
gpu_num=8
count=1
gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
njob=5
train_cmd=utils/run.pl
infer_cmd=utils/run.pl

# general configuration
feats_dir="../DATA"  # feature output directory
exp_dir="."
lang=zh
token_type=char
type=sound
scp=wav.scp
speed_perturb="0.9 1.0 1.1"
stage=0
stop_stage=5

# feature configuration
feats_dim=80
nj=64

# data
raw_data=/nfs/zhifu.gzf/wenetspeech_proc

# exp tag
tag="exp1"

. utils/parse_options.sh || exit 1;

# Strict mode, enabled only after the option parser has been sourced:
# -e 'exit on error', -u 'error on undefined variable',
# -o pipefail 'a pipeline fails if any of its stages fails'.
set -e
set -u
set -o pipefail

# NOTE: "set" here is an ordinary variable (the data subset name); it is only
# referenced by the commented-out local/data.sh call in stage 0.
set=L
train_set=train_l
valid_set=dev
test_sets="dev test_net test_meeting"

asr_config=conf/train_asr_conformer.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer_5beam.yaml
inference_asr_model=valid.acc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=${CUDA_VISIBLE_DEVICES}  # set gpus for decoding, the same as training stage by default
# Number of GPUs = number of comma-separated fields in gpuid_list.
ngpu=$(echo "${gpuid_list}" | awk -F "," '{print NF}')

# Compare the flag as a string instead of executing its value as a command.
if [ "${gpu_inference}" = "true" ]; then
    inference_nj=$(( ngpu * njob ))
    _ngpu=1
else
    inference_nj=${njob}
    _ngpu=0
fi

if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
    echo "For downloading data, please refer to https://github.com/wenet-e2e/WenetSpeech."
    exit 0;
fi

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    echo "stage 0: Data preparation"
    # Data preparation
#    local/data.sh --set ${set} --nj $nj --data_dir $feats_dir --WENETSPEECH $raw_data --train_cmd $train_cmd
    # -p keeps a re-run of this stage from aborting under `set -e` when the
    # directory already exists.
    mkdir -p "${feats_dir}/data"
    mv "${feats_dir}/${train_set}" "${feats_dir}/data/${train_set}"
    # test_sets is a space-separated list; word splitting is intentional.
    for x in ${test_sets}; do
        mv "${feats_dir}/${x}" "${feats_dir}/data/"
    done
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "stage 1: Feature and CMVN Generation"
    utils/compute_cmvn.sh --fbankdir "${feats_dir}/data/${train_set}" --cmd "$train_cmd" --nj "${nj}" --feats_dim "${feats_dim}" --config_file "$asr_config" --scale 0.1
fi

token_list=${feats_dir}/data/${lang}_token_list/${token_type}/tokens.txt
echo "dictionary: ${token_list}"
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "stage 2: Dictionary Preparation"
    mkdir -p "${feats_dir}/data/${lang}_token_list/${token_type}/"

    echo "make a dictionary"
    # Special tokens go first, the out-of-vocabulary symbol last.
    # NOTE(review): these literals were empty strings before (markup apparently
    # stripped), which wrote blank lines into the token list; restored to the
    # conventional names. Confirm against the vocabulary the model expects.
    echo "<blank>" > "${token_list}"
    echo "<s>" >> "${token_list}"
    echo "</s>" >> "${token_list}"
    # Split the training transcripts into single tokens, drop the utterance-id
    # column, emit one token per line, de-duplicate and discard blank lines.
    utils/text2token.py -s 1 -n 1 --space "" "${feats_dir}/data/${train_set}/text" | cut -f 2- -d" " | tr " " "\n" \
        | sort | uniq | grep -a -v -e '^\s*$' >> "${token_list}"
    echo "<unk>" >> "${token_list}"
fi