| | |
| | | |
| | | # general configuration: output locations, language/token type, starting stage |
| | | feats_dir="../DATA" # feature output directory |
| | | exp_dir="." |
| | | exp_dir=`pwd` |
| | | lang=zh |
| | | token_type=char |
| | | stage=0 |
| | |
| | | # feature configuration (nj: number of parallel jobs/workers) |
| | | nj=32 |
| | | |
| | | inference_device="cuda" # or "cpu" |
| | | inference_checkpoint="model.pt" |
| | | inference_device="cuda" # alternatives: "cpu", "cuda:0", "cuda:1" |
| | | inference_checkpoint="model.pt.avg10" |
| | | inference_scp="wav.scp" |
| | | inference_batch_size=32 |
| | | inference_batch_size=1 |
| | | |
| | | # raw data location |
| | | raw_data=../raw_data |
| | |
| | | # experiment tag |
| | | tag="exp1" |
| | | workspace=`pwd` |
| | | |
| | | master_port=12345 |
| | | |
| | | . utils/parse_options.sh || exit 1; |
| | | |
| | |
| | | --config-name "${config}" \ |
| | | ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \ |
| | | ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json" \ |
| | | ++dataset_conf.num_workers=$nj |
| | | |
| | | fi |
| | | |
| | | token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt |
| | |
| | | echo "stage 4: ASR Training" |
| | | |
| | | mkdir -p ${exp_dir}/exp/${model_dir} |
| | | log_file="${exp_dir}/exp/${model_dir}/train.log.txt" |
| | | current_time=$(date "+%Y-%m-%d_%H-%M") |
| | | log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}" |
| | | echo "log_file: ${log_file}" |
| | | |
| | | export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES |
| | | gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
| | | torchrun \ |
| | | --nnodes 1 \ |
| | | --nproc_per_node ${gpu_num} \ |
| | | --master_port ${master_port} \ |
| | | ../../../funasr/bin/train.py \ |
| | | --config-path "${workspace}/conf" \ |
| | | --config-name "${config}" \ |
| | |
| | | if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then |
| | | echo "stage 5: Inference" |
| | | |
| | | if ${inference_device} == "cuda"; then |
| | | if [ ${inference_device} == "cuda" ]; then |
| | | nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
| | | else |
| | | inference_batch_size=1 |
| | |
| | | |
| | | for dset in ${test_sets}; do |
| | | |
| | | inference_dir="${exp_dir}/exp/${model_dir}/${inference_checkpoint}/${dset}" |
| | | inference_dir="${exp_dir}/exp/${model_dir}/inference-${inference_checkpoint}/${dset}" |
| | | _logdir="${inference_dir}/logdir" |
| | | echo "inference_dir: ${inference_dir}" |
| | | |
| | | mkdir -p "${_logdir}" |
| | | data_dir="${feats_dir}/data/${dset}" |
| | |
| | | done |
| | | utils/split_scp.pl "${key_file}" ${split_scps} |
| | | |
| | | gpuid_list_array=(${gpuid_list//,/ }) |
| | | gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) |
| | | for JOB in $(seq ${nj}); do |
| | | { |
| | | id=$((JOB-1)) |
| | |
| | | ++input="${_logdir}/keys.${JOB}.scp" \ |
| | | ++output_dir="${inference_dir}/${JOB}" \ |
| | | ++device="${inference_device}" \ |
| | | ++batch_size="${inference_batch_size}" |
| | | ++ncpu=1 \ |
| | | ++disable_log=true \ |
| | | ++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt |
| | | }& |
| | | |
| | | done |
| | |
| | | done |
| | | |
| | | echo "Computing WER ..." |
| | | cp ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc |
| | | cp ${data_dir}/text ${inference_dir}/1best_recog/text.ref |
| | | python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc |
| | | python utils/postprocess_text_zh.py ${data_dir}/text ${inference_dir}/1best_recog/text.ref |
| | | python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer |
| | | tail -n 3 ${inference_dir}/1best_recog/text.cer |
| | | done |