#!/usr/bin/env bash

# Set bash to 'strict' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'.
set -e
set -u
set -o pipefail

# Print a timestamped log line prefixed with the calling file, line and
# function, e.g. "2024-01-01T00:00:00 (asr.sh:123:main) message".
log() {
    local fname=${BASH_SOURCE[1]##*/}
    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# Echo the minimum of the integer arguments.
# With no arguments, return non-zero without output instead of tripping
# 'set -u' on the unset $1.
min() {
    local a b
    [ $# -ge 1 ] || return 1
    a=$1
    for b in "$@"; do
        if [ "${b}" -le "${a}" ]; then
            a="${b}"
        fi
    done
    echo "${a}"
}

SECONDS=0

# General configuration
stage=1             # Processes starts from the specified stage.
stop_stage=10000    # Processes is stopped at the specified stage.
skip_data_prep=true # Skip data preparation stages.
skip_train=false    # Skip training stages.
skip_eval=false     # Skip decoding and evaluation stages.
skip_upload=true    # Skip packing and uploading stages.
skip_upload_hf=true # Skip uploading to hugging face stages.
cuda_cmd=utils/run.pl   # Job submission wrapper for GPU jobs.
decode_cmd=utils/run.pl # Job submission wrapper for decoding jobs.
ngpu=1      # The number of gpus ("0" uses cpu, otherwise use gpu).
njob=1      # The number of jobs for each gpu.
gpuid_list= # Comma-separated GPU ids to use (empty: unspecified).
num_nodes=1 # The number of nodes.
nj=32       # The number of parallel jobs.
inference_nj=32    # The number of parallel jobs in decoding.
gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding.
datadir="./"  # Root directory of the prepared data.
dumpdir=dump  # Directory to dump features (prefixed with ${datadir} after option parsing).
expdir=exp    # Directory to save experiments.
python=python # Specify python to execute funasr commands.

# Data preparation related
local_data_opts= # The options given to local/data.sh.

# Speed perturbation related
speed_perturb_factors= # perturbation factors, e.g. "0.9 1.0 1.1" (separated by space).

# Feature extraction related
feats_type=fbank  # Feature type (raw or fbank_pitch).
feats_dim=        # Feature dimension (empty: not specified).
audio_format=flac # Audio format: wav, flac, wav.ark, flac.ark (only in feats_type=raw).
fs=16k            # Sampling rate.
min_wav_duration=0.1 # Minimum duration in second.
max_wav_duration=20  # Maximum duration in second.

# Tokenization related
token_type=bpe # Tokenization type (char or bpe).
nbpe=30        # The number of BPE vocabulary.
# Tokenization related (continued)
bpemode=unigram # Mode of BPE (unigram or bpe).
oov=""          # Out-of-vocabulary symbol.
blank=""        # CTC blank symbol.
sos_eos=""      # sos and eos symbol.
bpe_input_sentence_size=100000000 # Size of input sentence for BPE.
bpe_nlsyms=        # Non-linguistic symbols list, separated by a comma, for BPE.
bpe_char_cover=1.0 # Character coverage when modeling BPE.

# Ngram model related
use_ngram=false # Whether to use an ngram LM in decoding.
ngram_exp=      # Directory of the ngram experiment.
ngram_num=3     # Order of the ngram model.

# Language model related
use_lm=false  # Use language model for ASR decoding.
lm_tag=       # Suffix to the result dir for language model training.
lm_exp=       # Directory path for the LM experiment; when set, lm_tag is ignored.
lm_stats_dir= # Directory path for LM statistics.
lm_config=    # Config for language model training.
lm_args=      # Arguments for LM training, e.g. "--max_epoch 10" (overrides args in the lm config).
use_word_lm=false # Whether to use word language model.
num_splits_lm=1   # Number of splitting for lm corpus.
# shellcheck disable=SC2034
word_vocab_size=10000 # Size of word vocabulary.

# ASR model related
asr_tag=       # Suffix to the result dir for asr model training.
asr_exp=       # Directory path for the ASR experiment; when set, asr_tag is ignored.
asr_stats_dir= # Directory path for ASR statistics.
asr_config=    # Config for asr model training.
asr_args=      # Arguments for ASR training, e.g. "--max_epoch 10" (overrides args in the asr config).
pretrained_model=          # Pretrained model to load.
ignore_init_mismatch=false # Ignore initial mismatch when loading the pretrained model.
feats_normalize=global_mvn # Normalization layer type.
num_splits_asr=1           # Number of splitting for asr corpus.

# Upload model related
hf_repo= # Target hugging face repository.

# Decoding related
use_k2=false             # Whether to use k2 based decoder.
k2_ctc_decoding=true     # Whether to use k2 CTC decoding.
use_nbest_rescoring=true # Use transformer-decoder and transformer language model for nbest rescoring.
num_paths=1000           # The 3rd argument of k2.random_paths.
# Decoding related (continued)
nll_batch_size=100 # Batch size for computing nll during nbest rescoring (affects GPU memory usage).
k2_config=./conf/decode_asr_transformer_with_k2.yaml # Config for k2-based decoding.
use_streaming=false # Whether to use streaming decoding.
use_maskctc=false   # Whether to use maskctc decoding.
batch_size=1        # Batch size used at inference time.
inference_tag=      # Suffix to the result dir for decoding.
inference_config=   # Config for decoding.
inference_args=     # Arguments for decoding, e.g. "--lm_weight 0.1" (overrides args in the inference config).
inference_lm=valid.loss.ave.pth      # Language model path for decoding.
inference_ngram=${ngram_num}gram.bin # Ngram LM file name for decoding.
inference_asr_model=valid.acc.ave.pth # ASR model path for decoding. Other examples:
#   inference_asr_model=train.loss.best.pth
#   inference_asr_model=3epoch.pth
#   inference_asr_model=valid.acc.best.pth
#   inference_asr_model=valid.loss.ave.pth
download_model= # Download a model from Model Zoo and use it for decoding.

# [Task dependent] Set the datadir name created by local/data.sh
train_set=      # Name of training set.
valid_set=      # Name of validation set used for monitoring/tuning network training.
test_sets=      # Names of test sets. Multiple items (e.g., both dev and eval sets) can be specified.
bpe_train_text= # Text file path of bpe training set.
lm_train_text=  # Text file path of language model training set.
lm_dev_text=    # Text file path of language model development set.
lm_test_text=   # Text file path of language model evaluation set.
nlsyms_txt=none # Non-linguistic symbol list if existing.
cleaner=none    # Text cleaner.
g2p=none        # g2p method (needed if token_type=phn).
lang=noinfo     # The language type of corpus.
score_opts=       # The options given to sclite scoring.
local_score_opts= # The options given to local/score.sh.
asr_speech_fold_length=800 # fold_length for speech data during ASR training.
asr_text_fold_length=150   # fold_length for text data during ASR training.
lm_fold_length=150         # fold_length for LM training.
oss_path=   # OSS upload path. NOTE(review): unused in this stage; presumably consumed elsewhere — confirm.
token_list= # Token list path.
scp=        # Name of the scp file (under each data dir) listing input speech, e.g. "wav.scp".
text=       # Transcription file name. NOTE(review): no longer used by the scoring step below; kept for CLI compatibility.
mode=       # Decoding mode passed to the inference tool via --mode.

help_message=$(cat << EOF
Usage: $0 --train-set "<train_set_name>" --valid-set "<valid_set_name>" --test_sets "<test_set_names>"

Options:
    # General configuration
    --stage          # Processes starts from the specified stage (default="${stage}").
    --stop_stage     # Processes is stopped at the specified stage (default="${stop_stage}").
    --skip_data_prep # Skip data preparation stages (default="${skip_data_prep}").
    --skip_train     # Skip training stages (default="${skip_train}").
    --skip_eval      # Skip decoding and evaluation stages (default="${skip_eval}").
    --skip_upload    # Skip packing and uploading stages (default="${skip_upload}").
    --ngpu           # The number of gpus ("0" uses cpu, otherwise use gpu, default="${ngpu}").
    --num_nodes      # The number of nodes (default="${num_nodes}").
    --nj             # The number of parallel jobs (default="${nj}").
    --inference_nj   # The number of parallel jobs in decoding (default="${inference_nj}").
    --gpu_inference  # Whether to perform gpu decoding (default="${gpu_inference}").
    --dumpdir        # Directory to dump features (default="${dumpdir}").
    --expdir         # Directory to save experiments (default="${expdir}").
    --python         # Specify python to execute espnet commands (default="${python}").

    # Data preparation related
    --local_data_opts # The options given to local/data.sh (default="${local_data_opts}").

    # Speed perturbation related
    --speed_perturb_factors # speed perturbation factors, e.g. "0.9 1.0 1.1" (separated by space, default="${speed_perturb_factors}").

    # Feature extraction related
    --feats_type       # Feature type (raw, fbank_pitch or extracted, default="${feats_type}").
    --audio_format     # Audio format: wav, flac, wav.ark, flac.ark (only in feats_type=raw, default="${audio_format}").
    --fs               # Sampling rate (default="${fs}").
    --min_wav_duration # Minimum duration in second (default="${min_wav_duration}").
    --max_wav_duration # Maximum duration in second (default="${max_wav_duration}").

    # Tokenization related
    --token_type              # Tokenization type (char or bpe, default="${token_type}").
    --nbpe                    # The number of BPE vocabulary (default="${nbpe}").
    --bpemode                 # Mode of BPE (unigram or bpe, default="${bpemode}").
    --oov                     # Out of vocabulary symbol (default="${oov}").
    --blank                   # CTC blank symbol (default="${blank}").
    --sos_eos                 # sos and eos symbol (default="${sos_eos}").
    --bpe_input_sentence_size # Size of input sentence for BPE (default="${bpe_input_sentence_size}").
    --bpe_nlsyms              # Non-linguistic symbol list for sentencepiece, separated by a comma. (default="${bpe_nlsyms}").
    --bpe_char_cover          # Character coverage when modeling BPE (default="${bpe_char_cover}").

    # Language model related
    --lm_tag          # Suffix to the result dir for language model training (default="${lm_tag}").
    --lm_exp          # Specify the directory path for LM experiment.
                      # If this option is specified, lm_tag is ignored (default="${lm_exp}").
    --lm_stats_dir    # Specify the directory path for LM statistics (default="${lm_stats_dir}").
    --lm_config       # Config for language model training (default="${lm_config}").
    --lm_args         # Arguments for language model training (default="${lm_args}").
                      # e.g., --lm_args "--max_epoch 10"
                      # Note that it will overwrite args in lm config.
    --use_word_lm     # Whether to use word language model (default="${use_word_lm}").
    --word_vocab_size # Size of word vocabulary (default="${word_vocab_size}").
    --num_splits_lm   # Number of splitting for lm corpus (default="${num_splits_lm}").

    # ASR model related
    --asr_tag               # Suffix to the result dir for asr model training (default="${asr_tag}").
    --asr_exp               # Specify the directory path for ASR experiment.
                            # If this option is specified, asr_tag is ignored (default="${asr_exp}").
    --asr_stats_dir         # Specify the directory path for ASR statistics (default="${asr_stats_dir}").
    --asr_config            # Config for asr model training (default="${asr_config}").
    --asr_args              # Arguments for asr model training (default="${asr_args}").
                            # e.g., --asr_args "--max_epoch 10"
                            # Note that it will overwrite args in asr config.
    --pretrained_model=     # Pretrained model to load (default="${pretrained_model}").
    --ignore_init_mismatch= # Ignore mismatch parameter init with pretrained model (default="${ignore_init_mismatch}").
    --feats_normalize       # Normalization layer type (default="${feats_normalize}").
    --num_splits_asr        # Number of splitting for lm corpus (default="${num_splits_asr}").

    # Decoding related
    --inference_tag       # Suffix to the result dir for decoding (default="${inference_tag}").
    --inference_config    # Config for decoding (default="${inference_config}").
    --inference_args      # Arguments for decoding (default="${inference_args}").
                          # e.g., --inference_args "--lm_weight 0.1"
                          # Note that it will overwrite args in inference config.
    --inference_lm        # Language model path for decoding (default="${inference_lm}").
    --inference_asr_model # ASR model path for decoding (default="${inference_asr_model}").
    --download_model      # Download a model from Model Zoo and use it for decoding (default="${download_model}").
    --use_streaming       # Whether to use streaming decoding (default="${use_streaming}").
    --use_maskctc         # Whether to use maskctc decoding (default="${use_maskctc}").

    # [Task dependent] Set the datadir name created by local/data.sh
    --train_set      # Name of training set (required).
    --valid_set      # Name of validation set used for monitoring/tuning network training (required).
    --test_sets      # Names of test sets.
                     # Multiple items (e.g., both dev and eval sets) can be specified (required).
    --bpe_train_text # Text file path of bpe training set.
    --lm_train_text  # Text file path of language model training set.
    --lm_dev_text    # Text file path of language model development set (default="${lm_dev_text}").
    --lm_test_text   # Text file path of language model evaluation set (default="${lm_test_text}").
    --nlsyms_txt     # Non-linguistic symbol list if existing (default="${nlsyms_txt}").
    --cleaner        # Text cleaner (default="${cleaner}").
    --g2p            # g2p method (default="${g2p}").
    --lang           # The language type of corpus (default=${lang}).
    --score_opts             # The options given to sclite scoring (default="${score_opts}").
    --local_score_opts       # The options given to local/score.sh (default="${local_score_opts}").
    --asr_speech_fold_length # fold_length for speech data during ASR training (default="${asr_speech_fold_length}").
    --asr_text_fold_length   # fold_length for text data during ASR training (default="${asr_text_fold_length}").
    --lm_fold_length         # fold_length for LM training (default="${lm_fold_length}").
EOF
)

log "$0 $*"
# Save command line args for logging (they will be lost after utils/parse_options.sh).
run_args=$(utils/print_args.py "$0" "$@")
. utils/parse_options.sh

if [ $# -ne 0 ]; then
    log "${help_message}"
    log "Error: No positional arguments are required."
    exit 2
fi

# Set absolute dump dir path under the data root.
dumpdir=${datadir}/${dumpdir}

# Derive a default result-dir tag for decoding when none was given,
# from the inference config name plus the k2-related switches.
if [ -z "${inference_tag}" ]; then
    if [ -n "${inference_config}" ]; then
        inference_tag="$(basename "${inference_config}" .yaml)"
    else
        inference_tag=inference
    fi
    if "${use_k2}"; then
        inference_tag+="_use_k2"
        inference_tag+="_k2_ctc_decoding_${k2_ctc_decoding}"
        inference_tag+="_use_nbest_rescoring_${use_nbest_rescoring}"
    fi
fi

# ========================== Main stages start from here. ==========================

if [ "${stage}" -le 12 ] && [ "${stop_stage}" -ge 12 ]; then
    log "Stage 12: Decoding: training_dir=${asr_exp}"

    if ${gpu_inference}; then
        _cmd="${cuda_cmd}"
        _ngpu=1
    else
        _cmd="${decode_cmd}"
        _ngpu=0
    fi

    # Build inference-tool options: decoding config plus optional (word) LM
    # and ngram files. Kept as a single space-separated string, expanded
    # unquoted below on purpose.
    _opts=
    if [ -n "${inference_config}" ]; then
        _opts+="--config ${inference_config} "
    fi
    if "${use_lm}"; then
        if "${use_word_lm}"; then
            _opts+="--word_lm_train_config ${lm_exp}/config.yaml "
            _opts+="--word_lm_file ${lm_exp}/${inference_lm} "
        else
            _opts+="--lm_train_config ${lm_exp}/config.yaml "
            _opts+="--lm_file ${lm_exp}/${inference_lm} "
        fi
    fi
    if "${use_ngram}"; then
        _opts+="--ngram_file ${ngram_exp}/${inference_ngram}"
        inference_tag=${inference_tag}.${inference_ngram}
    fi

    # 2. Generate run.sh
    log "Generate '${asr_exp}/${inference_tag}/run.sh'.
You can resume the process from stage 12 using this script"
    mkdir -p "${asr_exp}/${inference_tag}"
    echo "${run_args} --stage 12 \"\$@\"; exit \$?" > "${asr_exp}/${inference_tag}/run.sh"
    chmod +x "${asr_exp}/${inference_tag}/run.sh"

    if "${use_streaming}"; then
        asr_inference_tool="funasr.bin.asr_inference_streaming"
    elif "${use_maskctc}"; then
        asr_inference_tool="funasr.bin.asr_inference_maskctc"
    else
        asr_inference_tool="funasr.bin.asr_inference_launch"
    fi

    # ${test_sets} is intentionally unquoted: it is a space-separated list.
    for dset in ${test_sets}; do
        if [ "${feats_type}" = "ark_wav" ]; then
            _data="${dumpdir}/wav/${dset}"
        else
            _data="${dumpdir}/${feats_type}/${dset}"
        fi
        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
        _logdir="${_dir}/logdir"
        # Always start from a clean result dir so stale outputs from an
        # earlier run can never be mixed into the concatenation step below.
        if [ -d "${_dir}" ]; then
            rm -r "${_dir}"
        fi
        mkdir -p "${_logdir}"

        _scp=${scp}
        _type=kaldi_ark

        # 1. Split the key file across the decoding jobs.
        key_file=${_data}/${_scp}
        split_scps=""
        if "${use_k2}"; then
            # Now only _nj=1 is verified if using k2.
            _nj=1
        else
            _nj=$(min "${inference_nj}" "$(<${key_file} wc -l)")
        fi
        for n in $(seq "${_nj}"); do
            split_scps+=" ${_logdir}/keys.${n}.scp"
        done
        # shellcheck disable=SC2086
        utils/split_scp.pl "${key_file}" ${split_scps}

        # 2. Submit decoding jobs.
        log "Decoding started... log: '${_logdir}/asr_inference.*.log'"
        # ${mode} and ${gpuid_list} stay unquoted so an empty value drops the
        # argument instead of passing an empty string.
        # shellcheck disable=SC2086
        ${_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
            ${python} -m ${asr_inference_tool} \
                --batch_size ${batch_size} \
                --ngpu "${_ngpu}" \
                --njob ${njob} \
                --gpuid_list ${gpuid_list} \
                --data_path_and_name_and_type "${_data}/${_scp},speech,${_type}" \
                --key_file "${_logdir}"/keys.JOB.scp \
                --asr_train_config "${asr_exp}"/config.yaml \
                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
                --output_dir "${_logdir}"/output.JOB \
                --mode ${mode} \
                ${_opts} ${inference_args}

        # 3. Concatenate the output files from each job.
        for f in token token_int score text; do
            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
                for i in $(seq "${_nj}"); do
                    cat "${_logdir}/output.${i}/1best_recog/${f}"
                done | sort -k1 > "${_dir}/${f}"
            fi
        done

        # 4. Score: normalize hypothesis/reference text and compute WER/CER.
        # NOTE(review): these previously wrote to "${_dir}/${text}.proc" where
        # ${text} is empty by default, so compute_wer.py could not find the
        # "text.proc" files it reads; fixed to write "text.proc" directly.
        ${python} utils/proce_text.py "${_dir}/text" "${_dir}/text.proc"
        ${python} utils/proce_text.py "${_data}/text" "${_data}/text.proc"
        ${python} utils/compute_wer.py "${_data}/text.proc" "${_dir}/text.proc" "${_dir}/text.cer"
        tail -n 3 "${_dir}/text.cer" > "${_dir}/text.cer.txt"
        cat "${_dir}/text.cer.txt"
    done
fi

log "Successfully finished. [elapsed=${SECONDS}s]"