#!/usr/bin/env bash

# Run bash in strict mode. The script aborts on:
#   -e           a command that exits with a non-zero status,
#   -u           expansion of an undefined variable,
#   -o pipefail  a failure in any stage of a pipeline.
# Command tracing (-x) is not enabled here; add `set -x` when debugging.
set -e
set -u
set -o pipefail

# Print a timestamped log line to stdout.
# The prefix "<YYYY-mm-ddTHH:MM:SS> (<file>:<line>:<function>)" identifies the
# caller of log(), not log() itself.
# Arguments: $* - message text (backslash escapes are interpreted by `echo -e`).
log() {
  # Base name of the file that called log().
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# Print the smallest of the given integers.
# Arguments: $@ - one or more integers.
# Outputs:   the minimum value on stdout.
# Returns:   1 (with a message on stderr) when called without arguments.
min() {
  if [ $# -eq 0 ]; then
    echo "min: at least one integer argument is required" >&2
    return 1
  fi
  local a b
  a=$1
  for b in "$@"; do
    if [ "${b}" -le "${a}" ]; then
      a="${b}"
    fi
  done
  printf '%s\n' "${a}"
}

# Reset bash's built-in timer; the elapsed time is reported at the end of the script.
SECONDS=0

# Default values below can be overridden from the command line as --<name> <value>
# (handled by utils/parse_options.sh, which is sourced later in this script).

# General configuration
stage=1              # Processing starts from the specified stage.
stop_stage=10000     # Processing stops at the specified stage.
skip_data_prep=true  # Skip data preparation stages.
skip_train=false     # Skip training stages.
skip_eval=false      # Skip decoding and evaluation stages.
skip_upload=true     # Skip packing and uploading stages.
skip_upload_hf=true  # Skip uploading to hugging face stages.
cuda_cmd=utils/run.pl    # Job launcher used when gpu_inference=true.
decode_cmd=utils/run.pl  # Job launcher used when gpu_inference=false.
ngpu=1               # The number of gpus ("0" uses cpu, otherwise use gpu).
njob=1               # The number of jobs for each gpu.
gpuid_list=          # Value forwarded to the inference tool as --gpuid_list.
num_nodes=1          # The number of nodes.
nj=32                # The number of parallel jobs.
inference_nj=32      # The number of parallel jobs in decoding.
gpu_inference=true   # Whether to perform gpu decoding, set false for cpu decoding.
datadir="./"         # Root directory that dumpdir is placed under.
dumpdir=dump         # Directory to dump features.
expdir=exp           # Directory to save experiments.
python=python        # Specify python to execute funasr commands.

# Data preparation related
local_data_opts=  # The options given to local/data.sh.

# Speed perturbation related
speed_perturb_factors=  # Perturbation factors, e.g. "0.9 1.0 1.1" (separated by space).

# Feature extraction related
feats_type=fbank       # Feature type (e.g. fbank; "ark_wav" reads from the wav/ dump dir).
feats_dim=
audio_format=flac      # Audio format: wav, flac, wav.ark, flac.ark (only in feats_type=raw).
fs=16k                 # Sampling rate.
min_wav_duration=0.1   # Minimum duration in seconds.
max_wav_duration=20    # Maximum duration in seconds.

# Tokenization related
token_type=bpe      # Tokenization type (char or bpe).
nbpe=30             # The number of BPE vocabulary.
bpemode=unigram     # Mode of BPE (unigram or bpe).
oov="<unk>"         # Out of vocabulary symbol.
blank="<blank>"     # CTC blank symbol.
sos_eos="<sos/eos>" # sos and eos symbols.
bpe_input_sentence_size=100000000 # Size of input sentence for BPE.
bpe_nlsyms=         # Non-linguistic symbols list, separated by a comma, for BPE.
bpe_char_cover=1.0  # Character coverage when modeling BPE.

# Ngram model related
use_ngram=false
ngram_exp=
ngram_num=3

# Language model related
use_lm=false      # Use language model for ASR decoding.
lm_tag=           # Suffix to the result dir for language model training.
lm_exp=           # Specify the directory path for LM experiment.
                  # If this option is specified, lm_tag is ignored.
lm_stats_dir=     # Specify the directory path for LM statistics.
lm_config=        # Config for language model training.
lm_args=          # Arguments for language model training, e.g., "--max_epoch 10".
                  # Note that it will overwrite args in lm config.
use_word_lm=false # Whether to use word language model.
num_splits_lm=1   # Number of splits for the lm corpus.
# shellcheck disable=SC2034
word_vocab_size=10000 # Size of word vocabulary.

# ASR model related
asr_tag=                   # Suffix to the result dir for asr model training.
asr_exp=                   # Specify the directory path for ASR experiment.
                           # If this option is specified, asr_tag is ignored.
asr_stats_dir=             # Specify the directory path for ASR statistics.
asr_config=                # Config for asr model training.
asr_args=                  # Arguments for asr model training, e.g., "--max_epoch 10".
                           # Note that it will overwrite args in asr config.
pretrained_model=          # Pretrained model to load.
ignore_init_mismatch=false # Ignore initial mismatch.
feats_normalize=global_mvn # Normalization layer type.
num_splits_asr=1           # Number of splits for the asr corpus.

# Upload model related
hf_repo=

# Decoding related
use_k2=false              # Whether to use k2 based decoder.
k2_ctc_decoding=true
use_nbest_rescoring=true  # Use transformer-decoder
                          # and transformer language model for nbest rescoring.
num_paths=1000            # The 3rd argument of k2.random_paths.
nll_batch_size=100        # Affects GPU memory usage when computing nll
                          # during nbest rescoring.
k2_config=./conf/decode_asr_transformer_with_k2.yaml

use_streaming=false       # Whether to use streaming decoding.

use_maskctc=false         # Whether to use maskctc decoding.

batch_size=1
inference_tag=            # Suffix to the result dir for decoding.
inference_config=         # Config for decoding.
inference_args=           # Arguments for decoding, e.g., "--lm_weight 0.1".
                          # Note that it will overwrite args in inference config.
inference_lm=valid.loss.ave.pth       # Language model path for decoding.
inference_ngram=${ngram_num}gram.bin
inference_asr_model=valid.acc.ave.pth # ASR model path for decoding.
                                      # e.g.
                                      # inference_asr_model=train.loss.best.pth
                                      # inference_asr_model=3epoch.pth
                                      # inference_asr_model=valid.acc.best.pth
                                      # inference_asr_model=valid.loss.ave.pth
download_model=           # Download a model from Model Zoo and use it for decoding.

# [Task dependent] Set the datadir name created by local/data.sh
train_set=       # Name of training set.
valid_set=       # Name of validation set used for monitoring/tuning network training.
test_sets=       # Names of test sets. Multiple items (e.g., both dev and eval sets) can be specified.
bpe_train_text=  # Text file path of bpe training set.
lm_train_text=   # Text file path of language model training set.
lm_dev_text=     # Text file path of language model development set.
lm_test_text=    # Text file path of language model evaluation set.
nlsyms_txt=none  # Non-linguistic symbol list if existing.
cleaner=none     # Text cleaner.
g2p=none         # g2p method (needed if token_type=phn).
lang=noinfo      # The language type of corpus.
score_opts=                # The options given to sclite scoring.
local_score_opts=          # The options given to local/score.sh.
asr_speech_fold_length=800 # fold_length for speech data during ASR training.
asr_text_fold_length=150   # fold_length for text data during ASR training.
lm_fold_length=150         # fold_length for LM training.

oss_path=
token_list=
scp=   # Name of the scp file inside each test-set dump dir; also used as the key file.
text=  # Base name used for the processed hypothesis/reference text file (<text>.proc).

mode=  # Value forwarded to the inference tool as --mode.

# Usage text shown when the script is invoked with unexpected positional arguments.
# The heredoc delimiter is unquoted on purpose so that the current default values
# are interpolated into the message.
help_message=$(cat << EOF
Usage: $0 --train-set "<train_set_name>" --valid-set "<valid_set_name>" --test_sets "<test_set_names>"

Options:
    # General configuration
    --stage          # Processes starts from the specified stage (default="${stage}").
    --stop_stage     # Processes is stopped at the specified stage (default="${stop_stage}").
    --skip_data_prep # Skip data preparation stages (default="${skip_data_prep}").
    --skip_train     # Skip training stages (default="${skip_train}").
    --skip_eval      # Skip decoding and evaluation stages (default="${skip_eval}").
    --skip_upload    # Skip packing and uploading stages (default="${skip_upload}").
    --ngpu           # The number of gpus ("0" uses cpu, otherwise use gpu, default="${ngpu}").
    --num_nodes      # The number of nodes (default="${num_nodes}").
    --nj             # The number of parallel jobs (default="${nj}").
    --inference_nj   # The number of parallel jobs in decoding (default="${inference_nj}").
    --gpu_inference  # Whether to perform gpu decoding (default="${gpu_inference}").
    --dumpdir        # Directory to dump features (default="${dumpdir}").
    --expdir         # Directory to save experiments (default="${expdir}").
    --python         # Specify python to execute funasr commands (default="${python}").

    # Data preparation related
    --local_data_opts # The options given to local/data.sh (default="${local_data_opts}").

    # Speed perturbation related
    --speed_perturb_factors # speed perturbation factors, e.g. "0.9 1.0 1.1" (separated by space, default="${speed_perturb_factors}").

    # Feature extraction related
    --feats_type       # Feature type (raw, fbank_pitch or extracted, default="${feats_type}").
    --audio_format     # Audio format: wav, flac, wav.ark, flac.ark (only in feats_type=raw, default="${audio_format}").
    --fs               # Sampling rate (default="${fs}").
    --min_wav_duration # Minimum duration in second (default="${min_wav_duration}").
    --max_wav_duration # Maximum duration in second (default="${max_wav_duration}").

    # Tokenization related
    --token_type              # Tokenization type (char or bpe, default="${token_type}").
    --nbpe                    # The number of BPE vocabulary (default="${nbpe}").
    --bpemode                 # Mode of BPE (unigram or bpe, default="${bpemode}").
    --oov                     # Out of vocabulary symbol (default="${oov}").
    --blank                   # CTC blank symbol (default="${blank}").
    --sos_eos                 # sos and eos symbole (default="${sos_eos}").
    --bpe_input_sentence_size # Size of input sentence for BPE (default="${bpe_input_sentence_size}").
    --bpe_nlsyms              # Non-linguistic symbol list for sentencepiece, separated by a comma. (default="${bpe_nlsyms}").
    --bpe_char_cover          # Character coverage when modeling BPE (default="${bpe_char_cover}").

    # Language model related
    --lm_tag          # Suffix to the result dir for language model training (default="${lm_tag}").
    --lm_exp          # Specify the directory path for LM experiment.
                      # If this option is specified, lm_tag is ignored (default="${lm_exp}").
    --lm_stats_dir    # Specify the directory path for LM statistics (default="${lm_stats_dir}").
    --lm_config       # Config for language model training (default="${lm_config}").
    --lm_args         # Arguments for language model training (default="${lm_args}").
                      # e.g., --lm_args "--max_epoch 10"
                      # Note that it will overwrite args in lm config.
    --use_word_lm     # Whether to use word language model (default="${use_word_lm}").
    --word_vocab_size # Size of word vocabulary (default="${word_vocab_size}").
    --num_splits_lm   # Number of splitting for lm corpus (default="${num_splits_lm}").

    # ASR model related
    --asr_tag              # Suffix to the result dir for asr model training (default="${asr_tag}").
    --asr_exp              # Specify the directory path for ASR experiment.
                           # If this option is specified, asr_tag is ignored (default="${asr_exp}").
    --asr_stats_dir        # Specify the directory path for ASR statistics (default="${asr_stats_dir}").
    --asr_config           # Config for asr model training (default="${asr_config}").
    --asr_args             # Arguments for asr model training (default="${asr_args}").
                           # e.g., --asr_args "--max_epoch 10"
                           # Note that it will overwrite args in asr config.
    --pretrained_model     # Pretrained model to load (default="${pretrained_model}").
    --ignore_init_mismatch # Ignore mismatch parameter init with pretrained model (default="${ignore_init_mismatch}").
    --feats_normalize      # Normalizaton layer type (default="${feats_normalize}").
    --num_splits_asr       # Number of splitting for lm corpus (default="${num_splits_asr}").

    # Decoding related
    --inference_tag       # Suffix to the result dir for decoding (default="${inference_tag}").
    --inference_config    # Config for decoding (default="${inference_config}").
    --inference_args      # Arguments for decoding (default="${inference_args}").
                          # e.g., --inference_args "--lm_weight 0.1"
                          # Note that it will overwrite args in inference config.
    --inference_lm        # Language model path for decoding (default="${inference_lm}").
    --inference_asr_model # ASR model path for decoding (default="${inference_asr_model}").
    --download_model      # Download a model from Model Zoo and use it for decoding (default="${download_model}").
    --use_streaming       # Whether to use streaming decoding (default="${use_streaming}").
    --use_maskctc         # Whether to use maskctc decoding (default="${use_maskctc}").

    # [Task dependent] Set the datadir name created by local/data.sh
    --train_set      # Name of training set (required).
    --valid_set      # Name of validation set used for monitoring/tuning network training (required).
    --test_sets      # Names of test sets.
                     # Multiple items (e.g., both dev and eval sets) can be specified (required).
    --bpe_train_text # Text file path of bpe training set.
    --lm_train_text  # Text file path of language model training set.
    --lm_dev_text    # Text file path of language model development set (default="${lm_dev_text}").
    --lm_test_text   # Text file path of language model evaluation set (default="${lm_test_text}").
    --nlsyms_txt     # Non-linguistic symbol list if existing (default="${nlsyms_txt}").
    --cleaner        # Text cleaner (default="${cleaner}").
    --g2p            # g2p method (default="${g2p}").
    --lang           # The language type of corpus (default=${lang}).
    --score_opts             # The options given to sclite scoring (default="${score_opts}").
    --local_score_opts       # The options given to local/score.sh (default="${local_score_opts}").
    --asr_speech_fold_length # fold_length for speech data during ASR training (default="${asr_speech_fold_length}").
    --asr_text_fold_length   # fold_length for text data during ASR training (default="${asr_text_fold_length}").
    --lm_fold_length         # fold_length for LM training (default="${lm_fold_length}").
EOF
)

# Record the full command line in the log.
log "$0 $*"

# Save command line args for logging (they will be lost after utils/parse_options.sh)
run_args=$(utils/print_args.py "$0" "$@")
# Sourced so that --<name> <value> options can override the defaults defined above.
. utils/parse_options.sh

# Everything must be passed as an option; leftover positional arguments are an error.
if [ $# -ne 0 ]; then
  log "${help_message}"
  log "Error: No positional arguments are required."
  exit 2
fi

# Place the dump dir under datadir (the result is only absolute if datadir is).
dumpdir=${datadir}/${dumpdir}

# Derive a name for the decoding result directory when none was given:
# the config file's base name (without .yaml), or "inference" as a fallback.
if [ -z "${inference_tag}" ]; then
  if [ -n "${inference_config}" ]; then
    inference_tag="$(basename "${inference_config}" .yaml)"
  else
    inference_tag=inference
  fi

  # Encode the k2 decoding settings in the tag so that runs do not collide.
  if "${use_k2}"; then
    inference_tag+="_use_k2"
    inference_tag+="_k2_ctc_decoding_${k2_ctc_decoding}"
    inference_tag+="_use_nbest_rescoring_${use_nbest_rescoring}"
  fi
fi

# ========================== Main stages start from here. ==========================

# Stage 12: decode every test set with the trained ASR model and score the result.
if [ "${stage}" -le 12 ] && [ "${stop_stage}" -ge 12 ]; then
  log "Stage 12: Decoding: training_dir=${asr_exp}"

  # Pick the job launcher and the per-job GPU count.
  if ${gpu_inference}; then
    _cmd="${cuda_cmd}"
    _ngpu=1
  else
    _cmd="${decode_cmd}"
    _ngpu=0
  fi

  # Optional arguments for the inference tool, accumulated as a space-separated
  # string (expanded unquoted below, so values must not contain whitespace).
  _opts=
  if [ -n "${inference_config}" ]; then
    _opts+="--config ${inference_config} "
  fi

  if "${use_lm}"; then
    if "${use_word_lm}"; then
      _opts+="--word_lm_train_config ${lm_exp}/config.yaml "
      _opts+="--word_lm_file ${lm_exp}/${inference_lm} "
    else
      _opts+="--lm_train_config ${lm_exp}/config.yaml "
      _opts+="--lm_file ${lm_exp}/${inference_lm} "
    fi
  fi

  if "${use_ngram}"; then
    _opts+="--ngram_file ${ngram_exp}/${inference_ngram} "
    inference_tag=${inference_tag}.${inference_ngram}
  fi

  # Only pass --gpuid_list / --mode when a value is set. With an empty value the
  # bare flag would otherwise consume the following option as its argument.
  if [ -n "${gpuid_list}" ]; then
    _opts+="--gpuid_list ${gpuid_list} "
  fi
  if [ -n "${mode}" ]; then
    _opts+="--mode ${mode} "
  fi

  # Generate run.sh
  log "Generate '${asr_exp}/${inference_tag}/run.sh'. You can resume the process from stage 12 using this script"
  mkdir -p "${asr_exp}/${inference_tag}"
  echo "${run_args} --stage 12 \"\$@\"; exit \$?" > "${asr_exp}/${inference_tag}/run.sh"
  chmod +x "${asr_exp}/${inference_tag}/run.sh"

  # Select the python module that performs the decoding.
  if "${use_streaming}"; then
    asr_inference_tool="funasr.bin.asr_inference_streaming"
  elif "${use_maskctc}"; then
    asr_inference_tool="funasr.bin.asr_inference_maskctc"
  else
    asr_inference_tool="funasr.bin.asr_inference_launch"
  fi

  # test_sets is a space-separated list, hence the intentionally unquoted expansion.
  for dset in ${test_sets}; do
    if [ "${feats_type}" = "ark_wav" ]; then
      _data="${dumpdir}/wav/${dset}"
    else
      _data="${dumpdir}/${feats_type}/${dset}"
    fi
    _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
    _logdir="${_dir}/logdir"

    # Always decode from scratch: results of a previous run are removed.
    if [ -d "${_dir}" ]; then
      rm -r -- "${_dir:?}"
    fi
    mkdir -p "${_logdir}"

    _scp=${scp}
    _type=kaldi_ark

    # 1. Split the key file
    key_file=${_data}/${_scp}
    if [ ! -f "${key_file}" ]; then
      log "Error: key file '${key_file}' does not exist (check --scp and --test_sets)."
      exit 1
    fi
    split_scps=""
    if "${use_k2}"; then
      # Now only _nj=1 is verified if using k2
      _nj=1
    else
      # Never start more jobs than there are entries in the key file.
      _nj=$(min "${inference_nj}" "$(wc -l < "${key_file}")")
    fi

    for n in $(seq "${_nj}"); do
      split_scps+=" ${_logdir}/keys.${n}.scp"
    done
    # shellcheck disable=SC2086
    utils/split_scp.pl "${key_file}" ${split_scps}

    # 2. Submit decoding jobs
    log "Decoding started... log: '${_logdir}/asr_inference.*.log'"
    # shellcheck disable=SC2086
    ${_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
      ${python} -m ${asr_inference_tool} \
        --batch_size ${batch_size} \
        --ngpu "${_ngpu}" \
        --njob ${njob} \
        --data_path_and_name_and_type "${_data}/${_scp},speech,${_type}" \
        --key_file "${_logdir}"/keys.JOB.scp \
        --asr_train_config "${asr_exp}"/config.yaml \
        --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
        --output_dir "${_logdir}"/output.JOB \
        ${_opts} ${inference_args}

    # 3. Concatenates the output files from each jobs
    for f in token token_int score text; do
      if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
        for i in $(seq "${_nj}"); do
          cat "${_logdir}/output.${i}/1best_recog/${f}"
        done | sort -k1 >"${_dir}/${f}"
      fi
    done

    # 4. Normalize hypothesis and reference text, then compute the error rate.
    # The processed files are named "<text>.proc"; fall back to "text" when
    # --text is empty so the names written here match the ones read by
    # compute_wer.py.
    _text_name=${text:-text}
    ${python} utils/proce_text.py "${_dir}/text" "${_dir}/${_text_name}.proc"
    ${python} utils/proce_text.py "${_data}/text" "${_data}/${_text_name}.proc"
    ${python} utils/compute_wer.py "${_data}/${_text_name}.proc" "${_dir}/${_text_name}.proc" "${_dir}/text.cer"
    # Keep and print only the last three lines of the scoring report.
    tail -n 3 "${_dir}/text.cer" > "${_dir}/text.cer.txt"
    cat "${_dir}/text.cer.txt"
  done
fi

log "Successfully finished. [elapsed=${SECONDS}s]"