Merge pull request #79 from alibaba-damo-academy/dev_wjm
Dev wjm
7个文件已修改
15个文件已添加
3 文件已复制
7 文件已重命名
| New file |
| | |
| | | beam_size: 1 |
| | | penalty: 0.0 |
| | | maxlenratio: 0.0 |
| | | minlenratio: 0.0 |
| | | ctc_weight: 0.0 |
| | | lm_weight: 0.15 |
| New file |
| | |
| | | # network architecture |
| | | # encoder related |
| | | encoder: data2vec_encoder |
| | | encoder_conf: |
| | | extractor_mode: layer_norm |
| | | encoder_layerdrop: 0.1 |
| | | dropout_input: 0.0 |
| | | dropout_features: 0.0 |
| | | feature_grad_mult: 0.0 |
| | | encoder_embed_dim: 768 |
| | | |
| | | mask_prob: 0.65 |
| | | mask_length: 10 |
| | | |
| | | loss_beta: 0 |
| | | loss_scale: null |
| | | |
| | | instance_norm_target_layer: true |
| | | average_top_k_layers: 8 |
| | | |
| | | pos_conv_depth: 5 |
| | | conv_pos: 95 |
| | | |
| | | ema_decay: 0.999 |
| | | ema_end_decay: 0.9999 |
| | | ema_anneal_end_step: 30000 |
| | | ema_transformer_only: true |
| | | ema_layers_only: true |
| | | |
| | | require_same_masks: true |
| | | mask_dropout: 0 |
| | | |
| | | |
| | | # decoder related |
| | | decoder: paraformer_decoder_san |
| | | decoder_conf: |
| | | attention_heads: 12 |
| | | linear_units: 3072 |
| | | num_blocks: 6 |
| | | dropout_rate: 0.1 |
| | | positional_dropout_rate: 0.1 |
| | | self_attention_dropout_rate: 0.0 |
| | | src_attention_dropout_rate: 0.0 |
| | | |
| | | model: paraformer |
| | | model_conf: |
| | | ctc_weight: 0.3 |
| | | lsm_weight: 0.1 |
| | | length_normalized_loss: false |
| | | predictor_weight: 1.0 |
| | | sampling_ratio: 0.4 |
| | | |
| | | # minibatch related |
| | | batch_type: length |
| | | batch_bins: 25000 |
| | | num_workers: 16 |
| | | |
| | | # optimization related |
| | | accum_grad: 1 |
| | | grad_clip: 5 |
| | | max_epoch: 50 |
| | | val_scheduler_criterion: |
| | | - valid |
| | | - acc |
| | | best_model_criterion: |
| | | - - valid |
| | | - acc |
| | | - max |
| | | keep_nbest_models: 10 |
| | | |
| | | optim: adam |
| | | optim_conf: |
| | | lr: 0.00002 |
| | | scheduler: warmuplr |
| | | scheduler_conf: |
| | | warmup_steps: 30000 |
| | | |
| | | specaug: specaug |
| | | specaug_conf: |
| | | apply_time_warp: true |
| | | time_warp_window: 5 |
| | | time_warp_mode: bicubic |
| | | apply_freq_mask: true |
| | | freq_mask_width_range: |
| | | - 0 |
| | | - 30 |
| | | num_freq_mask: 2 |
| | | apply_time_mask: true |
| | | time_mask_width_range: |
| | | - 0 |
| | | - 40 |
| | | num_time_mask: 2 |
| | | |
| | | predictor: cif_predictor |
| | | predictor_conf: |
| | | idim: 768 |
| | | threshold: 1.0 |
| | | l_order: 1 |
| | | r_order: 1 |
| | | |
| | | |
| | | log_interval: 50 |
| | | unused_parameters: true |
| | | normalize: None |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | . ./path.sh || exit 1; |
| | | |
| | | # machines configuration |
| | | CUDA_VISIBLE_DEVICES="0,1" |
| | | gpu_num=2 |
| | | count=1 |
| | | gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding |
| | | # for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob |
| | | njob=5 |
| | | train_cmd=utils/run.pl |
| | | infer_cmd=utils/run.pl |
| | | |
| | | # general configuration |
| | | feats_dir="../DATA" #feature output dictionary, for large data |
| | | exp_dir="." |
| | | lang=zh |
| | | dumpdir=dump/fbank |
| | | feats_type=fbank |
| | | token_type=char |
| | | scp=feats.scp |
| | | type=kaldi_ark |
| | | stage=0 |
| | | stop_stage=4 |
| | | |
| | | # feature configuration |
| | | feats_dim=80 |
| | | sample_frequency=16000 |
| | | nj=32 |
| | | speed_perturb="0.9,1.0,1.1" |
| | | |
| | | # data |
| | | data_aishell= |
| | | |
| | | # exp tag |
| | | tag="" |
| | | |
| | | model_name=damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch |
| | | init_param="$HOME/.cache/modelscope/hub/$model_name/basemodel.pb" |
| | | |
| | | . utils/parse_options.sh || exit 1; |
| | | |
| | | # Set bash to 'debug' mode, it will exit on : |
| | | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', |
| | | set -e |
| | | set -u |
| | | set -o pipefail |
| | | |
| | | train_set=train |
| | | valid_set=dev |
| | | test_sets="dev test" |
| | | |
| | | asr_config=conf/train_asr_paraformer_transformer_12e_6d_3072_768.yaml |
| | | model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" |
| | | |
| | | inference_config=conf/decode_asr_transformer_noctc_1best.yaml |
| | | inference_asr_model=valid.acc.ave_10best.pth |
| | | |
| | | # you can set gpu num for decoding here |
| | | gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default |
| | | ngpu=$(echo $gpuid_list | awk -F "," '{print NF}') |
| | | |
| | | if ${gpu_inference}; then |
| | | inference_nj=$[${ngpu}*${njob}] |
| | | _ngpu=1 |
| | | else |
| | | inference_nj=$njob |
| | | _ngpu=0 |
| | | fi |
| | | |
| | | if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then |
| | | echo "stage 0: Data preparation" |
| | | # Data preparation |
| | | local/aishell_data_prep.sh ${data_aishell}/data_aishell/wav ${data_aishell}/data_aishell/transcript ${feats_dir} |
| | | for x in train dev test; do |
| | | cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org |
| | | paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \ |
| | | > ${feats_dir}/data/${x}/text |
| | | utils/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org |
| | | mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text |
| | | done |
| | | fi |
| | | |
| | | feat_train_dir=${feats_dir}/${dumpdir}/train; mkdir -p ${feat_train_dir} |
| | | feat_dev_dir=${feats_dir}/${dumpdir}/dev; mkdir -p ${feat_dev_dir} |
| | | feat_test_dir=${feats_dir}/${dumpdir}/test; mkdir -p ${feat_test_dir} |
| | | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then |
| | | echo "stage 1: Feature Generation" |
| | | # compute fbank features |
| | | fbankdir=${feats_dir}/fbank |
| | | utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} --speed_perturb ${speed_perturb} \ |
| | | ${feats_dir}/data/train ${exp_dir}/exp/make_fbank/train ${fbankdir}/train |
| | | utils/fix_data_feat.sh ${fbankdir}/train |
| | | utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \ |
| | | ${feats_dir}/data/dev ${exp_dir}/exp/make_fbank/dev ${fbankdir}/dev |
| | | utils/fix_data_feat.sh ${fbankdir}/dev |
| | | utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \ |
| | | ${feats_dir}/data/test ${exp_dir}/exp/make_fbank/test ${fbankdir}/test |
| | | utils/fix_data_feat.sh ${fbankdir}/test |
| | | |
| | | # compute global cmvn |
| | | utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} \ |
| | | ${fbankdir}/train ${exp_dir}/exp/make_fbank/train |
| | | |
| | | # apply cmvn |
| | | utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \ |
| | | ${fbankdir}/train ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/train ${feat_train_dir} |
| | | utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \ |
| | | ${fbankdir}/dev ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/dev ${feat_dev_dir} |
| | | utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \ |
| | | ${fbankdir}/test ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/test ${feat_test_dir} |
| | | |
| | | cp ${fbankdir}/train/text ${fbankdir}/train/speech_shape ${fbankdir}/train/text_shape ${feat_train_dir} |
| | | cp ${fbankdir}/dev/text ${fbankdir}/dev/speech_shape ${fbankdir}/dev/text_shape ${feat_dev_dir} |
| | | cp ${fbankdir}/test/text ${fbankdir}/test/speech_shape ${fbankdir}/test/text_shape ${feat_test_dir} |
| | | |
| | | utils/fix_data_feat.sh ${feat_train_dir} |
| | | utils/fix_data_feat.sh ${feat_dev_dir} |
| | | utils/fix_data_feat.sh ${feat_test_dir} |
| | | |
| | | #generate ark list |
| | | utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_train_dir} ${fbankdir}/train ${feat_train_dir} |
| | | utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_dev_dir} ${fbankdir}/dev ${feat_dev_dir} |
| | | fi |
| | | |
| | | token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt |
| | | echo "dictionary: ${token_list}" |
| | | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then |
| | | echo "stage 2: Dictionary Preparation" |
| | | mkdir -p ${feats_dir}/data/${lang}_token_list/char/ |
| | | |
| | | echo "make a dictionary" |
| | | echo "<blank>" > ${token_list} |
| | | echo "<s>" >> ${token_list} |
| | | echo "</s>" >> ${token_list} |
| | | utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/train/text | cut -f 2- -d" " | tr " " "\n" \ |
| | | | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list} |
| | | num_token=$(cat ${token_list} | wc -l) |
| | | echo "<unk>" >> ${token_list} |
| | | vocab_size=$(cat ${token_list} | wc -l) |
| | | awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char |
| | | awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char |
| | | mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/train |
| | | mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/dev |
| | | cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/train |
| | | cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/dev |
| | | fi |
| | | |
| | | # Training Stage |
| | | world_size=$gpu_num # run on one machine |
| | | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then |
| | | echo "stage 3: Training" |
| | | python utils/download_model.py --model_name ${model_name} # download pretrained model on ModelScope |
| | | mkdir -p ${exp_dir}/exp/${model_dir} |
| | | mkdir -p ${exp_dir}/exp/${model_dir}/log |
| | | INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init |
| | | if [ -f $INIT_FILE ];then |
| | | rm -f $INIT_FILE |
| | | fi |
| | | init_method=file://$(readlink -f $INIT_FILE) |
| | | echo "$0: init method is $init_method" |
| | | for ((i = 0; i < $gpu_num; ++i)); do |
| | | { |
| | | rank=$i |
| | | local_rank=$i |
| | | gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) |
| | | asr_train_paraformer.py \ |
| | | --gpu_id $gpu_id \ |
| | | --use_preprocessor true \ |
| | | --token_type char \ |
| | | --token_list $token_list \ |
| | | --train_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${train_set}/${scp},speech,${type} \ |
| | | --train_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${train_set}/text,text,text \ |
| | | --train_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${train_set}/speech_shape \ |
| | | --train_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${train_set}/text_shape.char \ |
| | | --valid_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${valid_set}/${scp},speech,${type} \ |
| | | --valid_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${valid_set}/text,text,text \ |
| | | --valid_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}/speech_shape \ |
| | | --valid_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}/text_shape.char \ |
| | | --init_param ${init_param} \ |
| | | --resume true \ |
| | | --output_dir ${exp_dir}/exp/${model_dir} \ |
| | | --config $asr_config \ |
| | | --input_size $feats_dim \ |
| | | --ngpu $gpu_num \ |
| | | --num_worker_count $count \ |
| | | --multiprocessing_distributed true \ |
| | | --dist_init_method $init_method \ |
| | | --dist_world_size $world_size \ |
| | | --dist_rank $rank \ |
| | | --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1 |
| | | } & |
| | | done |
| | | wait |
| | | fi |
| | | |
| | | # Testing Stage |
| | | if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then |
| | | echo "stage 4: Inference" |
| | | for dset in ${test_sets}; do |
| | | asr_exp=${exp_dir}/exp/${model_dir} |
| | | inference_tag="$(basename "${inference_config}" .yaml)" |
| | | _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}" |
| | | _logdir="${_dir}/logdir" |
| | | if [ -d ${_dir} ]; then |
| | | echo "${_dir} is already exists. if you want to decode again, please delete this dir first." |
| | | exit 0 |
| | | fi |
| | | mkdir -p "${_logdir}" |
| | | _data="${feats_dir}/${dumpdir}/${dset}" |
| | | key_file=${_data}/${scp} |
| | | num_scp_file="$(<${key_file} wc -l)" |
| | | _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file") |
| | | split_scps= |
| | | for n in $(seq "${_nj}"); do |
| | | split_scps+=" ${_logdir}/keys.${n}.scp" |
| | | done |
| | | # shellcheck disable=SC2086 |
| | | utils/split_scp.pl "${key_file}" ${split_scps} |
| | | _opts= |
| | | if [ -n "${inference_config}" ]; then |
| | | _opts+="--config ${inference_config} " |
| | | fi |
| | | ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \ |
| | | python -m funasr.bin.asr_inference_launch \ |
| | | --batch_size 1 \ |
| | | --ngpu "${_ngpu}" \ |
| | | --njob ${njob} \ |
| | | --gpuid_list ${gpuid_list} \ |
| | | --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \ |
| | | --key_file "${_logdir}"/keys.JOB.scp \ |
| | | --asr_train_config "${asr_exp}"/config.yaml \ |
| | | --asr_model_file "${asr_exp}"/"${inference_asr_model}" \ |
| | | --output_dir "${_logdir}"/output.JOB \ |
| | | --mode paraformer \ |
| | | ${_opts} |
| | | |
| | | for f in token token_int score text; do |
| | | if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then |
| | | for i in $(seq "${_nj}"); do |
| | | cat "${_logdir}/output.${i}/1best_recog/${f}" |
| | | done | sort -k1 >"${_dir}/${f}" |
| | | fi |
| | | done |
| | | python utils/proce_text.py ${_dir}/text ${_dir}/text.proc |
| | | python utils/proce_text.py ${_data}/text ${_data}/text.proc |
| | | python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer |
| | | tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt |
| | | cat ${_dir}/text.cer.txt |
| | | done |
| | | fi |
| New file |
| | |
| | | ../../aishell/transformer/utils |
copy from egs/aishell/data2vec_finetune/local/aishell_data_prep.sh
copy to egs/aishell/data2vec_transformer_finetune/local/aishell_data_prep.sh
copy from egs/aishell/data2vec_finetune/local/prepare_data.sh
copy to egs/aishell/data2vec_transformer_finetune/local/prepare_data.sh
copy from egs/aishell/data2vec_finetune/path.sh
copy to egs/aishell/data2vec_transformer_finetune/path.sh
| New file |
| | |
| | | # network architecture |
| | | # encoder related |
| | | encoder: data2vec_encoder |
| | | encoder_conf: |
| | | extractor_mode: layer_norm |
| | | encoder_layerdrop: 0.05 |
| | | dropout_input: 0.0 |
| | | dropout_features: 0.0 |
| | | feature_grad_mult: 1.0 |
| | | encoder_embed_dim: 768 |
| | | |
| | | mask_prob: 0.65 |
| | | mask_length: 10 |
| | | |
| | | loss_beta: 0 |
| | | loss_scale: null |
| | | |
| | | instance_norm_target_layer: true |
| | | average_top_k_layers: 8 |
| | | |
| | | pos_conv_depth: 5 |
| | | conv_pos: 95 |
| | | |
| | | ema_decay: 0.999 |
| | | ema_end_decay: 0.9999 |
| | | ema_anneal_end_step: 30000 |
| | | ema_transformer_only: true |
| | | ema_layers_only: true |
| | | |
| | | require_same_masks: true |
| | | mask_dropout: 0 |
| | | |
| | | log_interval: 50 |
| | | normalize: None |
| | | |
| | | # minibatch related |
| | | batch_type: length |
| | | batch_bins: 64000 |
| | | num_workers: 16 |
| | | |
| | | # optimization related |
| | | accum_grad: 1 |
| | | grad_clip: 5 |
| | | patience: none |
| | | max_epoch: 600 |
| | | val_scheduler_criterion: |
| | | - valid |
| | | - acc |
| | | best_model_criterion: |
| | | - - valid |
| | | - loss |
| | | - min |
| | | keep_nbest_models: 50 |
| | | unused_parameters: true |
| | | |
| | | optim: fairseq_adam |
| | | optim_conf: |
| | | lr: 0.0005 |
| | | adam_betas: [0.9,0.98] |
| | | adam_eps: 1.0e-06 |
| | | weight_decay: 0.01 |
| | | |
| | | scheduler: tri_stage |
| | | scheduler_conf: |
| | | phase_ratio: [0.03,0.9,0.07] |
| | | |
| | | # for dataset |
| | | dataset_conf: |
| | | batch_mode: clipping |
| | | data_names: speech,none |
| | | data_types: kaldi_ark,none |
| | | shuffle: true |
| | | shuffle_conf: |
| | | shuffle_size: 12800 |
| | | sort_size: 12800 |
| | | batch_conf: |
| | | batch_type: token |
| | | batch_size: 64000 |
| | | num_workers: 8 |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | # Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) |
| | | # 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) |
| | | # Apache 2.0 |
| | | |
| | | # transform raw AISHELL-2 data to kaldi format |
| | | |
| | | . ./path.sh || exit 1; |
| | | |
| | | tmp= |
| | | dir= |
| | | |
| | | if [ $# != 3 ]; then |
| | | echo "Usage: $0 <corpus-data-dir> <tmp-dir> <output-dir>" |
| | | echo " $0 /export/AISHELL-2/iOS/train data/local/train data/train" |
| | | exit 1; |
| | | fi |
| | | |
| | | corpus=$1 |
| | | tmp=$2 |
| | | dir=$3 |
| | | |
| | | echo "prepare_data.sh: Preparing data in $corpus" |
| | | |
| | | mkdir -p $tmp |
| | | mkdir -p $dir |
| | | |
| | | # corpus check |
| | | if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then |
| | | echo "Error: $0 requires wav.scp and trans.txt under $corpus directory." |
| | | exit 1; |
| | | fi |
| | | |
| | | # validate utt-key list, IC0803W0380 is a bad utterance |
| | | awk '{print $1}' $corpus/wav.scp | grep -v 'IC0803W0380' > $tmp/wav_utt.list |
| | | awk '{print $1}' $corpus/trans.txt > $tmp/trans_utt.list |
| | | tools/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list |
| | | |
| | | # wav.scp |
| | | awk -F'\t' -v path_prefix=$corpus '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp |
| | | tools/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp |
| | | |
| | | # text |
| | | tools/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/text |
| | | |
| | | # copy prepared resources from tmp_dir to target dir |
| | | mkdir -p $dir |
| | | for f in wav.scp text; do |
| | | cp $tmp/$f $dir/$f || exit 1; |
| | | done |
| | | |
| | | echo "local/prepare_data.sh succeeded" |
| | | exit 0; |
| New file |
| | |
| | | export FUNASR_DIR=$PWD/../../.. |
| | | |
| | | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C |
| | | export PYTHONIOENCODING=UTF-8 |
| | | export PYTHONPATH=../../../:$PYTHONPATH |
| | | export PATH=$FUNASR_DIR/funasr/bin:$PATH |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | . ./path.sh || exit 1; |
| | | |
| | | # machines configuration |
| | | CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" |
| | | gpu_num=8 |
| | | count=1 |
| | | |
| | | train_cmd=tools/run.pl |
| | | |
| | | # general configuration |
| | | feats_dir="../DATA" #feature output dictionary |
| | | exp_dir="." |
| | | lang=zh |
| | | dumpdir=dump/fbank |
| | | feats_type=fbank |
| | | token_type=char |
| | | dataset_type=large |
| | | stage=0 |
| | | stop_stage=4 |
| | | |
| | | # feature configuration |
| | | feats_dim=80 |
| | | sample_frequency=16000 |
| | | nj=100 |
| | | speed_perturb="0.9,1.0,1.1" |
| | | |
| | | # data |
| | | tr_dir= |
| | | dev_tst_dir= |
| | | |
| | | # exp tag |
| | | tag="exp1" |
| | | |
| | | . utils/parse_options.sh || exit 1; |
| | | |
| | | # Set bash to 'debug' mode, it will exit on : |
| | | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', |
| | | set -e |
| | | set -u |
| | | set -o pipefail |
| | | |
| | | train_set=train |
| | | valid_set=dev_ios |
| | | |
| | | asr_config=conf/train_pretrain_transformer.yaml |
| | | model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" |
| | | |
| | | if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then |
| | | echo "stage 0: Data preparation" |
| | | # For training set |
| | | local/prepare_data.sh ${tr_dir} ${feats_dir}/data/local/train ${feats_dir}/data/train || exit 1; |
| | | # # For dev and test set |
| | | for x in Android iOS Mic; do |
| | | local/prepare_data.sh ${dev_tst_dir}/${x}/dev ${feats_dir}/data/local/dev_${x,,} ${feats_dir}/data/dev_${x,,} || exit 1; |
| | | local/prepare_data.sh ${dev_tst_dir}/${x}/test ${feats_dir}/data/local/test_${x,,} ${feats_dir}/data/test_${x,,} || exit 1; |
| | | done |
| | | # Normalize text to capital letters |
| | | for x in train dev_android dev_ios dev_mic test_android test_ios test_mic; do |
| | | mv ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org |
| | | paste -d " " <(cut -f 1 ${feats_dir}/data/${x}/text.org) <(cut -f 2- ${feats_dir}/data/${x}/text.org \ |
| | | | tr 'A-Z' 'a-z' | tr -d " ") \ |
| | | > ${feats_dir}/data/${x}/text |
| | | tools/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org |
| | | mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text |
| | | done |
| | | fi |
| | | |
| | | feat_train_dir=${feats_dir}/${dumpdir}/${train_set}; mkdir -p ${feat_train_dir} |
| | | feat_dev_dir=${feats_dir}/${dumpdir}/${valid_set}; mkdir -p ${feat_dev_dir} |
| | | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then |
| | | echo "stage 1: Feature Generation" |
| | | # compute fbank features |
| | | fbankdir=${feats_dir}/fbank |
| | | steps/compute_fbank.sh --cmd "$train_cmd" --nj $nj --speed_perturb ${speed_perturb} \ |
| | | ${feats_dir}/data/train ${exp_dir}/exp/make_fbank/train ${fbankdir}/train |
| | | tools/fix_data_feat.sh ${fbankdir}/train |
| | | for x in android ios mic; do |
| | | steps/compute_fbank.sh --cmd "$train_cmd" --nj $nj \ |
| | | ${feats_dir}/data/dev_${x} ${exp_dir}/exp/make_fbank/dev_${x} ${fbankdir}/dev_${x} |
| | | tools/fix_data_feat.sh ${fbankdir}/dev_${x} |
| | | steps/compute_fbank.sh --cmd "$train_cmd" --nj $nj \ |
| | | ${feats_dir}/data/test_${x} ${exp_dir}/exp/make_fbank/test_${x} ${fbankdir}/test_${x} |
| | | tools/fix_data_feat.sh ${fbankdir}/test_${x} |
| | | done |
| | | |
| | | # compute global cmvn |
| | | steps/compute_cmvn.sh --cmd "$train_cmd" --nj $nj \ |
| | | ${fbankdir}/train ${exp_dir}/exp/make_fbank/train |
| | | |
| | | # apply cmvn |
| | | steps/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \ |
| | | ${fbankdir}/${train_set} ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/${train_set} ${feat_train_dir} |
| | | steps/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \ |
| | | ${fbankdir}/${valid_set} ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/${valid_set} ${feat_dev_dir} |
| | | for x in android ios mic; do |
| | | steps/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \ |
| | | ${fbankdir}/test_${x} ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/test_${x} ${feats_dir}/${dumpdir}/test_${x} |
| | | done |
| | | |
| | | cp ${fbankdir}/${train_set}/text ${fbankdir}/${train_set}/speech_shape ${fbankdir}/${train_set}/text_shape ${feat_train_dir} |
| | | tools/fix_data_feat.sh ${feat_train_dir} |
| | | cp ${fbankdir}/${valid_set}/text ${fbankdir}/${valid_set}/speech_shape ${fbankdir}/${valid_set}/text_shape ${feat_dev_dir} |
| | | tools/fix_data_feat.sh ${feat_dev_dir} |
| | | for x in android ios mic; do |
| | | cp ${fbankdir}/test_${x}/text ${fbankdir}/test_${x}/speech_shape ${fbankdir}/test_${x}/text_shape ${feats_dir}/${dumpdir}/test_${x} |
| | | tools/fix_data_feat.sh ${feats_dir}/${dumpdir}/test_${x} |
| | | done |
| | | fi |
| | | |
| | | token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt |
| | | echo "dictionary: ${token_list}" |
| | | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then |
| | | echo "stage 2: Dictionary Preparation" |
| | | mkdir -p ${feats_dir}/data/${lang}_token_list/char/ |
| | | |
| | | echo "make a dictionary" |
| | | echo "<blank>" > ${token_list} |
| | | echo "<s>" >> ${token_list} |
| | | echo "</s>" >> ${token_list} |
| | | tools/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/${train_set}/text | cut -f 2- -d" " | tr " " "\n" \ |
| | | | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list} |
| | | num_token=$(cat ${token_list} | wc -l) |
| | | echo "<unk>" >> ${token_list} |
| | | vocab_size=$(cat ${token_list} | wc -l) |
| | | awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char |
| | | awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char |
| | | mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/${train_set} |
| | | mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/${valid_set} |
| | | cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/${train_set} |
| | | cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/${valid_set} |
| | | fi |
| | | |
| | | # Training Stage |
| | | world_size=$gpu_num # run on one machine |
| | | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then |
| | | echo "stage 3: Training" |
| | | mkdir -p ${exp_dir}/exp/${model_dir} |
| | | mkdir -p ${exp_dir}/exp/${model_dir}/log |
| | | INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init |
| | | if [ -f $INIT_FILE ];then |
| | | rm -f $INIT_FILE |
| | | fi |
| | | init_method=file://$(readlink -f $INIT_FILE) |
| | | echo "$0: init method is $init_method" |
| | | for ((i = 0; i < $gpu_num; ++i)); do |
| | | { |
| | | rank=$i |
| | | local_rank=$i |
| | | gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) |
| | | data2vec_train.py \ |
| | | --gpu_id $gpu_id \ |
| | | --use_preprocessor true \ |
| | | --dataset_type $dataset_type \ |
| | | --train_data_file $feats_dir/$dumpdir/${train_set}/data.list \ |
| | | --valid_data_file $feats_dir/$dumpdir/${valid_set}/data.list \ |
| | | --resume true \ |
| | | --output_dir ${exp_dir}/exp/${model_dir} \ |
| | | --config $asr_config \ |
| | | --input_size $feats_dim \ |
| | | --ngpu $gpu_num \ |
| | | --num_worker_count $count \ |
| | | --multiprocessing_distributed true \ |
| | | --dist_init_method $init_method \ |
| | | --dist_world_size $world_size \ |
| | | --dist_rank $rank \ |
| | | --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1 |
| | | } & |
| | | done |
| | | wait |
| | | fi |
| New file |
| | |
| | | ../../aishell/transformer/utils |
| | |
| | | self.nbest = nbest |
| | | self.frontend = frontend |
| | | self.encoder_downsampling_factor = 1 |
| | | if asr_train_args.encoder_conf["input_layer"] == "conv2d": |
| | | if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d": |
| | | self.encoder_downsampling_factor = 4 |
| | | |
| | | @torch.no_grad() |
| New file |
| | |
| | | #!/usr/bin/env python3 |
| | | |
| | | import os |
| | | |
| | | from funasr.tasks.data2vec import Data2VecTask |
| | | |
| | | |
| | | def parse_args(): |
| | | parser = Data2VecTask.get_parser() |
| | | parser.add_argument( |
| | | "--gpu_id", |
| | | type=int, |
| | | default=0, |
| | | help="local gpu id.", |
| | | ) |
| | | args = parser.parse_args() |
| | | return args |
| | | |
| | | |
| | | def main(args=None, cmd=None): |
| | | # for data2vec Training |
| | | Data2VecTask.main(args=args, cmd=cmd) |
| | | |
| | | |
| | | if __name__ == '__main__': |
| | | args = parse_args() |
| | | |
| | | # setup local gpu_id |
| | | os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id) |
| | | |
| | | # DDP settings |
| | | if args.ngpu > 1: |
| | | args.distributed = True |
| | | else: |
| | | args.distributed = False |
| | | assert args.num_worker_count == 1 |
| | | |
| | | # re-compute batch size: when dataset type is small |
| | | if args.dataset_type == "small": |
| | | if args.batch_size is not None: |
| | | args.batch_size = args.batch_size * args.ngpu |
| | | if args.batch_bins is not None: |
| | | args.batch_bins = args.batch_bins * args.ngpu |
| | | |
| | | main(args=args) |
| | |
| | | |
| | | output = (uttids, output) |
| | | assert check_return_type(output) |
| | | return output |
| | | |
| | | def crop_to_max_size(feature, target_size): |
| | | size = len(feature) |
| | | diff = size - target_size |
| | | if diff <= 0: |
| | | return feature |
| | | |
| | | start = np.random.randint(0, diff + 1) |
| | | end = size - diff + start |
| | | return feature[start:end] |
| | | |
| | | |
| | | def clipping_collate_fn( |
| | | data: Collection[Tuple[str, Dict[str, np.ndarray]]], |
| | | max_sample_size=None, |
| | | not_sequence: Collection[str] = (), |
| | | ) -> Tuple[List[str], Dict[str, torch.Tensor]]: |
| | | # mainly for pre-training |
| | | assert check_argument_types() |
| | | uttids = [u for u, _ in data] |
| | | data = [d for _, d in data] |
| | | |
| | | assert all(set(data[0]) == set(d) for d in data), "dict-keys mismatching" |
| | | assert all( |
| | | not k.endswith("_lengths") for k in data[0] |
| | | ), f"*_lengths is reserved: {list(data[0])}" |
| | | |
| | | output = {} |
| | | for key in data[0]: |
| | | array_list = [d[key] for d in data] |
| | | tensor_list = [torch.from_numpy(a) for a in array_list] |
| | | sizes = [len(s) for s in tensor_list] |
| | | if max_sample_size is None: |
| | | target_size = min(sizes) |
| | | else: |
| | | target_size = min(min(sizes), max_sample_size) |
| | | tensor = tensor_list[0].new_zeros(len(tensor_list), target_size, tensor_list[0].shape[1]) |
| | | for i, (source, size) in enumerate(zip(tensor_list, sizes)): |
| | | diff = size - target_size |
| | | if diff == 0: |
| | | tensor[i] = source |
| | | else: |
| | | tensor[i] = crop_to_max_size(source, target_size) |
| | | output[key] = tensor |
| | | |
| | | if key not in not_sequence: |
| | | lens = torch.tensor([source.shape[0] for source in tensor], dtype=torch.long) |
| | | output[key + "_lengths"] = lens |
| | | |
| | | output = (uttids, output) |
| | | assert check_return_type(output) |
| | | return output |
| | |
| | | |
| | | class ArkDataLoader(AbsIterFactory): |
| | | def __init__(self, data_list, dict_file, dataset_conf, seg_dict_file=None, mode="train"): |
| | | symbol_table = read_symbol_table(dict_file) |
| | | symbol_table = read_symbol_table(dict_file) if dict_file is not None else None |
| | | if seg_dict_file is not None: |
| | | seg_dict = load_seg_dict(seg_dict_file) |
| | | else: |
| | | seg_dict = None |
| | | self.dataset_conf = dataset_conf |
| | | logging.info("dataloader config: {}".format(self.dataset_conf)) |
| | | batch_mode = self.dataset_conf.get("batch_mode", "padding") |
| | | self.dataset = Dataset(data_list, symbol_table, seg_dict, |
| | | self.dataset_conf, mode=mode) |
| | | self.dataset_conf, mode=mode, batch_mode=batch_mode) |
| | | |
| | | def build_iter(self, epoch, shuffle=True): |
| | | self.dataset.set_epoch(epoch) |
| | |
| | | batch_size=8000, |
| | | len_fn=_default_len_fn, |
| | | buffer_size=10240, |
| | | sort_size=500 |
| | | sort_size=500, |
| | | batch_mode="padding", |
| | | ): |
| | | assert batch_size > 0, "Batch size is required to be larger than 0!" |
| | | assert buffer_size >= -1, "Buffer size is required to be larger than -1!" |
| | |
| | | self.batch_size = batch_size |
| | | self.buffer_size = buffer_size |
| | | self.sort_size = sort_size |
| | | self.batch_mode = batch_mode |
| | | |
| | | def set_epoch(self, epoch): |
| | | self.epoch = epoch |
| | |
| | | batch = [] |
| | | bucket = [] |
| | | max_lengths = 0 |
| | | min_lengths = 999999 |
| | | batch_lengths = 0 |
| | | |
| | | if self.buffer_size == -1: |
| | | for d in self.datapipe: |
| | | if d[0] > self.batch_size: |
| | | continue |
| | | buffer.append(d) |
| | | buffer.sort() |
| | | for sample in buffer: |
| | | length, _, token = sample |
| | | if length > max_lengths: |
| | | max_lengths = length |
| | | batch_lengths = max_lengths * (len(batch) + 1) |
| | | if batch_lengths > self.batch_size: |
| | | bucket.append(batch) |
| | | batch = [] |
| | | max_lengths = length |
| | | batch.append(token) |
| | | random.shuffle(bucket) |
| | | if bucket: |
| | | for batch_sample in bucket: |
| | | yield batch_sample |
| | | if batch: |
| | | yield batch |
| | | |
| | | elif self.buffer_size == 0: |
| | | for d in self.datapipe: |
| | | if d[0] > self.batch_size: |
| | | continue |
| | | length, _, token = d |
| | | if length > self.batch_size: |
| | | continue |
| | | if length > max_lengths: |
| | | max_lengths = length |
| | | batch_lengths = max_lengths * (len(batch) + 1) |
| | | if batch_lengths > self.batch_size: |
| | | yield batch |
| | | batch = [] |
| | | max_lengths = length |
| | | batch.append(token) |
| | | if batch: |
| | | yield batch |
| | | |
| | | else: |
| | | if self.batch_mode == "clipping": |
| | | assert self.buffer_size > 0, "for clipping batch_mode, buffer_size must be > 1" |
| | | for d in self.datapipe: |
| | | if d[0] > self.batch_size: |
| | | continue |
| | | buffer.append(d) |
| | | if len(buffer) == self.buffer_size: |
| | | random.shuffle(buffer) |
| | | for sample in buffer: |
| | | bucket.append(sample) |
| | | if len(bucket) == self.sort_size: |
| | | bucket.sort() |
| | | for x in bucket: |
| | | length, _, token = x |
| | | if length < min_lengths: |
| | | min_lengths = length |
| | | batch_lengths = min_lengths * (len(batch) + 1) |
| | | if batch_lengths > self.batch_size: |
| | | yield batch |
| | | batch = [] |
| | | min_lengths = length |
| | | batch.append(token) |
| | | bucket = [] |
| | | buffer = [] |
| | | |
| | | if buffer: |
| | | random.shuffle(buffer) |
| | | for sample in buffer: |
| | | bucket.append(sample) |
| | | if len(bucket) == self.sort_size: |
| | | bucket.sort() |
| | | for x in bucket: |
| | | length, _, token = x |
| | | if length < min_lengths: |
| | | min_lengths = length |
| | | batch_lengths = min_lengths * (len(batch) + 1) |
| | | if batch_lengths > self.batch_size: |
| | | yield batch |
| | | batch = [] |
| | | min_lengths = length |
| | | batch.append(token) |
| | | bucket = [] |
| | | buffer = [] |
| | | |
| | | if bucket: |
| | | bucket.sort() |
| | | for x in bucket: |
| | | length, _, token = x |
| | | if length < min_lengths: |
| | | min_lengths = length |
| | | batch_lengths = min_lengths * (len(batch) + 1) |
| | | if batch_lengths > self.batch_size: |
| | | yield batch |
| | | batch = [] |
| | | min_lengths = length |
| | | batch.append(token) |
| | | bucket = [] |
| | | |
| | | if batch: |
| | | yield batch |
| | | |
| | | else: |
| | | if self.buffer_size == -1: |
| | | for d in self.datapipe: |
| | | if d[0] > self.batch_size: |
| | | continue |
| | | buffer.append(d) |
| | | buffer.sort() |
| | | for sample in buffer: |
| | | length, _, token = sample |
| | | if length > max_lengths: |
| | | max_lengths = length |
| | | batch_lengths = max_lengths * (len(batch) + 1) |
| | | if batch_lengths > self.batch_size: |
| | | bucket.append(batch) |
| | | batch = [] |
| | | max_lengths = length |
| | | batch.append(token) |
| | | random.shuffle(bucket) |
| | | if bucket: |
| | | for batch_sample in bucket: |
| | | yield batch_sample |
| | | if batch: |
| | | yield batch |
| | | |
| | | elif self.buffer_size == 0: |
| | | for d in self.datapipe: |
| | | if d[0] > self.batch_size: |
| | | continue |
| | | length, _, token = d |
| | | if length > self.batch_size: |
| | | continue |
| | | if length > max_lengths: |
| | | max_lengths = length |
| | | batch_lengths = max_lengths * (len(batch) + 1) |
| | | if batch_lengths > self.batch_size: |
| | | yield batch |
| | | batch = [] |
| | | max_lengths = length |
| | | batch.append(token) |
| | | if batch: |
| | | yield batch |
| | | |
| | | else: |
| | | for d in self.datapipe: |
| | | if d[0] > self.batch_size: |
| | | continue |
| | | buffer.append(d) |
| | | if len(buffer) == self.buffer_size: |
| | | random.shuffle(buffer) |
| | | for sample in buffer: |
| | | bucket.append(sample) |
| | | if len(bucket) == self.sort_size: |
| | | bucket.sort() |
| | | for x in bucket: |
| | | length, _, token = x |
| | | if length > max_lengths: |
| | | max_lengths = length |
| | | batch_lengths = max_lengths * (len(batch) + 1) |
| | | if batch_lengths > self.batch_size: |
| | | yield batch |
| | | batch = [] |
| | | max_lengths = length |
| | | batch.append(token) |
| | | bucket = [] |
| | | buffer = [] |
| | | |
| | | if buffer: |
| | | random.shuffle(buffer) |
| | | for sample in buffer: |
| | | bucket.append(sample) |
| | |
| | | bucket = [] |
| | | buffer = [] |
| | | |
| | | if buffer: |
| | | random.shuffle(buffer) |
| | | for sample in buffer: |
| | | bucket.append(sample) |
| | | if len(bucket) == self.sort_size: |
| | | bucket.sort() |
| | | for x in bucket: |
| | | length, _, token = x |
| | | if length > max_lengths: |
| | | max_lengths = length |
| | | batch_lengths = max_lengths * (len(batch) + 1) |
| | | if batch_lengths > self.batch_size: |
| | | yield batch |
| | | batch = [] |
| | | max_lengths = length |
| | | batch.append(token) |
| | | bucket = [] |
| | | buffer = [] |
| | | if bucket: |
| | | bucket.sort() |
| | | for x in bucket: |
| | | length, _, token = x |
| | | if length > max_lengths: |
| | | max_lengths = length |
| | | batch_lengths = max_lengths * (len(batch) + 1) |
| | | if batch_lengths > self.batch_size: |
| | | yield batch |
| | | batch = [] |
| | | max_lengths = length |
| | | batch.append(token) |
| | | bucket = [] |
| | | |
| | | if bucket: |
| | | bucket.sort() |
| | | for x in bucket: |
| | | length, _, token = x |
| | | if length > max_lengths: |
| | | max_lengths = length |
| | | batch_lengths = max_lengths * (len(batch) + 1) |
| | | if batch_lengths > self.batch_size: |
| | | yield batch |
| | | batch = [] |
| | | max_lengths = length |
| | | batch.append(token) |
| | | bucket = [] |
| | | |
| | | if batch: |
| | | yield batch |
| | | if batch: |
| | | yield batch |
| | |
| | | from funasr.datasets.large_datasets.datapipes.map import MapperIterDataPipe |
| | | from funasr.datasets.large_datasets.utils.filter import filter |
| | | from funasr.datasets.large_datasets.utils.padding import padding |
| | | from funasr.datasets.large_datasets.utils.clipping import clipping |
| | | from funasr.datasets.large_datasets.utils.tokenize import tokenize |
| | | |
| | | |
| | |
| | | elif data_type == "text" or data_type == "sound": |
| | | text_reader = open(data_file, "r") |
| | | reader_list.append(text_reader) |
| | | elif data_type == "none": |
| | | continue |
| | | else: |
| | | raise TypeError("Data type {} is not supported".format(data_type)) |
| | | |
| | |
| | | dict, |
| | | seg_dict, |
| | | conf, |
| | | mode="train"): |
| | | mode="train", |
| | | batch_mode="padding"): |
| | | scp_lists = read_lists(data_list_file) |
| | | shuffle = conf.get('shuffle', True) |
| | | data_names = conf.get("data_names", "speech,text") |
| | |
| | | filter_fn = partial(filter, **filter_conf) |
| | | dataset = FilterIterDataPipe(dataset, fn=filter_fn) |
| | | |
| | | vocab = {'vocab': dict, 'seg_dict': seg_dict} |
| | | tokenize_fn = partial(tokenize, **vocab) |
| | | dataset = MapperIterDataPipe(dataset, fn=tokenize_fn) |
| | | if "text" in data_names: |
| | | vocab = {'vocab': dict, 'seg_dict': seg_dict} |
| | | tokenize_fn = partial(tokenize, **vocab) |
| | | dataset = MapperIterDataPipe(dataset, fn=tokenize_fn) |
| | | |
| | | if shuffle: |
| | | buffer_conf = conf.get('shuffle_conf', {}) |
| | |
| | | batch_size=batch_size, |
| | | len_fn=len_fn, |
| | | buffer_size=buffer_size, |
| | | sort_size=sort_size) |
| | | sort_size=sort_size, |
| | | batch_mode=batch_mode) |
| | | |
| | | dataset = MapperIterDataPipe(dataset, fn=padding) |
| | | dataset = MapperIterDataPipe(dataset, fn=padding if batch_mode == "padding" else clipping) |
| | | |
| | | return dataset |
| New file |
| | |
| | | import numpy as np |
| | | import torch |
| | | |
| | | from funasr.datasets.collate_fn import crop_to_max_size |
| | | |
| | | |
| | | def clipping(data): |
| | | assert isinstance(data, list) |
| | | assert "key" in data[0] |
| | | |
| | | keys = [x["key"] for x in data] |
| | | |
| | | batch = {} |
| | | data_names = data[0].keys() |
| | | for data_name in data_names: |
| | | if data_name == "key": |
| | | continue |
| | | else: |
| | | if data[0][data_name].dtype.kind == "i": |
| | | tensor_type = torch.int64 |
| | | else: |
| | | tensor_type = torch.float32 |
| | | |
| | | tensor_list = [torch.tensor(np.copy(d[data_name]), dtype=tensor_type) for d in data] |
| | | tensor_lengths = torch.tensor([len(d[data_name]) for d in data], dtype=torch.int32) |
| | | |
| | | length_clip = min(tensor_lengths) |
| | | tensor_clip = tensor_list[0].new_zeros(len(tensor_list), length_clip, tensor_list[0].shape[1]) |
| | | for i, (tensor, length) in enumerate(zip(tensor_list, tensor_lengths)): |
| | | diff = length - length_clip |
| | | assert diff >= 0 |
| | | if diff == 0: |
| | | tensor_clip[i] = tensor |
| | | else: |
| | | tensor_clip[i] = crop_to_max_size(tensor, length_clip) |
| | | |
| | | batch[data_name] = tensor_clip |
| | | batch[data_name + "_lengths"] = torch.tensor([tensor.shape[0] for tensor in tensor_clip], dtype=torch.long) |
| | | |
| | | return keys, batch |
| | |
| | | speech_length_max=15000, |
| | | token_length_min=0, |
| | | token_length_max=200): |
| | | assert "speech" in data |
| | | assert "text" in data |
| | | assert "speech" in data or "text" in data |
| | | |
| | | if "sampling_rate" in data: |
| | | speech_length = (data["speech"].shape[0] / data["sampling_rate"]) * 1000. |
| | | if "speech" in data and "text" in data: |
| | | if "sampling_rate" in data: |
| | | speech_length = (data["speech"].shape[0] / data["sampling_rate"]) * 1000. |
| | | else: |
| | | speech_length = data["speech"].shape[0] |
| | | num_tokens = len(data['text']) |
| | | return speech_length_min < speech_length < speech_length_max and token_length_min < num_tokens < token_length_max |
| | | elif "speech" in data: |
| | | if "sampling_rate" in data: |
| | | speech_length = (data["speech"].shape[0] / data["sampling_rate"]) * 1000. |
| | | else: |
| | | speech_length = data["speech"].shape[0] |
| | | return speech_length_min < speech_length < speech_length_max |
| | | else: |
| | | speech_length = data["speech"].shape[0] |
| | | num_tokens = len(data['text']) |
| | | |
| | | return speech_length_min < speech_length < speech_length_max and token_length_min < num_tokens < token_length_max |
| | | num_tokens = len(data['text']) |
| | | return token_length_min < num_tokens < token_length_max |
| New file |
| | |
| | | # Copyright (c) Facebook, Inc. and its affiliates. |
| | | # |
| | | # This source code is licensed under the MIT license found in the |
| | | # LICENSE file in the root directory of this source tree. |
| | | |
| | | from contextlib import contextmanager |
| | | from distutils.version import LooseVersion |
| | | from typing import Dict |
| | | from typing import Optional |
| | | from typing import Tuple |
| | | |
| | | import torch |
| | | from typeguard import check_argument_types |
| | | |
| | | from funasr.layers.abs_normalize import AbsNormalize |
| | | from funasr.models.encoder.abs_encoder import AbsEncoder |
| | | from funasr.models.frontend.abs_frontend import AbsFrontend |
| | | from funasr.models.preencoder.abs_preencoder import AbsPreEncoder |
| | | from funasr.models.specaug.abs_specaug import AbsSpecAug |
| | | from funasr.torch_utils.device_funcs import force_gatherable |
| | | from funasr.train.abs_espnet_model import AbsESPnetModel |
| | | |
| | | if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): |
| | | from torch.cuda.amp import autocast |
| | | else: |
| | | # Nothing to do if torch<1.6.0 |
| | | @contextmanager |
| | | def autocast(enabled=True): |
| | | yield |
| | | |
| | | |
| | | class Data2VecPretrainModel(AbsESPnetModel): |
| | | """Data2Vec Pretrain model""" |
| | | |
| | | def __init__( |
| | | self, |
| | | frontend: Optional[AbsFrontend], |
| | | specaug: Optional[AbsSpecAug], |
| | | normalize: Optional[AbsNormalize], |
| | | preencoder: Optional[AbsPreEncoder], |
| | | encoder: AbsEncoder, |
| | | ): |
| | | assert check_argument_types() |
| | | |
| | | super().__init__() |
| | | |
| | | self.frontend = frontend |
| | | self.specaug = specaug |
| | | self.normalize = normalize |
| | | self.preencoder = preencoder |
| | | self.encoder = encoder |
| | | self.num_updates = 0 |
| | | |
| | | def forward( |
| | | self, |
| | | speech: torch.Tensor, |
| | | speech_lengths: torch.Tensor, |
| | | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: |
| | | """Frontend + Encoder + Calc loss |
| | | |
| | | Args: |
| | | speech: (Batch, Length, ...) |
| | | speech_lengths: (Batch, ) |
| | | """ |
| | | # Check that batch_size is unified |
| | | assert ( |
| | | speech.shape[0] |
| | | == speech_lengths.shape[0] |
| | | ), (speech.shape, speech_lengths.shape) |
| | | |
| | | self.encoder.set_num_updates(self.num_updates) |
| | | |
| | | # 1. Encoder |
| | | encoder_out = self.encode(speech, speech_lengths) |
| | | |
| | | losses = encoder_out["losses"] |
| | | loss = sum(losses.values()) |
| | | sample_size = encoder_out["sample_size"] |
| | | loss = loss.sum() / sample_size |
| | | |
| | | target_var = float(encoder_out["target_var"]) |
| | | pred_var = float(encoder_out["pred_var"]) |
| | | ema_decay = float(encoder_out["ema_decay"]) |
| | | |
| | | stats = dict( |
| | | loss=torch.clone(loss.detach()), |
| | | target_var=target_var, |
| | | pred_var=pred_var, |
| | | ema_decay=ema_decay, |
| | | ) |
| | | |
| | | loss, stats, weight = force_gatherable((loss, stats, sample_size), loss.device) |
| | | return loss, stats, weight |
| | | |
| | | def collect_feats( |
| | | self, |
| | | speech: torch.Tensor, |
| | | speech_lengths: torch.Tensor |
| | | ) -> Dict[str, torch.Tensor]: |
| | | feats, feats_lengths = self._extract_feats(speech, speech_lengths) |
| | | return {"feats": feats, "feats_lengths": feats_lengths} |
| | | |
| | | def encode( |
| | | self, |
| | | speech: torch.Tensor, |
| | | speech_lengths: torch.Tensor, |
| | | ): |
| | | """Frontend + Encoder. |
| | | |
| | | Args: |
| | | speech: (Batch, Length, ...) |
| | | speech_lengths: (Batch, ) |
| | | """ |
| | | with autocast(False): |
| | | # 1. Extract feats |
| | | feats, feats_lengths = self._extract_feats(speech, speech_lengths) |
| | | |
| | | # 2. Data augmentation |
| | | if self.specaug is not None and self.training: |
| | | feats, feats_lengths = self.specaug(feats, feats_lengths) |
| | | |
| | | # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN |
| | | if self.normalize is not None: |
| | | feats, feats_lengths = self.normalize(feats, feats_lengths) |
| | | |
| | | # Pre-encoder, e.g. used for raw input data |
| | | if self.preencoder is not None: |
| | | feats, feats_lengths = self.preencoder(feats, feats_lengths) |
| | | |
| | | # 4. Forward encoder |
| | | if min(speech_lengths) == max(speech_lengths): # for clipping, set speech_lengths as None |
| | | speech_lengths = None |
| | | encoder_out = self.encoder(feats, speech_lengths, mask=True, features_only=False) |
| | | |
| | | return encoder_out |
| | | |
| | | def _extract_feats( |
| | | self, speech: torch.Tensor, speech_lengths: torch.Tensor |
| | | ) -> Tuple[torch.Tensor, torch.Tensor]: |
| | | assert speech_lengths.dim() == 1, speech_lengths.shape |
| | | |
| | | # for data-parallel |
| | | speech = speech[:, : speech_lengths.max()] |
| | | |
| | | if self.frontend is not None: |
| | | # Frontend |
| | | # e.g. STFT and Feature extract |
| | | # data_loader may send time-domain signal in this case |
| | | # speech (Batch, NSamples) -> feats: (Batch, NFrames, Dim) |
| | | feats, feats_lengths = self.frontend(speech, speech_lengths) |
| | | else: |
| | | # No frontend and no feature extract |
| | | feats, feats_lengths = speech, speech_lengths |
| | | return feats, feats_lengths |
| | | |
| | | def set_num_updates(self, num_updates): |
| | | self.num_updates = num_updates |
| | | |
| | | def get_num_updates(self): |
| | | return self.num_updates |
| New file |
| | |
| | | # Copyright (c) Facebook, Inc. and its affiliates. |
| | | # |
| | | # This source code is licensed under the MIT license found in the |
| | | # LICENSE file in the root directory of this source tree. |
| | | |
| | | |
| | | import math |
| | | |
| | | import torch |
| | | import torch.optim |
| | | |
| | | |
| | | class FairseqAdam(torch.optim.Optimizer): |
| | | r"""Implements Adam algorithm. |
| | | |
| | | This implementation is modified from torch.optim.Adam based on: |
| | | `Fixed Weight Decay Regularization in Adam` |
| | | (see https://arxiv.org/abs/1711.05101) |
| | | |
| | | It has been proposed in `Adam: A Method for Stochastic Optimization`_. |
| | | |
| | | Args: |
| | | params (iterable): iterable of parameters to optimize or dicts defining |
| | | parameter groups |
| | | lr (float, optional): learning rate (default: 1e-3) |
| | | betas (Tuple[float, float], optional): coefficients used for computing |
| | | running averages of gradient and its square (default: (0.9, 0.999)) |
| | | eps (float, optional): term added to the denominator to improve |
| | | numerical stability (default: 1e-8) |
| | | weight_decay (float, optional): weight decay (L2 penalty) (default: 0) |
| | | amsgrad (boolean, optional): whether to use the AMSGrad variant of this |
| | | algorithm from the paper `On the Convergence of Adam and Beyond`_ |
| | | |
| | | .. _Adam\: A Method for Stochastic Optimization: |
| | | https://arxiv.org/abs/1412.6980 |
| | | .. _On the Convergence of Adam and Beyond: |
| | | https://openreview.net/forum?id=ryQu7f-RZ |
| | | """ |
| | | |
| | | def __init__( |
| | | self, |
| | | params, |
| | | lr=1e-3, |
| | | adam_betas=(0.9, 0.999), |
| | | adam_eps=1e-8, |
| | | weight_decay=0, |
| | | amsgrad=False, |
| | | ): |
| | | defaults = dict( |
| | | lr=lr, betas=adam_betas, eps=adam_eps, weight_decay=weight_decay, amsgrad=amsgrad |
| | | ) |
| | | super(FairseqAdam, self).__init__(params, defaults) |
| | | self.optimizer_lr = lr |
| | | |
| | | @property |
| | | def supports_memory_efficient_fp16(self): |
| | | return True |
| | | |
| | | @property |
| | | def supports_flat_params(self): |
| | | return True |
| | | |
| | | def step(self, closure=None): |
| | | """Performs a single optimization step. |
| | | |
| | | Args: |
| | | closure (callable, optional): A closure that reevaluates the model |
| | | and returns the loss. |
| | | """ |
| | | loss = None |
| | | if closure is not None: |
| | | loss = closure() |
| | | |
| | | for group in self.param_groups: |
| | | for p in group["params"]: |
| | | if p.grad is None: |
| | | continue |
| | | grad = p.grad.data |
| | | if grad.dtype in {torch.float16, torch.bfloat16}: |
| | | grad = grad.float() |
| | | if grad.is_sparse: |
| | | raise RuntimeError( |
| | | "Adam does not support sparse gradients, please consider SparseAdam instead" |
| | | ) |
| | | amsgrad = group.get("amsgrad", False) |
| | | |
| | | p_data_fp32 = p.data |
| | | if p.data.dtype in {torch.float16, torch.bfloat16}: |
| | | p_data_fp32 = p_data_fp32.float() |
| | | |
| | | state = self.state[p] |
| | | |
| | | # State initialization |
| | | if len(state) == 0: |
| | | state["step"] = 0 |
| | | # Exponential moving average of gradient values |
| | | state["exp_avg"] = torch.zeros_like(p_data_fp32) |
| | | # Exponential moving average of squared gradient values |
| | | state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) |
| | | if amsgrad: |
| | | # Maintains max of all exp. moving avg. of sq. grad. values |
| | | state["max_exp_avg_sq"] = torch.zeros_like(p_data_fp32) |
| | | else: |
| | | state["exp_avg"] = state["exp_avg"].to(p_data_fp32) |
| | | state["exp_avg_sq"] = state["exp_avg_sq"].to(p_data_fp32) |
| | | if amsgrad: |
| | | state["max_exp_avg_sq"] = state["max_exp_avg_sq"].to( |
| | | p_data_fp32 |
| | | ) |
| | | |
| | | exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] |
| | | if amsgrad: |
| | | max_exp_avg_sq = state["max_exp_avg_sq"] |
| | | beta1, beta2 = group["betas"] |
| | | |
| | | state["step"] += 1 |
| | | |
| | | # Decay the first and second moment running average coefficient |
| | | exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) |
| | | exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) |
| | | if amsgrad: |
| | | # Maintains the maximum of all 2nd moment running avg. till now |
| | | torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) |
| | | # Use the max. for normalizing running avg. of gradient |
| | | denom = max_exp_avg_sq.sqrt().add_(group["eps"]) |
| | | else: |
| | | denom = exp_avg_sq.sqrt().add_(group["eps"]) |
| | | |
| | | bias_correction1 = 1 - beta1 ** state["step"] |
| | | bias_correction2 = 1 - beta2 ** state["step"] |
| | | step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1 |
| | | |
| | | if group["weight_decay"] != 0: |
| | | p_data_fp32.add_( |
| | | p_data_fp32, alpha=-group["weight_decay"] * group["lr"] |
| | | ) |
| | | |
| | | p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size) |
| | | |
| | | if p.data.dtype in {torch.float16, torch.bfloat16}: |
| | | p.data.copy_(p_data_fp32) |
| | | |
| | | return loss |
| | | |
| | | def set_lr(self, lr): |
| | | """Set the learning rate.""" |
| | | for param_group in self.param_groups: |
| | | param_group["lr"] = lr |
| New file |
| | |
| | | # Copyright (c) Facebook, Inc. and its affiliates. |
| | | # |
| | | # This source code is licensed under the MIT license found in the |
| | | # LICENSE file in the root directory of this source tree. |
| | | |
| | | import math |
| | | from typing import Optional, List |
| | | |
| | | import torch |
| | | from torch.optim.lr_scheduler import _LRScheduler |
| | | from typeguard import check_argument_types |
| | | |
| | | from funasr.schedulers.abs_scheduler import AbsBatchStepScheduler |
| | | |
| | | |
| | | class TriStageLR(_LRScheduler, AbsBatchStepScheduler): |
| | | def __init__( |
| | | self, |
| | | optimizer: torch.optim.Optimizer, |
| | | last_epoch: int = -1, |
| | | phase_ratio: Optional[List[float]] = None, |
| | | init_lr_scale: float = 0.01, |
| | | final_lr_scale: float = 0.01, |
| | | ): |
| | | assert check_argument_types() |
| | | self.optimizer = optimizer |
| | | self.last_epoch = last_epoch |
| | | self.phase_ratio = phase_ratio |
| | | self.init_lr_scale = init_lr_scale |
| | | self.final_lr_scale = final_lr_scale |
| | | self.optimizer_lr = self.optimizer.defaults["lr"] |
| | | |
| | | def init_tri_stage_scheudler(self, max_update): |
| | | self.max_update = max_update |
| | | self.peak_lr = self.optimizer_lr |
| | | self.init_lr = self.init_lr_scale * self.optimizer_lr |
| | | self.final_lr = self.final_lr_scale * self.optimizer_lr |
| | | |
| | | assert self.max_update > 0 |
| | | assert sum(self.phase_ratio) == 1, "phase ratios must add up to 1" |
| | | assert len(self.phase_ratio) == 3 |
| | | self.warmup_steps = int(self.max_update * self.phase_ratio[0]) |
| | | self.hold_steps = int(self.max_update * self.phase_ratio[1]) |
| | | self.decay_steps = int(self.max_update * self.phase_ratio[2]) |
| | | |
| | | self.warmup_rate = ( |
| | | (self.peak_lr - self.init_lr) / self.warmup_steps |
| | | if self.warmup_steps != 0 |
| | | else 0 |
| | | ) |
| | | self.decay_factor = -math.log(self.final_lr_scale) / self.decay_steps |
| | | |
| | | # initial learning rate |
| | | self.lr = self.init_lr |
| | | |
| | | # __init__() must be invoked before setting field |
| | | # because step() is also invoked in __init__() |
| | | self.set_optimizer_lr(self.lr) |
| | | super().__init__(self.optimizer, self.last_epoch) |
| | | |
| | | def _decide_stage(self, update_step): |
| | | """ |
| | | return stage, and the corresponding steps within the current stage |
| | | """ |
| | | if update_step < self.warmup_steps: |
| | | # warmup state |
| | | return 0, update_step |
| | | |
| | | offset = self.warmup_steps |
| | | |
| | | if update_step < offset + self.hold_steps: |
| | | # hold stage |
| | | return 1, update_step - offset |
| | | |
| | | offset += self.hold_steps |
| | | |
| | | if update_step <= offset + self.decay_steps: |
| | | # decay stage |
| | | return 2, update_step - offset |
| | | |
| | | offset += self.decay_steps |
| | | |
| | | # still here ? constant lr stage |
| | | return 3, update_step - offset |
| | | |
| | | def step_update(self, num_updates): |
| | | """Update the learning rate after each update.""" |
| | | stage, steps_in_stage = self._decide_stage(num_updates) |
| | | if stage == 0: |
| | | self.lr = self.init_lr + self.warmup_rate * steps_in_stage |
| | | elif stage == 1: |
| | | self.lr = self.peak_lr |
| | | elif stage == 2: |
| | | self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) |
| | | elif stage == 3: |
| | | self.lr = self.final_lr |
| | | else: |
| | | raise ValueError("Undefined stage") |
| | | self.set_optimizer_lr(self.lr) |
| | | |
| | | def set_optimizer_lr(self, lr): |
| | | for param_group in self.optimizer.param_groups: |
| | | param_group["lr"] = lr |
| | | |
| | | def get_lr(self): |
| | | step_num = self.last_epoch + 1 |
| | | self.step_update(step_num) |
| | | return [self.lr] |
| | |
| | | from funasr.iterators.multiple_iter_factory import MultipleIterFactory |
| | | from funasr.iterators.sequence_iter_factory import SequenceIterFactory |
| | | from funasr.optimizers.sgd import SGD |
| | | from funasr.optimizers.fairseq_adam import FairseqAdam |
| | | from funasr.samplers.build_batch_sampler import BATCH_TYPES |
| | | from funasr.samplers.build_batch_sampler import build_batch_sampler |
| | | from funasr.samplers.unsorted_batch_sampler import UnsortedBatchSampler |
| | | from funasr.schedulers.noam_lr import NoamLR |
| | | from funasr.schedulers.warmup_lr import WarmupLR |
| | | from funasr.schedulers.tri_stage_scheduler import TriStageLR |
| | | from funasr.torch_utils.load_pretrained_model import load_pretrained_model |
| | | from funasr.torch_utils.model_summary import model_summary |
| | | from funasr.torch_utils.pytorch_version import pytorch_cudnn_version |
| | |
| | | |
| | | optim_classes = dict( |
| | | adam=torch.optim.Adam, |
| | | fairseq_adam=FairseqAdam, |
| | | adamw=torch.optim.AdamW, |
| | | sgd=SGD, |
| | | adadelta=torch.optim.Adadelta, |
| | |
| | | CosineAnnealingLR=torch.optim.lr_scheduler.CosineAnnealingLR, |
| | | noamlr=NoamLR, |
| | | warmuplr=WarmupLR, |
| | | tri_stage=TriStageLR, |
| | | cycliclr=torch.optim.lr_scheduler.CyclicLR, |
| | | onecyclelr=torch.optim.lr_scheduler.OneCycleLR, |
| | | CosineAnnealingWarmRestarts=torch.optim.lr_scheduler.CosineAnnealingWarmRestarts, |
| New file |
| | |
| | | import argparse |
| | | from typing import Callable |
| | | from typing import Collection |
| | | from typing import Dict |
| | | from typing import List |
| | | from typing import Optional |
| | | from typing import Tuple |
| | | |
| | | import numpy as np |
| | | import torch |
| | | from typeguard import check_argument_types |
| | | from typeguard import check_return_type |
| | | |
| | | from funasr.datasets.collate_fn import CommonCollateFn |
| | | from funasr.datasets.preprocessor import CommonPreprocessor |
| | | from funasr.layers.abs_normalize import AbsNormalize |
| | | from funasr.layers.global_mvn import GlobalMVN |
| | | from funasr.layers.utterance_mvn import UtteranceMVN |
| | | from funasr.models.data2vec import Data2VecPretrainModel |
| | | from funasr.models.encoder.abs_encoder import AbsEncoder |
| | | from funasr.models.encoder.data2vec_encoder import Data2VecEncoder |
| | | from funasr.models.frontend.abs_frontend import AbsFrontend |
| | | from funasr.models.frontend.default import DefaultFrontend |
| | | from funasr.models.frontend.windowing import SlidingWindow |
| | | from funasr.models.preencoder.abs_preencoder import AbsPreEncoder |
| | | from funasr.models.preencoder.sinc import LightweightSincConvs |
| | | from funasr.models.specaug.abs_specaug import AbsSpecAug |
| | | from funasr.models.specaug.specaug import SpecAug |
| | | from funasr.tasks.abs_task import AbsTask |
| | | from funasr.text.phoneme_tokenizer import g2p_choices |
| | | from funasr.torch_utils.initialize import initialize |
| | | from funasr.train.class_choices import ClassChoices |
| | | from funasr.train.trainer import Trainer |
| | | from funasr.utils.types import float_or_none |
| | | from funasr.utils.types import int_or_none |
| | | from funasr.utils.types import str2bool |
| | | from funasr.utils.types import str_or_none |
| | | |
| | | frontend_choices = ClassChoices( |
| | | name="frontend", |
| | | classes=dict(default=DefaultFrontend, sliding_window=SlidingWindow), |
| | | type_check=AbsFrontend, |
| | | default="default", |
| | | ) |
| | | specaug_choices = ClassChoices( |
| | | name="specaug", |
| | | classes=dict(specaug=SpecAug), |
| | | type_check=AbsSpecAug, |
| | | default=None, |
| | | optional=True, |
| | | ) |
| | | normalize_choices = ClassChoices( |
| | | "normalize", |
| | | classes=dict( |
| | | global_mvn=GlobalMVN, |
| | | utterance_mvn=UtteranceMVN, |
| | | ), |
| | | type_check=AbsNormalize, |
| | | default=None, |
| | | optional=True, |
| | | ) |
| | | preencoder_choices = ClassChoices( |
| | | name="preencoder", |
| | | classes=dict( |
| | | sinc=LightweightSincConvs, |
| | | ), |
| | | type_check=AbsPreEncoder, |
| | | default=None, |
| | | optional=True, |
| | | ) |
| | | encoder_choices = ClassChoices( |
| | | "encoder", |
| | | classes=dict( |
| | | data2vec_encoder=Data2VecEncoder, |
| | | ), |
| | | type_check=AbsEncoder, |
| | | default="data2vec_encoder", |
| | | ) |
| | | model_choices = ClassChoices( |
| | | "model", |
| | | classes=dict( |
| | | data2vec=Data2VecPretrainModel, |
| | | ), |
| | | default="data2vec", |
| | | ) |
| | | |
| | | |
| | | class Data2VecTask(AbsTask): |
| | | # If you need more than one optimizers, change this value |
| | | num_optimizers: int = 1 |
| | | |
| | | # Add variable objects configurations |
| | | class_choices_list = [ |
| | | # --frontend and --frontend_conf |
| | | frontend_choices, |
| | | # --specaug and --specaug_conf |
| | | specaug_choices, |
| | | # --normalize and --normalize_conf |
| | | normalize_choices, |
| | | # --preencoder and --preencoder_conf |
| | | preencoder_choices, |
| | | # --encoder and --encoder_conf |
| | | encoder_choices, |
| | | # --model and --model_conf |
| | | model_choices, |
| | | ] |
| | | |
| | | # If you need to modify train() or eval() procedures, change Trainer class here |
| | | trainer = Trainer |
| | | |
| | | @classmethod |
| | | def add_task_arguments(cls, parser: argparse.ArgumentParser): |
| | | group = parser.add_argument_group(description="Task related") |
| | | |
| | | # NOTE(kamo): add_arguments(..., required=True) can't be used |
| | | # to provide --print_config mode. Instead of it, do as |
| | | group.add_argument( |
| | | "--token_list", |
| | | type=str_or_none, |
| | | default=None, |
| | | help="A text mapping int-id to token", |
| | | ) |
| | | group.add_argument( |
| | | "--init", |
| | | type=lambda x: str_or_none(x.lower()), |
| | | default=None, |
| | | help="The initialization method", |
| | | choices=[ |
| | | "chainer", |
| | | "xavier_uniform", |
| | | "xavier_normal", |
| | | "kaiming_uniform", |
| | | "kaiming_normal", |
| | | None, |
| | | ], |
| | | ) |
| | | |
| | | group.add_argument( |
| | | "--input_size", |
| | | type=int_or_none, |
| | | default=None, |
| | | help="The number of input dimension of the feature", |
| | | ) |
| | | |
| | | group = parser.add_argument_group(description="Preprocess related") |
| | | group.add_argument( |
| | | "--use_preprocessor", |
| | | type=str2bool, |
| | | default=True, |
| | | help="Apply preprocessing to data or not", |
| | | ) |
| | | group.add_argument( |
| | | "--token_type", |
| | | type=str, |
| | | default=None, |
| | | choices=["bpe", "char", "word", "phn"], |
| | | help="The text will be tokenized " "in the specified level token", |
| | | ) |
| | | |
| | | group.add_argument( |
| | | "--feats_type", |
| | | type=str, |
| | | default='fbank', |
| | | help="feats type, e.g. fbank, wav, ark_wav(needed to be scale normalization)", |
| | | ) |
| | | |
| | | group.add_argument( |
| | | "--bpemodel", |
| | | type=str_or_none, |
| | | default=None, |
| | | help="The model file of sentencepiece", |
| | | ) |
| | | parser.add_argument( |
| | | "--non_linguistic_symbols", |
| | | type=str_or_none, |
| | | help="non_linguistic_symbols file path", |
| | | ) |
| | | parser.add_argument( |
| | | "--cleaner", |
| | | type=str_or_none, |
| | | choices=[None, "tacotron", "jaconv", "vietnamese"], |
| | | default=None, |
| | | help="Apply text cleaning", |
| | | ) |
| | | parser.add_argument( |
| | | "--g2p", |
| | | type=str_or_none, |
| | | choices=g2p_choices, |
| | | default=None, |
| | | help="Specify g2p method if --token_type=phn", |
| | | ) |
| | | parser.add_argument( |
| | | "--speech_volume_normalize", |
| | | type=float_or_none, |
| | | default=None, |
| | | help="Scale the maximum amplitude to the given value.", |
| | | ) |
| | | parser.add_argument( |
| | | "--rir_scp", |
| | | type=str_or_none, |
| | | default=None, |
| | | help="The file path of rir scp file.", |
| | | ) |
| | | parser.add_argument( |
| | | "--rir_apply_prob", |
| | | type=float, |
| | | default=1.0, |
| | | help="THe probability for applying RIR convolution.", |
| | | ) |
| | | parser.add_argument( |
| | | "--noise_scp", |
| | | type=str_or_none, |
| | | default=None, |
| | | help="The file path of noise scp file.", |
| | | ) |
| | | parser.add_argument( |
| | | "--noise_apply_prob", |
| | | type=float, |
| | | default=1.0, |
| | | help="The probability applying Noise adding.", |
| | | ) |
| | | parser.add_argument( |
| | | "--noise_db_range", |
| | | type=str, |
| | | default="13_15", |
| | | help="The range of noise decibel level.", |
| | | ) |
| | | parser.add_argument( |
| | | "--pred_masked_weight", |
| | | type=float, |
| | | default=1.0, |
| | | help="weight for predictive loss for masked frames", |
| | | ) |
| | | parser.add_argument( |
| | | "--pred_nomask_weight", |
| | | type=float, |
| | | default=0.0, |
| | | help="weight for predictive loss for unmasked frames", |
| | | ) |
| | | parser.add_argument( |
| | | "--loss_weights", |
| | | type=float, |
| | | default=0.0, |
| | | help="weights for additional loss terms (not first one)", |
| | | ) |
| | | |
| | | for class_choices in cls.class_choices_list: |
| | | # Append --<name> and --<name>_conf. |
| | | # e.g. --encoder and --encoder_conf |
| | | class_choices.add_arguments(group) |
| | | |
| | | @classmethod |
| | | def build_collate_fn( |
| | | cls, args: argparse.Namespace, train: bool |
| | | ) -> Callable[ |
| | | [Collection[Tuple[str, Dict[str, np.ndarray]]]], |
| | | Tuple[List[str], Dict[str, torch.Tensor]], |
| | | ]: |
| | | assert check_argument_types() |
| | | return CommonCollateFn(clipping=True) |
| | | |
| | | @classmethod |
| | | def build_preprocess_fn( |
| | | cls, args: argparse.Namespace, train: bool |
| | | ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]: |
| | | assert check_argument_types() |
| | | if args.use_preprocessor: |
| | | retval = CommonPreprocessor( |
| | | train=train, |
| | | bpemodel=args.bpemodel, |
| | | non_linguistic_symbols=args.non_linguistic_symbols, |
| | | text_cleaner=args.cleaner, |
| | | g2p_type=args.g2p, |
| | | # NOTE(kamo): Check attribute existence for backward compatibility |
| | | rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None, |
| | | rir_apply_prob=args.rir_apply_prob |
| | | if hasattr(args, "rir_apply_prob") |
| | | else 1.0, |
| | | noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None, |
| | | noise_apply_prob=args.noise_apply_prob |
| | | if hasattr(args, "noise_apply_prob") |
| | | else 1.0, |
| | | noise_db_range=args.noise_db_range |
| | | if hasattr(args, "noise_db_range") |
| | | else "13_15", |
| | | speech_volume_normalize=args.speech_volume_normalize |
| | | if hasattr(args, "rir_scp") |
| | | else None, |
| | | ) |
| | | else: |
| | | retval = None |
| | | assert check_return_type(retval) |
| | | return retval |
| | | |
| | | @classmethod |
| | | def required_data_names( |
| | | cls, train: bool = True, inference: bool = False |
| | | ) -> Tuple[str, ...]: |
| | | # for pre-training |
| | | retval = ("speech",) |
| | | return retval |
| | | |
| | | @classmethod |
| | | def optional_data_names( |
| | | cls, train: bool = True, inference: bool = False |
| | | ) -> Tuple[str, ...]: |
| | | retval = () |
| | | assert check_return_type(retval) |
| | | return retval |
| | | |
| | | @classmethod |
| | | def build_model(cls, args: argparse.Namespace): |
| | | assert check_argument_types() |
| | | |
| | | # 1. frontend |
| | | if args.input_size is None: |
| | | # Extract features in the model |
| | | frontend_class = frontend_choices.get_class(args.frontend) |
| | | frontend = frontend_class(**args.frontend_conf) |
| | | input_size = frontend.output_size() |
| | | else: |
| | | # Give features from data-loader |
| | | args.frontend = None |
| | | args.frontend_conf = {} |
| | | frontend = None |
| | | input_size = args.input_size |
| | | |
| | | # 2. Data augmentation for spectrogram |
| | | if args.specaug is not None: |
| | | specaug_class = specaug_choices.get_class(args.specaug) |
| | | specaug = specaug_class(**args.specaug_conf) |
| | | else: |
| | | specaug = None |
| | | |
| | | # 3. Normalization layer |
| | | if args.normalize is not None: |
| | | normalize_class = normalize_choices.get_class(args.normalize) |
| | | normalize = normalize_class(**args.normalize_conf) |
| | | else: |
| | | normalize = None |
| | | |
| | | # 4. Pre-encoder input block |
| | | # NOTE(kan-bayashi): Use getattr to keep the compatibility |
| | | if getattr(args, "preencoder", None) is not None: |
| | | preencoder_class = preencoder_choices.get_class(args.preencoder) |
| | | preencoder = preencoder_class(**args.preencoder_conf) |
| | | input_size = preencoder.output_size() |
| | | else: |
| | | preencoder = None |
| | | |
| | | # 5. Encoder |
| | | encoder_class = encoder_choices.get_class(args.encoder) |
| | | encoder = encoder_class( |
| | | input_size=input_size, |
| | | **args.encoder_conf, |
| | | ) |
| | | |
| | | # 6. Build model |
| | | try: |
| | | model_class = model_choices.get_class(args.model) |
| | | except AttributeError: |
| | | model_class = model_choices.get_class("data2vec") |
| | | model = model_class( |
| | | frontend=frontend, |
| | | specaug=specaug, |
| | | normalize=normalize, |
| | | preencoder=preencoder, |
| | | encoder=encoder, |
| | | ) |
| | | |
| | | # 7. Initialize |
| | | if args.init is not None: |
| | | initialize(model, args.init) |
| | | |
| | | assert check_return_type(model) |
| | | return model |