Merge branch 'main' into dev_cmz2
| New file |
| | |
| | | import argparse |
| | | import json |
| | | import numpy as np |
| | | |
| | | |
| | | def get_parser(): |
| | |     parser = argparse.ArgumentParser( |
| | |         description="cmvn converter", |
| | |         formatter_class=argparse.ArgumentDefaultsHelpFormatter, |
| | |     ) |
| | |     parser.add_argument( |
| | |         "--cmvn-json", |
| | |         "-c", |
| | |         required=True, |
| | |         type=str, |
| | |         help="cmvn json file", |
| | |     ) |
| | |     parser.add_argument( |
| | |         "--am-mvn", |
| | |         "-a", |
| | |         required=True, |
| | |         type=str, |
| | |         help="am mvn file", |
| | |     ) |
| | |     return parser |
| | | |
| | | |
| | | def main(): |
| | |     parser = get_parser() |
| | |     args = parser.parse_args() |
| | | |
| | |     with open(args.cmvn_json, "r") as fin: |
| | |         cmvn_dict = json.load(fin) |
| | | |
| | |     mean_stats = np.array(cmvn_dict["mean_stats"]) |
| | |     var_stats = np.array(cmvn_dict["var_stats"]) |
| | |     total_frame = np.array(cmvn_dict["total_frames"]) |
| | | |
| | |     # shift is the negative mean; scale is the inverse standard deviation |
| | |     mean = -1.0 * mean_stats / total_frame |
| | |     var = 1.0 / np.sqrt(var_stats / total_frame - mean * mean) |
| | |     dims = mean.shape[0] |
| | |     with open(args.am_mvn, "w") as fout: |
| | |         fout.write("<Nnet>\n") |
| | |         fout.write("<Splice> " + str(dims) + " " + str(dims) + "\n" + "[ 0 ]\n") |
| | |         fout.write("<AddShift> " + str(dims) + " " + str(dims) + "\n") |
| | |         mean_str = str(list(mean)).replace(",", "").replace("[", "[ ").replace("]", " ]") |
| | |         fout.write("<LearnRateCoef> 0 " + mean_str + "\n") |
| | |         fout.write("<Rescale> " + str(dims) + " " + str(dims) + "\n") |
| | |         var_str = str(list(var)).replace(",", "").replace("[", "[ ").replace("]", " ]") |
| | |         fout.write("<LearnRateCoef> 0 " + var_str + "\n") |
| | |         fout.write("</Nnet>\n") |
| | | |
| | | |
| | | if __name__ == "__main__": |
| | |     main() |
| | |
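For context, here is a minimal sketch of driving the converter above end to end. The script name `cmvn2am.py` and the toy statistics are assumptions for illustration, not part of the diff:

```python
import json
import subprocess

# Toy accumulated CMVN statistics for a 2-dim feature (values made up):
# mean_stats/var_stats are per-dim sums of x and x**2 over total_frames frames.
stats = {
    "mean_stats": [200.0, 400.0],
    "var_stats": [2500.0, 8500.0],
    "total_frames": 100,
}
with open("cmvn.json", "w") as f:
    json.dump(stats, f)

# Assumed file name for the converter shown above.
subprocess.run(["python", "cmvn2am.py", "-c", "cmvn.json", "-a", "am.mvn"], check=True)
print(open("am.mvn").read())  # Kaldi-style <AddShift>/<Rescale> nnet text
```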
| | | if out_item['wrong'] > 0: |
| | | rst['wrong_sentences'] += 1 |
| | | cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n') |
| | | cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n') |
| | | cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n') |
| | | cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n') |
| | | cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n') |
| | | |
| | | if rst['Wrd'] > 0: |
| | | rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2) |
| New file |
| | |
| | | beam_size: 10 |
| | | penalty: 0.0 |
| | | maxlenratio: 0.0 |
| | | minlenratio: 0.0 |
| | | ctc_weight: 0.5 |
| | | lm_weight: 0.7 |
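For orientation, these knobs are the usual joint CTC/attention decoding weights: `ctc_weight` interpolates the CTC and attention scores, `lm_weight` scales the external language model, and `penalty` is a per-token insertion bonus. A rough sketch of how such a combined score is typically formed (illustrative only; the real logic lives in the beam search implementation):

```python
def combined_score(att_logp, ctc_logp, lm_logp,
                   ctc_weight=0.5, lm_weight=0.7, penalty=0.0, hyp_len=1):
    """Log-probability of one partial hypothesis under the combined model."""
    am = (1.0 - ctc_weight) * att_logp + ctc_weight * ctc_logp
    return am + lm_weight * lm_logp + penalty * hyp_len

# Two competing hypotheses: the CTC/attention/LM trade-off decides the ranking.
print(combined_score(att_logp=-1.2, ctc_logp=-1.5, lm_logp=-2.0))  # -2.75
print(combined_score(att_logp=-1.0, ctc_logp=-2.5, lm_logp=-1.0))  # -2.45
```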
| New file |
| | |
| | | encoder: conformer |
| | | encoder_conf: |
| | |     output_size: 512 |
| | |     attention_heads: 8 |
| | |     linear_units: 2048 |
| | |     num_blocks: 12 |
| | |     dropout_rate: 0.1 |
| | |     positional_dropout_rate: 0.1 |
| | |     attention_dropout_rate: 0.1 |
| | |     input_layer: conv2d |
| | |     normalize_before: true |
| | |     macaron_style: true |
| | |     rel_pos_type: latest |
| | |     pos_enc_layer_type: rel_pos |
| | |     selfattention_layer_type: rel_selfattn |
| | |     activation_type: swish |
| | |     use_cnn_module: true |
| | |     cnn_module_kernel: 31 |
| | | |
| | | decoder: transformer |
| | | decoder_conf: |
| | |     attention_heads: 8 |
| | |     linear_units: 2048 |
| | |     num_blocks: 6 |
| | |     dropout_rate: 0.1 |
| | |     positional_dropout_rate: 0.1 |
| | |     self_attention_dropout_rate: 0.1 |
| | |     src_attention_dropout_rate: 0.1 |
| | | |
| | | model_conf: |
| | |     ctc_weight: 0.3 |
| | |     lsm_weight: 0.1 |
| | |     length_normalized_loss: false |
| | | |
| | | accum_grad: 2 |
| | | max_epoch: 50 |
| | | patience: none |
| | | init: none |
| | | best_model_criterion: |
| | | -   - valid |
| | |     - acc |
| | |     - max |
| | | keep_nbest_models: 10 |
| | | |
| | | optim: adam |
| | | optim_conf: |
| | |     lr: 0.0025 |
| | |     weight_decay: 0.000001 |
| | | scheduler: warmuplr |
| | | scheduler_conf: |
| | |     warmup_steps: 40000 |
| | | |
| | | specaug: specaug |
| | | specaug_conf: |
| | |     apply_time_warp: true |
| | |     time_warp_window: 5 |
| | |     time_warp_mode: bicubic |
| | |     apply_freq_mask: true |
| | |     freq_mask_width_range: |
| | |     - 0 |
| | |     - 27 |
| | |     num_freq_mask: 2 |
| | |     apply_time_mask: true |
| | |     time_mask_width_ratio_range: |
| | |     - 0. |
| | |     - 0.05 |
| | |     num_time_mask: 10 |
| | | |
| | | dataset_conf: |
| | |     shuffle: true |
| | |     shuffle_conf: |
| | |         shuffle_size: 1024 |
| | |         sort_size: 500 |
| | |     batch_conf: |
| | |         batch_type: token |
| | |         batch_size: 10000 |
| | |     num_workers: 8 |
| | | |
| | | log_interval: 50 |
| | | normalize: none |
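The `normalize: none` at the end is deliberate: in this recipe the fbank features are already normalized offline with global CMVN statistics (`utils/compute_cmvn.sh` / `utils/apply_cmvn.sh` in `run.sh` below), so no normalization layer is needed in the model. A rough numpy sketch of that offline step, assuming accumulated statistics as in the cmvn converter above:

```python
import numpy as np

def apply_global_cmvn(feats, mean_stats, var_stats, total_frames):
    """feats: (T, D) fbank matrix; stats accumulated over the training set."""
    mean = mean_stats / total_frames
    std = np.sqrt(var_stats / total_frames - mean ** 2)
    return (feats - mean) / std

feats = np.random.randn(100, 80).astype(np.float32)  # toy utterance
normed = apply_global_cmvn(feats, np.full(80, 200.0), np.full(80, 2500.0), 100.0)
print(normed.shape)  # (100, 80)
```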
| New file |
| | |
| | | encoder: conformer |
| | | encoder_conf: |
| | |     output_size: 512 |
| | |     attention_heads: 8 |
| | |     linear_units: 2048 |
| | |     num_blocks: 12 |
| | |     dropout_rate: 0.1 |
| | |     positional_dropout_rate: 0.1 |
| | |     attention_dropout_rate: 0.1 |
| | |     input_layer: conv2d |
| | |     normalize_before: true |
| | |     macaron_style: true |
| | |     rel_pos_type: latest |
| | |     pos_enc_layer_type: rel_pos |
| | |     selfattention_layer_type: rel_selfattn |
| | |     activation_type: swish |
| | |     use_cnn_module: true |
| | |     cnn_module_kernel: 31 |
| | | |
| | | decoder: transformer |
| | | decoder_conf: |
| | |     attention_heads: 8 |
| | |     linear_units: 2048 |
| | |     num_blocks: 6 |
| | |     dropout_rate: 0.1 |
| | |     positional_dropout_rate: 0.1 |
| | |     self_attention_dropout_rate: 0.1 |
| | |     src_attention_dropout_rate: 0.1 |
| | | |
| | | model_conf: |
| | |     ctc_weight: 0.3 |
| | |     lsm_weight: 0.1 |
| | |     length_normalized_loss: false |
| | | |
| | | accum_grad: 2 |
| | | max_epoch: 50 |
| | | patience: none |
| | | init: none |
| | | best_model_criterion: |
| | | -   - valid |
| | |     - acc |
| | |     - max |
| | | keep_nbest_models: 10 |
| | | |
| | | optim: adam |
| | | optim_conf: |
| | |     lr: 0.0025 |
| | |     weight_decay: 0.000001 |
| | | scheduler: warmuplr |
| | | scheduler_conf: |
| | |     warmup_steps: 40000 |
| | | |
| | | specaug: specaug |
| | | specaug_conf: |
| | |     apply_time_warp: true |
| | |     time_warp_window: 5 |
| | |     time_warp_mode: bicubic |
| | |     apply_freq_mask: true |
| | |     freq_mask_width_range: |
| | |     - 0 |
| | |     - 27 |
| | |     num_freq_mask: 2 |
| | |     apply_time_mask: true |
| | |     time_mask_width_ratio_range: |
| | |     - 0. |
| | |     - 0.05 |
| | |     num_time_mask: 10 |
| | | |
| | | dataset_conf: |
| | |     shuffle: true |
| | |     shuffle_conf: |
| | |         shuffle_size: 1024 |
| | |         sort_size: 500 |
| | |     batch_conf: |
| | |         batch_type: token |
| | |         batch_size: 10000 |
| | |     num_workers: 8 |
| | | |
| | | log_interval: 50 |
| | | normalize: utterance_mvn |
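This config is identical to the previous one except for its last line: `normalize: utterance_mvn` normalizes each utterance with its own statistics at run time instead of relying on offline global CMVN. A minimal sketch of the idea (mean normalization, optionally variance too):

```python
import numpy as np

def utterance_mvn(feats, norm_vars=False, eps=1e-10):
    """feats: (T, D). Normalize with this utterance's own mean (and variance)."""
    out = feats - feats.mean(axis=0, keepdims=True)
    if norm_vars:
        out = out / np.maximum(out.std(axis=0, keepdims=True), eps)
    return out

print(abs(utterance_mvn(np.random.randn(50, 80)).mean()) < 1e-6)  # True
```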
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | # Copyright 2014 Vassil Panayotov |
| | | # 2014 Johns Hopkins University (author: Daniel Povey) |
| | | # Apache 2.0 |
| | | |
| | | if [ "$#" -ne 2 ]; then |
| | | echo "Usage: $0 <src-dir> <dst-dir>" |
| | | echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" |
| | | exit 1 |
| | | fi |
| | | |
| | | src=$1 |
| | | dst=$2 |
| | | |
| | | # all utterances are FLAC compressed |
| | | if ! which flac >&/dev/null; then |
| | | echo "Please install 'flac' on ALL worker nodes!" |
| | | exit 1 |
| | | fi |
| | | |
| | | spk_file=$src/../SPEAKERS.TXT |
| | | |
| | | mkdir -p $dst || exit 1 |
| | | |
| | | [ ! -d $src ] && echo "$0: no such directory $src" && exit 1 |
| | | [ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1 |
| | | |
| | | |
| | | wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp |
| | | trans=$dst/text; [[ -f "$trans" ]] && rm $trans |
| | | |
| | | for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do |
| | |   reader=$(basename $reader_dir) |
| | |   if ! [ $reader -eq $reader ]; then  # not an integer |
| | |     echo "$0: unexpected subdirectory name $reader" |
| | |     exit 1 |
| | |   fi |
| | | |
| | |   for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do |
| | |     chapter=$(basename $chapter_dir) |
| | |     if ! [ "$chapter" -eq "$chapter" ]; then |
| | |       echo "$0: unexpected chapter-subdirectory name $chapter" |
| | |       exit 1 |
| | |     fi |
| | | |
| | |     find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ |
| | |       awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac \n", $0, dir, $0}' >>$wav_scp || exit 1 |
| | | |
| | |     chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt |
| | |     [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 |
| | |     cat $chapter_trans >>$trans |
| | |   done |
| | | done |
| | | |
| | | echo "$0: successfully prepared data in $dst" |
| | | |
| | | exit 0 |
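The script emits Kaldi-style `wav.scp` (`<utt-id> <path>.flac`) and `text` (`<utt-id> <transcript>`) tables, one utterance per line. A small reader sketch for these files (the paths are examples):

```python
def read_kv_file(path):
    """Parse Kaldi-style 'key rest-of-line' tables such as wav.scp or text."""
    table = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            table[key] = value
    return table

wavs = read_kv_file("data/dev_clean/wav.scp")
texts = read_kv_file("data/dev_clean/text")
print(len(wavs), next(iter(wavs.items())))
```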
| New file |
| | |
| | | export FUNASR_DIR=$PWD/../../.. |
| | | |
| | | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C |
| | | export PYTHONIOENCODING=UTF-8 |
| | | export PATH=$FUNASR_DIR/funasr/bin:$PATH |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | . ./path.sh || exit 1; |
| | | |
| | | # machines configuration |
| | | CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" |
| | | gpu_num=8 |
| | | count=1 |
| | | gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding |
| | | # for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob |
| | | njob=5 |
| | | train_cmd=utils/run.pl |
| | | infer_cmd=utils/run.pl |
| | | |
| | | # general configuration |
| | | feats_dir="../DATA" # feature output directory |
| | | exp_dir="." |
| | | lang=en |
| | | dumpdir=dump/fbank |
| | | feats_type=fbank |
| | | token_type=bpe |
| | | dataset_type=large |
| | | scp=feats.scp |
| | | type=kaldi_ark |
| | | stage=3 |
| | | stop_stage=4 |
| | | |
| | | # feature configuration |
| | | feats_dim=80 |
| | | sample_frequency=16000 |
| | | nj=100 |
| | | speed_perturb="0.9,1.0,1.1" |
| | | |
| | | # data |
| | | data_librispeech= |
| | | |
| | | # bpe model |
| | | nbpe=5000 |
| | | bpemode=unigram |
| | | |
| | | # exp tag |
| | | tag="" |
| | | |
| | | . utils/parse_options.sh || exit 1; |
| | | |
| | | # Set bash to 'strict' mode: it will exit on |
| | | # -e 'error', -u 'undefined variable', and -o pipefail 'error in pipeline' (-x would print commands). |
| | | set -e |
| | | set -u |
| | | set -o pipefail |
| | | |
| | | train_set=train_960 |
| | | valid_set=dev |
| | | test_sets="test_clean test_other dev_clean dev_other" |
| | | |
| | | asr_config=conf/train_asr_conformer.yaml |
| | | #asr_config=conf/train_asr_conformer_uttnorm.yaml |
| | | model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}" |
| | | |
| | | inference_config=conf/decode_asr_transformer.yaml |
| | | #inference_config=conf/decode_asr_transformer_beam60_ctc0.3.yaml |
| | | inference_asr_model=valid.acc.ave_10best.pth |
| | | |
| | | # you can set gpu num for decoding here |
| | | gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default |
| | | ngpu=$(echo $gpuid_list | awk -F "," '{print NF}') |
| | | |
| | | if ${gpu_inference}; then |
| | | inference_nj=$((ngpu * njob)) |
| | | _ngpu=1 |
| | | else |
| | | inference_nj=$njob |
| | | _ngpu=0 |
| | | fi |
| | | |
| | | if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then |
| | | echo "stage 0: Data preparation" |
| | | # Data preparation |
| | | for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do |
| | | local/data_prep_librispeech.sh ${data_librispeech}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_} |
| | | done |
| | | fi |
| | | |
| | | feat_train_dir=${feats_dir}/${dumpdir}/$train_set; mkdir -p ${feat_train_dir} |
| | | feat_dev_clean_dir=${feats_dir}/${dumpdir}/dev_clean; mkdir -p ${feat_dev_clean_dir} |
| | | feat_dev_other_dir=${feats_dir}/${dumpdir}/dev_other; mkdir -p ${feat_dev_other_dir} |
| | | feat_test_clean_dir=${feats_dir}/${dumpdir}/test_clean; mkdir -p ${feat_test_clean_dir} |
| | | feat_test_other_dir=${feats_dir}/${dumpdir}/test_other; mkdir -p ${feat_test_other_dir} |
| | | feat_dev_dir=${feats_dir}/${dumpdir}/$valid_set; mkdir -p ${feat_dev_dir} |
| | | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then |
| | | echo "stage 1: Feature Generation" |
| | | # compute fbank features |
| | | fbankdir=${feats_dir}/fbank |
| | | for x in dev_clean dev_other test_clean test_other; do |
| | | utils/compute_fbank.sh --cmd "$train_cmd" --nj 1 --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \ |
| | | ${feats_dir}/data/${x} ${exp_dir}/exp/make_fbank/${x} ${fbankdir}/${x} |
| | | utils/fix_data_feat.sh ${fbankdir}/${x} |
| | | done |
| | | |
| | | mkdir -p ${feats_dir}/data/$train_set |
| | | train_sets="train_clean_100 train_clean_360 train_other_500" |
| | | for file in wav.scp text; do |
| | | ( for f in $train_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$train_set/$file || exit 1; |
| | | done |
| | | utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} --speed_perturb ${speed_perturb} \ |
| | | ${feats_dir}/data/$train_set ${exp_dir}/exp/make_fbank/$train_set ${fbankdir}/$train_set |
| | | utils/fix_data_feat.sh ${fbankdir}/$train_set |
| | | |
| | | # compute global cmvn |
| | | utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} \ |
| | | ${fbankdir}/$train_set ${exp_dir}/exp/make_fbank/$train_set |
| | | |
| | | # apply cmvn |
| | | utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \ |
| | | ${fbankdir}/$train_set ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/$train_set ${feat_train_dir} |
| | | utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \ |
| | | ${fbankdir}/dev_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_clean ${feat_dev_clean_dir} |
| | | utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \ |
| | | ${fbankdir}/dev_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_other ${feat_dev_other_dir} |
| | | utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \ |
| | | ${fbankdir}/test_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_clean ${feat_test_clean_dir} |
| | | utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \ |
| | | ${fbankdir}/test_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_other ${feat_test_other_dir} |
| | | |
| | | cp ${fbankdir}/$train_set/text ${fbankdir}/$train_set/speech_shape ${fbankdir}/$train_set/text_shape ${feat_train_dir} |
| | | cp ${fbankdir}/dev_clean/text ${fbankdir}/dev_clean/speech_shape ${fbankdir}/dev_clean/text_shape ${feat_dev_clean_dir} |
| | | cp ${fbankdir}/dev_other/text ${fbankdir}/dev_other/speech_shape ${fbankdir}/dev_other/text_shape ${feat_dev_other_dir} |
| | | cp ${fbankdir}/test_clean/text ${fbankdir}/test_clean/speech_shape ${fbankdir}/test_clean/text_shape ${feat_test_clean_dir} |
| | | cp ${fbankdir}/test_other/text ${fbankdir}/test_other/speech_shape ${fbankdir}/test_other/text_shape ${feat_test_other_dir} |
| | | |
| | | dev_sets="dev_clean dev_other" |
| | | for file in feats.scp text speech_shape text_shape; do |
| | | ( for f in $dev_sets; do cat $feats_dir/${dumpdir}/$f/$file; done ) | sort -k1 > $feat_dev_dir/$file || exit 1; |
| | | done |
| | | |
| | | #generate ark list |
| | | utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_train_dir} ${fbankdir}/${train_set} ${feat_train_dir} |
| | | utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_dev_dir} ${fbankdir}/${valid_set} ${feat_dev_dir} |
| | | fi |
| | | |
| | | dict=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt |
| | | bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe} |
| | | echo "dictionary: ${dict}" |
| | | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then |
| | | ### Task dependent. You have to check non-linguistic symbols used in the corpus. |
| | | echo "stage 2: Dictionary and Json Data Preparation" |
| | | mkdir -p ${feats_dir}/data/lang_char/ |
| | | echo "<blank>" > ${dict} |
| | | echo "<s>" >> ${dict} |
| | | echo "</s>" >> ${dict} |
| | | cut -f 2- -d" " ${feats_dir}/data/${train_set}/text > ${feats_dir}/data/lang_char/input.txt |
| | | spm_train --input=${feats_dir}/data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 |
| | | spm_encode --model=${bpemodel}.model --output_format=piece < ${feats_dir}/data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0}' >> ${dict} |
| | | echo "<unk>" >> ${dict} |
| | | wc -l ${dict} |
| | | |
| | | vocab_size=$(cat ${dict} | wc -l) |
| | | awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char |
| | | awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char |
| | | mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$train_set |
| | | mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$valid_set |
| | | cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$train_set |
| | | cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$valid_set |
| | | fi |
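As a quick sanity check of the BPE artifacts built in stage 2, the trained model can be loaded with the sentencepiece Python API. The model path below follows the `${bpemodel}` naming in this script and the sample sentence is made up:

```python
import sentencepiece as spm

# ${bpemodel}.model with train_set=train_960, bpemode=unigram, nbpe=5000
sp = spm.SentencePieceProcessor(model_file="data/lang_char/train_960_unigram5000.model")
pieces = sp.encode("THE QUICK BROWN FOX", out_type=str)
print(pieces)  # subword pieces, e.g. ['▁THE', '▁QUICK', ...]
```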
| | | |
| | | |
| | | # Training Stage |
| | | world_size=$gpu_num # run on one machine |
| | | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then |
| | | echo "stage 3: Training" |
| | | mkdir -p ${exp_dir}/exp/${model_dir} |
| | | mkdir -p ${exp_dir}/exp/${model_dir}/log |
| | | INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init |
| | | if [ -f $INIT_FILE ];then |
| | | rm -f $INIT_FILE |
| | | fi |
| | | init_method=file://$(readlink -f $INIT_FILE) |
| | | echo "$0: init method is $init_method" |
| | | for ((i = 0; i < $gpu_num; ++i)); do |
| | | { |
| | | rank=$i |
| | | local_rank=$i |
| | | gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1))) |
| | | asr_train.py \ |
| | | --gpu_id $gpu_id \ |
| | | --use_preprocessor true \ |
| | | --split_with_space false \ |
| | | --bpemodel ${bpemodel}.model \ |
| | | --token_type $token_type \ |
| | | --dataset_type $dataset_type \ |
| | | --token_list $dict \ |
| | | --train_data_file $feats_dir/$dumpdir/${train_set}/ark_txt.scp \ |
| | | --valid_data_file $feats_dir/$dumpdir/${valid_set}/ark_txt.scp \ |
| | | --resume true \ |
| | | --output_dir ${exp_dir}/exp/${model_dir} \ |
| | | --config $asr_config \ |
| | | --input_size $feats_dim \ |
| | | --ngpu $gpu_num \ |
| | | --num_worker_count $count \ |
| | | --multiprocessing_distributed true \ |
| | | --dist_init_method $init_method \ |
| | | --dist_world_size $world_size \ |
| | | --dist_rank $rank \ |
| | | --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1 |
| | | } & |
| | | done |
| | | wait |
| | | fi |
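The loop above forks one training process per GPU and rendezvouses them through the shared `file://$INIT_FILE` URL. A stripped-down sketch of what that initialization amounts to inside each worker (illustrative; the real wiring is in `asr_train.py`):

```python
import torch
import torch.distributed as dist

def init_worker(rank: int, world_size: int, init_file: str):
    # All workers point at the same on-disk file for rendezvous.
    dist.init_process_group(
        backend="nccl" if torch.cuda.is_available() else "gloo",
        init_method=f"file://{init_file}",
        world_size=world_size,
        rank=rank,
    )
    if torch.cuda.is_available():
        torch.cuda.set_device(rank)  # one GPU per process

# e.g. init_worker(rank=0, world_size=8, init_file="/abs/path/exp/.../ddp_init")
```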
| | | |
| | | # Testing Stage |
| | | if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then |
| | | echo "stage 4: Inference" |
| | | for dset in ${test_sets}; do |
| | | asr_exp=${exp_dir}/exp/${model_dir} |
| | | inference_tag="$(basename "${inference_config}" .yaml)" |
| | | _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}" |
| | | _logdir="${_dir}/logdir" |
| | | if [ -d ${_dir} ]; then |
| | | echo "${_dir} already exists. If you want to decode again, please delete this dir first." |
| | | exit 0 |
| | | fi |
| | | mkdir -p "${_logdir}" |
| | | _data="${feats_dir}/${dumpdir}/${dset}" |
| | | key_file=${_data}/${scp} |
| | | num_scp_file="$(<${key_file} wc -l)" |
| | | _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file") |
| | | split_scps= |
| | | for n in $(seq "${_nj}"); do |
| | | split_scps+=" ${_logdir}/keys.${n}.scp" |
| | | done |
| | | # shellcheck disable=SC2086 |
| | | utils/split_scp.pl "${key_file}" ${split_scps} |
| | | _opts= |
| | | if [ -n "${inference_config}" ]; then |
| | | _opts+="--config ${inference_config} " |
| | | fi |
| | | ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \ |
| | | python -m funasr.bin.asr_inference_launch \ |
| | | --batch_size 1 \ |
| | | --ngpu "${_ngpu}" \ |
| | | --njob ${njob} \ |
| | | --gpuid_list ${gpuid_list} \ |
| | | --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \ |
| | | --key_file "${_logdir}"/keys.JOB.scp \ |
| | | --asr_train_config "${asr_exp}"/config.yaml \ |
| | | --asr_model_file "${asr_exp}"/"${inference_asr_model}" \ |
| | | --output_dir "${_logdir}"/output.JOB \ |
| | | --mode asr \ |
| | | ${_opts} |
| | | |
| | | for f in token token_int score text; do |
| | | if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then |
| | | for i in $(seq "${_nj}"); do |
| | | cat "${_logdir}/output.${i}/1best_recog/${f}" |
| | | done | sort -k1 >"${_dir}/${f}" |
| | | fi |
| | | done |
| | | python utils/compute_wer.py ${_data}/text ${_dir}/text ${_dir}/text.cer |
| | | tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt |
| | | cat ${_dir}/text.cer.txt |
| | | done |
| | | fi |
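`utils/compute_wer.py` then scores the merged hypothesis `text` against the reference. Its core is the standard word-level edit distance; a minimal self-contained version for reference (not the actual script, which also reports per-utterance detail):

```python
def word_error_rate(ref: str, hyp: str) -> float:
    r, h = ref.split(), hyp.split()
    # dp[i][j]: edit distance between r[:i] and h[:j]
    dp = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        dp[i][0] = i
    for j in range(len(h) + 1):
        dp[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            sub = dp[i - 1][j - 1] + (r[i - 1] != h[j - 1])
            dp[i][j] = min(sub, dp[i - 1][j] + 1, dp[i][j - 1] + 1)
    return dp[len(r)][len(h)] / max(len(r), 1)

print(word_error_rate("the cat sat", "the cat sat down"))  # 1 insertion / 3 words
```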
| New file |
| | |
| | | ../../aishell/transformer/utils |
| | |
| | | # If text exists, compute CER |
| | | text_in = os.path.join(params["data_dir"], "text") |
| | | if os.path.exists(text_in): |
| | | text_proc_file = os.path.join(best_recog_path, "token") |
| | | text_proc_file = os.path.join(best_recog_path, "text") |
| | | compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer")) |
| | | |
| | | |
| | |
| | | # compute CER if GT text is set |
| | | text_in = os.path.join(params["data_dir"], "text") |
| | | if os.path.exists(text_in): |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/token") |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/text") |
| | | compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer")) |
| | | |
| | | |
| | |
| | | # If text exists, compute CER |
| | | text_in = os.path.join(params["data_dir"], "text") |
| | | if os.path.exists(text_in): |
| | | text_proc_file = os.path.join(best_recog_path, "token") |
| | | text_proc_file = os.path.join(best_recog_path, "text") |
| | | compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer")) |
| | | |
| | | |
| | |
| | | # compute CER if GT text is set |
| | | text_in = os.path.join(params["data_dir"], "text") |
| | | if os.path.exists(text_in): |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/token") |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/text") |
| | | compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer")) |
| | | |
| | | |
| | |
| | | if __name__ == "__main__": |
| | | parser = argparse.ArgumentParser() |
| | | parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") |
| | | parser.add_argument('--audio_in', type=str, default="./data/test") |
| | | parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp") |
| | | parser.add_argument('--output_dir', type=str, default="./results/") |
| | | parser.add_argument('--batch_size', type=int, default=64) |
| | | parser.add_argument('--gpuid', type=str, default="0") |
| | |
| | | |
| | | if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then |
| | | echo "Computing WER ..." |
| | | python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc |
| | | python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref |
| | | cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc |
| | | cp ${data_dir}/text ${output_dir}/1best_recog/text.ref |
| | | python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer |
| | | tail -n 3 ${output_dir}/1best_recog/text.cer |
| | | fi |
| | |
| | | # compute CER if GT text is set |
| | | text_in = os.path.join(params["data_dir"], "text") |
| | | if os.path.exists(text_in): |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/token") |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/text") |
| | | compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer")) |
| | | |
| | | |
| | |
| | | if __name__ == "__main__": |
| | | parser = argparse.ArgumentParser() |
| | | parser.add_argument('--model', type=str, default="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1") |
| | | parser.add_argument('--audio_in', type=str, default="./data/test") |
| | | parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp") |
| | | parser.add_argument('--output_dir', type=str, default="./results/") |
| | | parser.add_argument('--batch_size', type=int, default=64) |
| | | parser.add_argument('--gpuid', type=str, default="0") |
| | |
| | | |
| | | if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then |
| | | echo "Computing WER ..." |
| | | python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc |
| | | python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref |
| | | cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc |
| | | cp ${data_dir}/text ${output_dir}/1best_recog/text.ref |
| | | python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer |
| | | tail -n 3 ${output_dir}/1best_recog/text.cer |
| | | fi |
| | |
| | | # compute CER if GT text is set |
| | | text_in = os.path.join(params["data_dir"], "text") |
| | | if os.path.exists(text_in): |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/token") |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/text") |
| | | compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer")) |
| | | |
| | | |
| | |
| | | # If text exists, compute CER |
| | | text_in = os.path.join(params["data_dir"], "text") |
| | | if os.path.exists(text_in): |
| | | text_proc_file = os.path.join(best_recog_path, "token") |
| | | text_proc_file = os.path.join(best_recog_path, "text") |
| | | compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer")) |
| | | |
| | | |
| | |
| | | # compute CER if GT text is set |
| | | text_in = os.path.join(params["data_dir"], "text") |
| | | if os.path.exists(text_in): |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/token") |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/text") |
| | | compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer")) |
| | | |
| | | |
| | |
| | | # If text exists, compute CER |
| | | text_in = os.path.join(params["data_dir"], "text") |
| | | if os.path.exists(text_in): |
| | | text_proc_file = os.path.join(best_recog_path, "token") |
| | | text_proc_file = os.path.join(best_recog_path, "text") |
| | | compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer")) |
| | | |
| | | |
| | |
| | | # compute CER if GT text is set |
| | | text_in = os.path.join(params["data_dir"], "text") |
| | | if os.path.exists(text_in): |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/token") |
| | | text_proc_file = os.path.join(decoding_path, "1best_recog/text") |
| | | compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer")) |
| | | |
| | | |
| | |
| | | finish_count += 1 |
| | | # asr_utils.print_progress(finish_count / file_count) |
| | | if writer is not None: |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | ibest_writer["text"][key] = " ".join(word_lists) |
| | | |
| | | logging.info("decoding, utt: {}, predictions: {}".format(key, text)) |
| | | rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor)) |
| | |
| | | from funasr.models.frontend.wav_frontend import WavFrontend |
| | | from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer |
| | | from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export |
| | | np.set_printoptions(threshold=np.inf) |
| | | |
| | | class Speech2Text: |
| | | """Speech2Text class |
| | |
| | | # Input as audio signal |
| | | if isinstance(speech, np.ndarray): |
| | | speech = torch.tensor(speech) |
| | | |
| | | if self.frontend is not None: |
| | | feats, feats_len = self.frontend.forward(speech, speech_lengths) |
| | | feats = to_device(feats, device=self.device) |
| | |
| | | feats = speech |
| | | feats_len = speech_lengths |
| | | lfr_factor = max(1, (feats.size()[-1] // 80) - 1) |
| | | feats_len = cache["encoder"]["stride"] + cache["encoder"]["pad_left"] + cache["encoder"]["pad_right"] |
| | | feats = feats[:,cache["encoder"]["start_idx"]:cache["encoder"]["start_idx"]+feats_len,:] |
| | | feats_len = torch.tensor([feats_len]) |
| | | batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache} |
| | | |
| | | # a. To device |
| | | batch = to_device(batch, device=self.device) |
| | | |
| | | # b. Forward Encoder |
| | | enc, enc_len = self.asr_model.encode_chunk(**batch) |
| | | enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache) |
| | | if isinstance(enc, tuple): |
| | | enc = enc[0] |
| | | # assert len(enc) == 1, len(enc) |
| | |
| | | else: |
| | | speech2text = Speech2Text(**speech2text_kwargs) |
| | | |
| | | def _load_bytes(input): |
| | | middle_data = np.frombuffer(input, dtype=np.int16) |
| | | middle_data = np.asarray(middle_data) |
| | | if middle_data.dtype.kind not in 'iu': |
| | | raise TypeError("'middle_data' must be an array of integers") |
| | | dtype = np.dtype('float32') |
| | | if dtype.kind != 'f': |
| | | raise TypeError("'dtype' must be a floating point type") |
| | | |
| | | i = np.iinfo(middle_data.dtype) |
| | | abs_max = 2 ** (i.bits - 1) |
| | | offset = i.min + abs_max |
| | | array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32) |
| | | return array |
| | | |
| | | def _forward( |
| | | data_path_and_name_and_type, |
| | | raw_inputs: Union[np.ndarray, torch.Tensor] = None, |
| | |
| | | ): |
| | | |
| | | # 3. Build data-iterator |
| | | if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes": |
| | | raw_inputs = _load_bytes(data_path_and_name_and_type[0]) |
| | | raw_inputs = torch.tensor(raw_inputs) |
| | | if data_path_and_name_and_type is None and raw_inputs is not None: |
| | | if isinstance(raw_inputs, np.ndarray): |
| | | raw_inputs = torch.tensor(raw_inputs) |
| | | |
| | | is_final = False |
| | | if param_dict is not None and "cache" in param_dict: |
| | | cache = param_dict["cache"] |
| | |
| | | asr_result = "" |
| | | wait = True |
| | | if len(cache) == 0: |
| | | cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None} |
| | | cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None, "is_final": is_final, "left": 0, "right": 0} |
| | | cache_de = {"decode_fsmn": None} |
| | | cache["decoder"] = cache_de |
| | | cache["first_chunk"] = True |
| | | cache["speech"] = [] |
| | | cache["chunk_index"] = 0 |
| | | cache["speech_chunk"] = [] |
| | | cache["accum_speech"] = 0 |
| | | |
| | | if raw_inputs is not None: |
| | | if len(cache["speech"]) == 0: |
| | | cache["speech"] = raw_inputs |
| | | else: |
| | | cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0) |
| | | if len(cache["speech_chunk"]) == 0: |
| | | cache["speech_chunk"] = raw_inputs |
| | | else: |
| | | cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0) |
| | | while len(cache["speech_chunk"]) >= 960: |
| | | cache["accum_speech"] += len(raw_inputs) |
| | | while cache["accum_speech"] >= 960: |
| | | if cache["first_chunk"]: |
| | | if len(cache["speech_chunk"]) >= 14400: |
| | | speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0) |
| | | speech_length = torch.tensor([14400]) |
| | | if cache["accum_speech"] >= 14400: |
| | | speech = torch.unsqueeze(cache["speech"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech"])]) |
| | | cache["encoder"]["pad_left"] = 5 |
| | | cache["encoder"]["pad_right"] = 5 |
| | | cache["encoder"]["stride"] = 10 |
| | | cache["encoder"]["left"] = 5 |
| | | cache["encoder"]["right"] = 0 |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"]= cache["speech_chunk"][4800:] |
| | | cache["accum_speech"] -= 4800 |
| | | cache["first_chunk"] = False |
| | | cache["encoder"]["start_idx"] = -5 |
| | | cache["encoder"]["is_final"] = False |
| | | wait = False |
| | | else: |
| | | if is_final: |
| | | cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960 |
| | | cache["encoder"]["stride"] = len(cache["speech"]) // 960 |
| | | cache["encoder"]["pad_left"] = 0 |
| | | cache["encoder"]["pad_right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech_chunk"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech_chunk"])]) |
| | | speech = torch.unsqueeze(cache["speech"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = [] |
| | | cache["accum_speech"] = 0 |
| | | wait = False |
| | | else: |
| | | break |
| | | else: |
| | | if len(cache["speech_chunk"]) >= 19200: |
| | | if cache["accum_speech"] >= 19200: |
| | | cache["encoder"]["start_idx"] += 10 |
| | | cache["encoder"]["stride"] = 10 |
| | | cache["encoder"]["pad_left"] = 5 |
| | | speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0) |
| | | speech_length = torch.tensor([19200]) |
| | | cache["encoder"]["pad_right"] = 5 |
| | | cache["encoder"]["left"] = 0 |
| | | cache["encoder"]["right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = cache["speech_chunk"][9600:] |
| | | cache["accum_speech"] -= 9600 |
| | | wait = False |
| | | else: |
| | | if is_final: |
| | | cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960 |
| | | cache["encoder"]["pad_right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech_chunk"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech_chunk"])]) |
| | | cache["encoder"]["is_final"] = True |
| | | if cache["accum_speech"] >= 14400: |
| | | cache["encoder"]["start_idx"] += 10 |
| | | cache["encoder"]["stride"] = 10 |
| | | cache["encoder"]["pad_left"] = 5 |
| | | cache["encoder"]["pad_right"] = 5 |
| | | cache["encoder"]["left"] = 0 |
| | | cache["encoder"]["right"] = cache["accum_speech"] // 960 - 15 |
| | | speech = torch.unsqueeze(cache["speech"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = [] |
| | | cache["accum_speech"] -= 9600 |
| | | wait = False |
| | | else: |
| | | cache["encoder"]["start_idx"] += 10 |
| | | cache["encoder"]["stride"] = cache["accum_speech"] // 960 - 5 |
| | | cache["encoder"]["pad_left"] = 5 |
| | | cache["encoder"]["pad_right"] = 0 |
| | | cache["encoder"]["left"] = 0 |
| | | cache["encoder"]["right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["accum_speech"] = 0 |
| | | wait = False |
| | | else: |
| | | break |
| | |
| | | ibest_writer["token"][key] = " ".join(token) |
| | | ibest_writer["token_int"][key] = " ".join(map(str, token_int)) |
| | | ibest_writer["vad"][key] = "{}".format(vadsegments) |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | ibest_writer["text"][key] = " ".join(word_lists) |
| | | ibest_writer["text_with_punc"][key] = text_postprocessed_punc |
| | | if time_stamp_postprocessed is not None: |
| | | ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed) |
| | |
| | | ibest_writer["token"][key] = " ".join(token) |
| | | ibest_writer["token_int"][key] = " ".join(map(str, token_int)) |
| | | ibest_writer["vad"][key] = "{}".format(vadsegments) |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | ibest_writer["text"][key] = " ".join(word_lists) |
| | | ibest_writer["text_with_punc"][key] = text_postprocessed_punc |
| | | if time_stamp_postprocessed is not None: |
| | | ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed) |
| | |
| | | ibest_writer["rtf"][key] = rtf_cur |
| | | |
| | | if text is not None: |
| | | text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) |
| | | text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token) |
| | | item = {'key': key, 'value': text_postprocessed} |
| | | asr_result_list.append(item) |
| | | finish_count += 1 |
| | | # asr_utils.print_progress(finish_count / file_count) |
| | | if writer is not None: |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | ibest_writer["text"][key] = " ".join(word_lists) |
| | | |
| | | logging.info("decoding, utt: {}, predictions: {}".format(key, text)) |
| | | rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor)) |
| | |
| | | ibest_writer["score"][key] = str(hyp.score) |
| | | |
| | | if text is not None: |
| | | text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) |
| | | text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token) |
| | | item = {'key': key, 'value': text_postprocessed} |
| | | asr_result_list.append(item) |
| | | finish_count += 1 |
| | | asr_utils.print_progress(finish_count / file_count) |
| | | if writer is not None: |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | ibest_writer["text"][key] = " ".join(word_lists) |
| | | return asr_result_list |
| | | |
| | | return _forward |
| | |
| | | ibest_writer["score"][key] = str(hyp.score) |
| | | |
| | | if text is not None: |
| | | text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) |
| | | text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token) |
| | | item = {'key': key, 'value': text_postprocessed} |
| | | asr_result_list.append(item) |
| | | finish_count += 1 |
| | | asr_utils.print_progress(finish_count / file_count) |
| | | if writer is not None: |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | ibest_writer["text"][key] = " ".join(word_lists) |
| | | return asr_result_list |
| | | |
| | | return _forward |
| | |
| | | vad = -2 |
| | | |
| | | if bpe_tokenizer is not None: |
| | | text = bpe_tokenizer.text2tokens(text) |
| | | text = bpe_tokenizer.text2tokens("".join(text)) |
| | | |
| | | if seg_dict is not None: |
| | | assert isinstance(seg_dict, dict) |
| | |
| | | self, |
| | | cache_dir: Union[Path, str] = None, |
| | | onnx: bool = True, |
| | | device: str = "cpu", |
| | | quant: bool = True, |
| | | fallback_num: int = 0, |
| | | audio_in: str = None, |
| | |
| | | ) |
| | | print("output dir: {}".format(self.cache_dir)) |
| | | self.onnx = onnx |
| | | self.device = device |
| | | self.quant = quant |
| | | self.fallback_num = fallback_num |
| | | self.frontend = None |
| | |
| | | dummy_input = model.get_dummy_inputs(enc_size) |
| | | else: |
| | | dummy_input = model.get_dummy_inputs() |
| | | |
| | | if self.device == 'cuda': |
| | | model = model.cuda() |
| | | dummy_input = tuple([i.cuda() for i in dummy_input]) |
| | | |
| | | # model_script = torch.jit.script(model) |
| | | model_script = torch.jit.trace(model, dummy_input) |
| | |
| | | parser.add_argument('--model-name', type=str, required=True) |
| | | parser.add_argument('--export-dir', type=str, required=True) |
| | | parser.add_argument('--type', type=str, default='onnx', help='["onnx", "torch"]') |
| | | parser.add_argument('--device', type=str, default='cpu', help='["cpu", "cuda"]') |
| | | parser.add_argument('--quantize', type=str2bool, default=False, help='export quantized model') |
| | | parser.add_argument('--fallback-num', type=int, default=0, help='amp fallback number') |
| | | parser.add_argument('--audio_in', type=str, default=None, help='["wav", "wav.scp"]') |
| | |
| | | export_model = ModelExport( |
| | | cache_dir=args.export_dir, |
| | | onnx=args.type == 'onnx', |
| | | device=args.device, |
| | | quant=args.quantize, |
| | | fallback_num=args.fallback_num, |
| | | audio_in=args.audio_in, |
| | |
| | | return x, cache |
| | | |
| | | |
| | | torch_version = float(".".join(torch.__version__.split(".")[:2])) |
| | | if torch_version >= 1.8: |
| | | torch_version = tuple([int(i) for i in torch.__version__.split(".")[:2]]) |
| | | if torch_version >= (1, 8): |
| | | import torch.fx |
| | | torch.fx.wrap('preprocess_for_attn') |
| | | |
| | |
| | | return x, tgt_mask, x_self_attn, x_src_attn |
| | | |
| | | |
| | | class ContexutalBiasDecoder(nn.Module): |
| | | class ContextualBiasDecoder(nn.Module): |
| | | def __init__( |
| | | self, |
| | | size, |
| | |
| | | normalize_before=True, |
| | | ): |
| | | """Construct an DecoderLayer object.""" |
| | | super(ContexutalBiasDecoder, self).__init__() |
| | | super(ContextualBiasDecoder, self).__init__() |
| | | self.size = size |
| | | self.src_attn = src_attn |
| | | if src_attn is not None: |
| | |
| | | ), |
| | | ) |
| | | self.dropout = nn.Dropout(dropout_rate) |
| | | self.bias_decoder = ContexutalBiasDecoder( |
| | | self.bias_decoder = ContextualBiasDecoder( |
| | | size=attention_dim, |
| | | src_attn=MultiHeadedAttentionCrossAtt( |
| | | attention_heads, attention_dim, src_attention_dropout_rate |
| | |
| | | |
| | | x = residual + self.dropout(self.src_attn(x, memory, memory_mask)) |
| | | |
| | | |
| | | return x, tgt_mask, memory, memory_mask, cache |
| | | |
| | | def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None): |
| | |
| | | for i in range(self.att_layer_num): |
| | | decoder = self.decoders[i] |
| | | c = cache[i] |
| | | x, tgt_mask, memory, memory_mask, c_ret = decoder( |
| | | x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk( |
| | | x, tgt_mask, memory, memory_mask, cache=c |
| | | ) |
| | | new_cache.append(c_ret) |
| | |
| | | j = i + self.att_layer_num |
| | | decoder = self.decoders2[i] |
| | | c = cache[j] |
| | | x, tgt_mask, memory, memory_mask, c_ret = decoder( |
| | | x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk( |
| | | x, tgt_mask, memory, memory_mask, cache=c |
| | | ) |
| | | new_cache.append(c_ret) |
| | | |
| | | for decoder in self.decoders3: |
| | | x, tgt_mask, memory, memory_mask, _ = decoder( |
| | | x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk( |
| | | x, tgt_mask, memory, None, cache=None |
| | | ) |
| | | |
| | |
| | | for i in range(self.att_layer_num): |
| | | decoder = self.decoders[i] |
| | | c = cache[i] |
| | | x, tgt_mask, memory, memory_mask, c_ret = decoder( |
| | | x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk( |
| | | x, tgt_mask, memory, None, cache=c |
| | | ) |
| | | new_cache.append(c_ret) |
| | |
| | | j = i + self.att_layer_num |
| | | decoder = self.decoders2[i] |
| | | c = cache[j] |
| | | x, tgt_mask, memory, memory_mask, c_ret = decoder( |
| | | x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk( |
| | | x, tgt_mask, memory, None, cache=c |
| | | ) |
| | | new_cache.append(c_ret) |
| | | |
| | | for decoder in self.decoders3: |
| | | |
| | | x, tgt_mask, memory, memory_mask, _ = decoder( |
| | | x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk( |
| | | x, tgt_mask, memory, None, cache=None |
| | | ) |
| | | |
| | |
| | | encoder_out, encoder_out_lens |
| | | ) |
| | | |
| | | assert encoder_out.size(0) == speech.size(0), ( |
| | | encoder_out.size(), |
| | | speech.size(0), |
| | | ) |
| | | assert encoder_out.size(1) <= encoder_out_lens.max(), ( |
| | | encoder_out.size(), |
| | | encoder_out_lens.max(), |
| | | ) |
| | | |
| | | if intermediate_outs is not None: |
| | | return (encoder_out, intermediate_outs), encoder_out_lens |
| | | |
| | | return encoder_out, encoder_out_lens |
| | | return encoder_out, torch.tensor([encoder_out.size(1)]) |
| | | |
| | | def calc_predictor(self, encoder_out, encoder_out_lens): |
| | | |
| | |
| | | |
| | | # 1. Encoder |
| | | encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) |
| | | intermediate_outs = None |
| | | if isinstance(encoder_out, tuple): |
| | | intermediate_outs = encoder_out[1] |
| | | encoder_out = encoder_out[0] |
| | | |
| | | loss_att, acc_att, cer_att, wer_att = None, None, None, None |
| | | loss_ctc, cer_ctc = None, None |
| | | loss_pre = None |
| | | stats = dict() |
| | | |
| | | # 1. CTC branch |
| | | if self.ctc_weight != 0.0: |
| | | loss_ctc, cer_ctc = self._calc_ctc_loss( |
| | | encoder_out, encoder_out_lens, text, text_lengths |
| | | ) |
| | | |
| | | # Collect CTC branch stats |
| | | stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None |
| | | stats["cer_ctc"] = cer_ctc |
| | | |
| | | # Intermediate CTC (optional) |
| | | loss_interctc = 0.0 |
| | | if self.interctc_weight != 0.0 and intermediate_outs is not None: |
| | | for layer_idx, intermediate_out in intermediate_outs: |
| | | # we assume intermediate_out has the same length & padding |
| | | # as those of encoder_out |
| | | loss_ic, cer_ic = self._calc_ctc_loss( |
| | | intermediate_out, encoder_out_lens, text, text_lengths |
| | | ) |
| | | loss_interctc = loss_interctc + loss_ic |
| | | |
| | | # Collect Intermedaite CTC stats |
| | | stats["loss_interctc_layer{}".format(layer_idx)] = ( |
| | | loss_ic.detach() if loss_ic is not None else None |
| | | ) |
| | | stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic |
| | | |
| | | loss_interctc = loss_interctc / len(intermediate_outs) |
| | | |
| | | # calculate whole encoder loss |
| | | loss_ctc = ( |
| | | 1 - self.interctc_weight |
| | | ) * loss_ctc + self.interctc_weight * loss_interctc |
| | | |
| | | # 2b. Attention decoder branch |
| | | if self.ctc_weight != 1.0: |
| | | loss_att, acc_att, cer_att, wer_att, loss_pre = self._calc_att_loss( |
| | | encoder_out, encoder_out_lens, text, text_lengths |
| | | ) |
| | | |
| | | loss_pre2 = self._calc_pre2_loss( |
| | | encoder_out, encoder_out_lens, text, text_lengths |
| | | ) |
| | | |
| | | loss = loss_pre2 |
| | | # 3. CTC-Att loss definition |
| | | if self.ctc_weight == 0.0: |
| | | loss = loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5 |
| | | elif self.ctc_weight == 1.0: |
| | | loss = loss_ctc |
| | | else: |
| | | loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5 |
| | | |
| | | # Collect Attn branch stats |
| | | stats["loss_att"] = loss_att.detach() if loss_att is not None else None |
| | | stats["acc"] = acc_att |
| | | stats["cer"] = cer_att |
| | | stats["wer"] = wer_att |
| | | stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None |
| | | stats["loss_pre2"] = loss_pre2.detach().cpu() |
| | | |
| | | stats["loss"] = torch.clone(loss.detach()) |
| | | |
| | | # force_gatherable: to-device and to-tensor if scalar for DataParallel |
| | |
| | | inner_dim: int = 256, |
| | | bias_encoder_type: str = 'lstm', |
| | | label_bracket: bool = False, |
| | | use_decoder_embedding: bool = False, |
| | | ): |
| | | assert check_argument_types() |
| | | assert 0.0 <= ctc_weight <= 1.0, ctc_weight |
| | |
| | | self.hotword_buffer = None |
| | | self.length_record = [] |
| | | self.current_buffer_length = 0 |
| | | self.use_decoder_embedding = use_decoder_embedding |
| | | |
| | | def forward( |
| | | self, |
| | |
| | | hw_list.append(hw_tokens) |
| | | # padding |
| | | hw_list_pad = pad_list(hw_list, 0) |
| | | if self.use_decoder_embedding: |
| | | hw_embed = self.decoder.embed(hw_list_pad) |
| | | else: |
| | | hw_embed = self.bias_embed(hw_list_pad) |
| | | hw_embed, (_, _) = self.bias_encoder(hw_embed) |
| | | _ind = np.arange(0, len(hw_list)).tolist() |
| | | # update self.hotword_buffer, throw a part if oversize |
| | |
| | | # default hotword list |
| | | hw_list = [torch.Tensor([self.sos]).long().to(encoder_out.device)] # empty hotword list |
| | | hw_list_pad = pad_list(hw_list, 0) |
| | | if self.use_decoder_embedding: |
| | | hw_embed = self.decoder.embed(hw_list_pad) |
| | | else: |
| | | hw_embed = self.bias_embed(hw_list_pad) |
| | | _, (h_n, _) = self.bias_encoder(hw_embed) |
| | | contextual_info = h_n.squeeze(0).repeat(encoder_out.shape[0], 1, 1) |
| | | else: |
| | | hw_lengths = [len(i) for i in hw_list] |
| | | hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device) |
| | | if self.use_decoder_embedding: |
| | | hw_embed = self.decoder.embed(hw_list_pad) |
| | | else: |
| | | hw_embed = self.bias_embed(hw_list_pad) |
| | | hw_embed = torch.nn.utils.rnn.pack_padded_sequence(hw_embed, hw_lengths, batch_first=True, |
| | | enforce_sorted=False) |
| | |
| | | return acoustic_embeds, token_num, alphas, cif_peak |
| | | |
| | | def forward_chunk(self, hidden, cache=None): |
| | | b, t, d = hidden.size() |
| | | h = hidden |
| | | context = h.transpose(1, 2) |
| | | queries = self.pad(context) |
| | | |
| | | alphas = alphas * mask_chunk_predictor |
| | | |
| | | if cache is not None: |
| | | if cache["is_final"]: |
| | | alphas[:, cache["stride"] + cache["pad_left"] - 1] += 0.45 |
| | | if cache["cif_hidden"] is not None: |
| | | hidden = torch.cat((cache["cif_hidden"], hidden), 1) |
| | | if cache["cif_alphas"] is not None: |
| | | |
| | | pre_alphas_length = cache["cif_alphas"].size(-1) |
| | | mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0 |
| | | mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0 |
| | | |
| | | if mask_chunk_peak_predictor is not None: |
| | | cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1) |
| | 
| | | |
| | | import math |
| | | import torch |
| | | |
| | | import torch.nn.functional as F |
| | | |
| | | def _pre_hook( |
| | | state_dict, |
| | |
| | | |
| | | def forward_chunk(self, x, cache=None): |
| | | start_idx = 0 |
| | | pad_left = 0 |
| | | pad_right = 0 |
| | | batch_size, timesteps, input_dim = x.size() |
| | | if cache is not None: |
| | | start_idx = cache["start_idx"] |
| | | pad_left = cache["left"] |
| | | pad_right = cache["right"] |
| | | positions = torch.arange(1, timesteps+start_idx+1)[None, :] |
| | | position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device) |
| | | return x + position_encoding[:, start_idx: start_idx + timesteps] |
| | | outputs = x + position_encoding[:, start_idx: start_idx + timesteps] |
| | | outputs = outputs.transpose(1,2) |
| | | outputs = F.pad(outputs, (pad_left, pad_right)) |
| | | outputs = outputs.transpose(1,2) |
| | | return outputs |
| | | |
| | |
| | | python grpc_main_client_mic.py --host $server_ip --port 10108 |
| | | ``` |
| | | |
| | | The `grpc_main_client_mic.py` follows the [original design](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc#workflow-in-desgin) by sending audio_data in chunks. If you want to send the audio_data in a single request, here is an example: |
| | | |
| | | ```python |
| | | # go to ../python/grpc to find this package |
| | | import paraformer_pb2 |
| | | |
| | | |
| | | class RecognizeStub: |
| | | def __init__(self, channel): |
| | | self.Recognize = channel.stream_stream( |
| | | '/paraformer.ASR/Recognize', |
| | | request_serializer=paraformer_pb2.Request.SerializeToString, |
| | | response_deserializer=paraformer_pb2.Response.FromString, |
| | | ) |
| | | |
| | | |
| | | async def send(channel, data, speaking, isEnd): |
| | | stub = RecognizeStub(channel) |
| | | req = paraformer_pb2.Request() |
| | | if data: |
| | | req.audio_data = data |
| | | req.user = 'zz' |
| | | req.language = 'zh-CN' |
| | | req.speaking = speaking |
| | | req.isEnd = isEnd |
| | | q = queue.SimpleQueue() |
| | | q.put(req) |
| | | return stub.Recognize(iter(q.get, None)) |
| | | |
| | | # send the audio data once |
| | | async def grpc_rec(data, grpc_uri): |
| | | with grpc.insecure_channel(grpc_uri) as channel: |
| | | b = time.time() |
| | | response = await send(channel, data, False, False) |
| | | resp = next(response) |
| | | text = '' |
| | | if 'decoding' == resp.action: |
| | | resp = next(response) |
| | | if 'finish' == resp.action: |
| | | text = json.loads(resp.sentence)['text'] |
| | | response = await send(channel, None, False, True) |
| | | return { |
| | | 'text': text, |
| | | 'time': time.time() - b, |
| | | } |
| | | |
| | | async def test(): |
| | | # fc = FunAsrGrpcClient('127.0.0.1', 9900) |
| | | # t = await fc.rec(wav.tobytes()) |
| | | # print(t) |
| | | wav, _ = sf.read('z-10s.wav', dtype='int16') |
| | | uri = '127.0.0.1:9900' |
| | | res = await grpc_rec(wav.tobytes(), uri) |
| | | print(res) |
| | | |
| | | |
| | | if __name__ == '__main__': |
| | | asyncio.run(test()) |
| | | |
| | | ``` |
| | | |
| | | |
| | | ## Acknowledgements |
| | | 1. This project is maintained by the [FunASR community](https://github.com/alibaba-damo-academy/FunASR). |
| | | 2. We acknowledge [DeepScience](https://www.deepscience.cn) for contributing the gRPC service. |
| | |
| | | res.set_language(req.language()); |
| | | stream->Write(res); |
| | | } else if (!req.speaking()) { |
| | | if (client_buffers.count(req.user()) == 0) { |
| | | if (client_buffers.count(req.user()) == 0 && req.audio_data().size() == 0) { |
| | | Response res; |
| | | res.set_sentence( |
| | | R"({"success": true, "detail": "waiting_for_voice"})" |
| | |
| | | stream->Write(res); |
| | | }else { |
| | | auto begin_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count(); |
| | | if (req.audio_data().size() > 0) { |
| | | auto& buf = client_buffers[req.user()]; |
| | | buf.insert(buf.end(), req.audio_data().begin(), req.audio_data().end()); |
| | | } |
| | | std::string tmp_data = this->client_buffers[req.user()]; |
| | | this->clear_states(req.user()); |
| | | |
| | |
| | | else: |
| | | asr_result = "" |
| | | elif self.backend == "onnxruntime": |
| | | from rapid_paraformer.utils.frontend import load_bytes |
| | | from funasr_onnx.utils.frontend import load_bytes |
| | | array = load_bytes(tmp_data) |
| | | asr_result = self.inference_16k_pipeline(array)[0] |
| | | end_time = int(round(time.time() * 1000)) |
| | |
| | | |
| | | ```shell |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/funasr_torch |
| | | cd funasr/runtime/python/libtorch |
| | | python setup.py build |
| | | python setup.py install |
| | | ``` |
| | |
| | | |
| | | from funasr_torch import Paraformer |
| | | |
| | | model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model = Paraformer(model_dir, batch_size=1) |
| | | |
| | | wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav'] |
| | | model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | |
| | | model = Paraformer(model_dir, batch_size=1) # cpu |
| | | # model = Paraformer(model_dir, batch_size=1, device_id=0) # gpu |
| | | |
| | | # when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get a figure of the alignment in addition to the timestamps |
| | | # model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png") |
| | | |
| | | wav_path = "YourPath/xx.wav" |
| | | |
| | | result = model(wav_path) |
| | | print(result) |
| | |
| | | ) |
| | | self.ort_infer = torch.jit.load(model_file) |
| | | self.batch_size = batch_size |
| | | self.device_id = device_id |
| | | self.plot_timestamp_to = plot_timestamp_to |
| | | self.pred_bias = pred_bias |
| | | |
| | |
| | | end_idx = min(waveform_nums, beg_idx + self.batch_size) |
| | | feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx]) |
| | | try: |
| | | with torch.no_grad(): |
| | | if int(self.device_id) == -1: |
| | | outputs = self.ort_infer(feats, feats_len) |
| | | am_scores, valid_token_lens = outputs[0], outputs[1] |
| | | else: |
| | | outputs = self.ort_infer(feats.cuda(), feats_len.cuda()) |
| | | am_scores, valid_token_lens = outputs[0].cpu(), outputs[1].cpu() |
| | | if len(outputs) == 4: |
| | | # for BiCifParaformer Inference |
| | | us_alphas, us_peaks = outputs[2], outputs[3] |
| | |
| | | |
| | | ```shell |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/funasr_onnx |
| | | cd funasr/runtime/python/onnxruntime |
| | | python setup.py build |
| | | python setup.py install |
| | | ``` |
| | |
| | | |
| | | from funasr_onnx import Paraformer |
| | | |
| | | model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch" |
| | | |
| | | # if you use paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch, you should set pred_bias=0 |
| | | # plot_timestamp_to works only when using speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch |
| | | model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0) |
| | | model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | |
| | | wav_path = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/example/asr_example.wav" |
| | | model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0) # cpu |
| | | # model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0, device_id=0) # gpu |
| | | |
| | | # when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get a figure of the alignment in addition to the timestamps |
| | | # model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png") |
| | | |
| | | wav_path = "YourPath/xx.wav" |
| | | |
| | | result = model(wav_path) |
| | | print(result) |
| | |
| | | default=sys.maxsize, |
| | | help="The maximum number of update steps to train.", |
| | | ) |
| | | parser.add_argument( |
| | | "--batch_interval", |
| | | type=int, |
| | | default=10000, |
| | | help="The batch interval for saving model.", |
| | | ) |
| | | group.add_argument( |
| | | "--patience", |
| | | type=int_or_none, |
| | |
| | | from funasr.datasets.large_datasets.build_dataloader import ArkDataLoader |
| | | train_iter_factory = ArkDataLoader(args.train_data_file, args.token_list, args.dataset_conf, |
| | | frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None, |
| | | seg_dict_file=args.seg_dict_file if hasattr(args, |
| | | "seg_dict_file") else None, |
| | | seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None, |
| | | punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None, |
| | | bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None, |
| | | mode="train") |
| | | valid_iter_factory = ArkDataLoader(args.valid_data_file, args.token_list, args.dataset_conf, |
| | | frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None, |
| | | seg_dict_file=args.seg_dict_file if hasattr(args, |
| | | "seg_dict_file") else None, |
| | | seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None, |
| | | punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None, |
| | | bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None, |
| | | mode="eval") |
| | | elif args.dataset_type == "small": |
| | | train_iter_factory = cls.build_iter_factory( |
| | |
| | | ) -> AbsIterFactory: |
| | | assert check_argument_types() |
| | | |
| | | if args.frontend_conf is not None and "fs" in args.frontend_conf: |
| | | dest_sample_rate = args.frontend_conf["fs"] |
| | | else: |
| | | dest_sample_rate = 16000 |
| | | |
| | | dataset = ESPnetDataset( |
| | | iter_options.data_path_and_name_and_type, |
| | | float_dtype=args.train_dtype, |
| | | preprocess=iter_options.preprocess_fn, |
| | | max_cache_size=iter_options.max_cache_size, |
| | | max_cache_fd=iter_options.max_cache_fd, |
| | | dest_sample_rate=args.frontend_conf["fs"], |
| | | dest_sample_rate=dest_sample_rate, |
| | | ) |
| | | cls.check_task_requirements( |
| | | dataset, args.allow_variable_data_keys, train=iter_options.train |
| | |
| | | default="13_15", |
| | | help="The range of noise decibel level.", |
| | | ) |
| | | parser.add_argument( |
| | | "--batch_interval", |
| | | type=int, |
| | | default=10000, |
| | | help="The batch interval for saving model.", |
| | | ) |
| | | |
| | | for class_choices in cls.class_choices_list: |
| | | # Append --<name> and --<name>_conf. |
| | |
| | | ): |
| | | assert isinstance(batch, dict), type(batch) |
| | | |
| | | if rank == 0 and hasattr(model.module, "num_updates"): |
| | | num_batch_updates = model.module.get_num_updates() |
| | | if rank == 0: |
| | | if hasattr(model, "num_updates") or (hasattr(model, "module") and hasattr(model.module, "num_updates")): |
| | | num_batch_updates = model.get_num_updates() if hasattr(model,"num_updates") else model.module.get_num_updates() |
| | | if (num_batch_updates%batch_interval == 0) and (options.oss_bucket is not None) and options.use_pai: |
| | | buffer = BytesIO() |
| | | torch.save(model.state_dict(), buffer) |
| | |
| | | if out_item['wrong'] > 0: |
| | | rst['wrong_sentences'] += 1 |
| | | cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n') |
| | | cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n') |
| | | cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n') |
| | | cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n') |
| | | cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n') |
| | | |
| | | if rst['Wrd'] > 0: |
| | | rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2) |