Merge pull request #807 from alibaba-damo-academy/dev_wjm
update e_branchformer
| | |
| | | stop_stage=3 |
| | | |
| | | # feature configuration |
| | | feats_dim=80 |
| | | nj=64 |
| | | |
| | | # data |
| | |
| | | valid_set=dev_ios |
| | | |
| | | asr_config=conf/train_pretrain_transformer.yaml |
| | | model_dir="baseline_$(basename "${asr_config}" .yaml) _${lang}_${token_type}_${tag}" |
| | | model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}" |
| | | |
| | | if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then |
| | | echo "stage 0: Data preparation" |
| | |
| | | --resume true \ |
| | | --output_dir ${exp_dir}/exp/${model_dir} \ |
| | | --config $asr_config \ |
| | | --input_size $feats_dim \ |
| | | --ngpu $gpu_num \ |
| | | --num_worker_count $count \ |
| | | --multiprocessing_distributed true \ |
| New file |
| | |
| | | beam_size: 10 |
| | | penalty: 0.0 |
| | | maxlenratio: 0.0 |
| | | minlenratio: 0.0 |
| | | ctc_weight: 0.3 |
| | | lm_weight: 0.0 |
| New file |
| | |
| | | # network architecture |
| | | # encoder related |
| | | encoder: branchformer |
| | | encoder_conf: |
| | | output_size: 512 |
| | | use_attn: true |
| | | attention_heads: 8 |
| | | attention_layer_type: rel_selfattn |
| | | pos_enc_layer_type: rel_pos |
| | | rel_pos_type: latest |
| | | use_cgmlp: true |
| | | cgmlp_linear_units: 3072 |
| | | cgmlp_conv_kernel: 31 |
| | | use_linear_after_conv: false |
| | | gate_activation: identity |
| | | merge_method: concat |
| | | cgmlp_weight: 0.5 # used only if merge_method is "fixed_ave" |
| | | attn_branch_drop_rate: 0.0 # used only if merge_method is "learned_ave" |
| | | num_blocks: 18 |
| | | dropout_rate: 0.1 |
| | | positional_dropout_rate: 0.1 |
| | | attention_dropout_rate: 0.1 |
| | | input_layer: conv2d |
| | | stochastic_depth_rate: 0.0 |
| | | |
| | | # decoder related |
| | | decoder: transformer |
| | | decoder_conf: |
| | | attention_heads: 8 |
| | | linear_units: 2048 |
| | | num_blocks: 6 |
| | | dropout_rate: 0.1 |
| | | positional_dropout_rate: 0.1 |
| | | self_attention_dropout_rate: 0.1 |
| | | src_attention_dropout_rate: 0.1 |
| | | |
| | | # frontend related |
| | | frontend: wav_frontend |
| | | frontend_conf: |
| | | fs: 16000 |
| | | window: hamming |
| | | n_mels: 80 |
| | | frame_length: 25 |
| | | frame_shift: 10 |
| | | lfr_m: 1 |
| | | lfr_n: 1 |
| | | |
| | | # hybrid CTC/attention |
| | | model_conf: |
| | | ctc_weight: 0.3 |
| | | lsm_weight: 0.1 # label smoothing option |
| | | length_normalized_loss: false |
| | | |
| | | # optimization related |
| | | accum_grad: 2 |
| | | grad_clip: 5 |
| | | max_epoch: 210 |
| | | val_scheduler_criterion: |
| | | - valid |
| | | - acc |
| | | best_model_criterion: |
| | | - - valid |
| | | - acc |
| | | - max |
| | | keep_nbest_models: 10 |
| | | |
| | | optim: adam |
| | | optim_conf: |
| | | lr: 0.0025 |
| | | weight_decay: 0.000001 |
| | | scheduler: warmuplr |
| | | scheduler_conf: |
| | | warmup_steps: 40000 |
| | | |
| | | specaug: specaug |
| | | specaug_conf: |
| | | apply_time_warp: true |
| | | time_warp_window: 5 |
| | | time_warp_mode: bicubic |
| | | apply_freq_mask: true |
| | | freq_mask_width_range: |
| | | - 0 |
| | | - 27 |
| | | num_freq_mask: 2 |
| | | apply_time_mask: true |
| | | time_mask_width_ratio_range: |
| | | - 0. |
| | | - 0.05 |
| | | num_time_mask: 10 |
| | | |
| | | dataset_conf: |
| | | data_names: speech,text |
| | | data_types: sound,text |
| | | shuffle: True |
| | | shuffle_conf: |
| | | shuffle_size: 2048 |
| | | sort_size: 500 |
| | | batch_conf: |
| | | batch_type: token |
| | | batch_size: 30000 |
| | | num_workers: 8 |
| | | |
| | | log_interval: 50 |
| | | normalize: None |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | # Copyright 2014 Vassil Panayotov |
| | | # 2014 Johns Hopkins University (author: Daniel Povey) |
| | | # Apache 2.0 |
| | | |
| | | if [ "$#" -ne 2 ]; then |
| | | echo "Usage: $0 <src-dir> <dst-dir>" |
| | | echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" |
| | | exit 1 |
| | | fi |
| | | |
| | | src=$1 |
| | | dst=$2 |
| | | |
| | | # all utterances are FLAC compressed |
| | | if ! which flac >&/dev/null; then |
| | | echo "Please install 'flac' on ALL worker nodes!" |
| | | exit 1 |
| | | fi |
| | | |
| | | spk_file=$src/../SPEAKERS.TXT |
| | | |
| | | mkdir -p $dst || exit 1 |
| | | |
| | | [ ! -d $src ] && echo "$0: no such directory $src" && exit 1 |
| | | [ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1 |
| | | |
| | | |
| | | wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp |
| | | trans=$dst/text; [[ -f "$trans" ]] && rm $trans |
| | | |
| | | for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do |
| | | reader=$(basename $reader_dir) |
| | | if ! [ $reader -eq $reader ]; then # not integer. |
| | | echo "$0: unexpected subdirectory name $reader" |
| | | exit 1 |
| | | fi |
| | | |
| | | for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do |
| | | chapter=$(basename $chapter_dir) |
| | | if ! [ "$chapter" -eq "$chapter" ]; then |
| | | echo "$0: unexpected chapter-subdirectory name $chapter" |
| | | exit 1 |
| | | fi |
| | | |
| | | find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ |
| | | awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac \n", $0, dir, $0}' >>$wav_scp|| exit 1 |
| | | |
| | | chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt |
| | | [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 |
| | | cat $chapter_trans >>$trans |
| | | done |
| | | done |
| | | |
| | | echo "$0: successfully prepared data in $dst" |
| | | |
| | | exit 0 |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | # Copyright 2014 Johns Hopkins University (author: Daniel Povey) |
| | | # Apache 2.0 |
| | | |
| | | remove_archive=false |
| | | |
| | | if [ "$1" == --remove-archive ]; then |
| | | remove_archive=true |
| | | shift |
| | | fi |
| | | |
| | | if [ $# -ne 3 ]; then |
| | | echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>" |
| | | echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean" |
| | | echo "With --remove-archive it will remove the archive after successfully un-tarring it." |
| | | echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other," |
| | | echo " train-clean-100, train-clean-360, train-other-500." |
| | | exit 1 |
| | | fi |
| | | |
| | | data=$1 |
| | | url=$2 |
| | | part=$3 |
| | | |
| | | if [ ! -d "$data" ]; then |
| | | echo "$0: no such directory $data" |
| | | exit 1 |
| | | fi |
| | | |
| | | part_ok=false |
| | | list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500" |
| | | for x in $list; do |
| | | if [ "$part" == $x ]; then part_ok=true; fi |
| | | done |
| | | if ! $part_ok; then |
| | | echo "$0: expected <corpus-part> to be one of $list, but got '$part'" |
| | | exit 1 |
| | | fi |
| | | |
| | | if [ -z "$url" ]; then |
| | | echo "$0: empty URL base." |
| | | exit 1 |
| | | fi |
| | | |
| | | if [ -f $data/LibriSpeech/$part/.complete ]; then |
| | | echo "$0: data part $part was already successfully extracted, nothing to do." |
| | | exit 0 |
| | | fi |
| | | |
| | | |
| | | # sizes of the archive files in bytes. These are the sizes of some older versions. |
| | | sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128" |
| | | # sizes_new is the archive file sizes of the final release. Some of these sizes are of |
| | | # things we probably won't download. |
| | | sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606" |
| | | |
| | | if [ -f $data/$part.tar.gz ]; then |
| | | size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') |
| | | size_ok=false |
| | | for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done |
| | | if ! $size_ok; then |
| | | echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" |
| | | echo "does not equal the size of one of the archives." |
| | | rm $data/$part.tar.gz |
| | | else |
| | | echo "$data/$part.tar.gz exists and appears to be complete." |
| | | fi |
| | | fi |
| | | |
| | | if [ ! -f $data/$part.tar.gz ]; then |
| | | if ! which wget >/dev/null; then |
| | | echo "$0: wget is not installed." |
| | | exit 1 |
| | | fi |
| | | full_url=$url/$part.tar.gz |
| | | echo "$0: downloading data from $full_url. This may take some time, please be patient." |
| | | |
| | | if ! wget -P $data --no-check-certificate $full_url; then |
| | | echo "$0: error executing wget $full_url" |
| | | exit 1 |
| | | fi |
| | | fi |
| | | |
| | | if ! tar -C $data -xvzf $data/$part.tar.gz; then |
| | | echo "$0: error un-tarring archive $data/$part.tar.gz" |
| | | exit 1 |
| | | fi |
| | | |
| | | touch $data/LibriSpeech/$part/.complete |
| | | |
| | | echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" |
| | | |
| | | if $remove_archive; then |
| | | echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." |
| | | rm $data/$part.tar.gz |
| | | fi |
| New file |
| | |
| | | #!/usr/bin/env python |
| | | # Copyright (c) Facebook, Inc. and its affiliates. |
| | | # All rights reserved. |
| | | # |
| | | # This source code is licensed under the license found in |
| | | # https://github.com/pytorch/fairseq/blob/master/LICENSE |
| | | |
| | | |
| | | import argparse |
| | | import contextlib |
| | | import sys |
| | | |
| | | import sentencepiece as spm |
| | | |
| | | |
| | | def main(): |
| | | parser = argparse.ArgumentParser() |
| | | parser.add_argument("--model", required=True, |
| | | help="sentencepiece model to use for encoding") |
| | | parser.add_argument("--inputs", nargs="+", default=['-'], |
| | | help="input files to filter/encode") |
| | | parser.add_argument("--outputs", nargs="+", default=['-'], |
| | | help="path to save encoded outputs") |
| | | parser.add_argument("--output_format", choices=["piece", "id"], default="piece") |
| | | parser.add_argument("--min-len", type=int, metavar="N", |
| | | help="filter sentence pairs with fewer than N tokens") |
| | | parser.add_argument("--max-len", type=int, metavar="N", |
| | | help="filter sentence pairs with more than N tokens") |
| | | args = parser.parse_args() |
| | | |
| | | assert len(args.inputs) == len(args.outputs), \ |
| | | "number of input and output paths should match" |
| | | |
| | | sp = spm.SentencePieceProcessor() |
| | | sp.Load(args.model) |
| | | |
| | | if args.output_format == "piece": |
| | | def encode(l): |
| | | return sp.EncodeAsPieces(l) |
| | | elif args.output_format == "id": |
| | | def encode(l): |
| | | return list(map(str, sp.EncodeAsIds(l))) |
| | | else: |
| | | raise NotImplementedError |
| | | |
| | | if args.min_len is not None or args.max_len is not None: |
| | | def valid(line): |
| | | return ( |
| | | (args.min_len is None or len(line) >= args.min_len) and |
| | | (args.max_len is None or len(line) <= args.max_len) |
| | | ) |
| | | else: |
| | | def valid(lines): |
| | | return True |
| | | |
| | | with contextlib.ExitStack() as stack: |
| | | inputs = [ |
| | | stack.enter_context(open(input, "r", encoding="utf-8")) |
| | | if input != "-" else sys.stdin |
| | | for input in args.inputs |
| | | ] |
| | | outputs = [ |
| | | stack.enter_context(open(output, "w", encoding="utf-8")) |
| | | if output != "-" else sys.stdout |
| | | for output in args.outputs |
| | | ] |
| | | |
| | | stats = { |
| | | "num_empty": 0, |
| | | "num_filtered": 0, |
| | | } |
| | | |
| | | def encode_line(line): |
| | | line = line.strip() |
| | | if len(line) > 0: |
| | | line = encode(line) |
| | | if valid(line): |
| | | return line |
| | | else: |
| | | stats["num_filtered"] += 1 |
| | | else: |
| | | stats["num_empty"] += 1 |
| | | return None |
| | | |
| | | for i, lines in enumerate(zip(*inputs), start=1): |
| | | enc_lines = list(map(encode_line, lines)) |
| | | if not any(enc_line is None for enc_line in enc_lines): |
| | | for enc_line, output_h in zip(enc_lines, outputs): |
| | | print(" ".join(enc_line), file=output_h) |
| | | if i % 10000 == 0: |
| | | print("processed {} lines".format(i), file=sys.stderr) |
| | | |
| | | print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) |
| | | print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) |
| | | |
| | | |
| | | if __name__ == "__main__": |
| | | main() |
| New file |
| | |
| | | #!/usr/bin/env python3 |
| | | # Copyright (c) Facebook, Inc. and its affiliates. |
| | | # All rights reserved. |
| | | # |
| | | # This source code is licensed under the license found in the |
| | | # https://github.com/pytorch/fairseq/blob/master/LICENSE |
| | | import sys |
| | | |
| | | import sentencepiece as spm |
| | | |
| | | if __name__ == "__main__": |
| | | spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:])) |
| New file |
| | |
| | | export FUNASR_DIR=$PWD/../../.. |
| | | |
| | | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C |
| | | export PYTHONIOENCODING=UTF-8 |
| | | export PATH=$FUNASR_DIR/funasr/bin:$PATH |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | . ./path.sh || exit 1; |
| | | |
| | | # machines configuration |
| | | CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" |
| | | gpu_num=8 |
| | | count=1 |
| | | gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding |
| | | # for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob |
| | | njob=5 |
| | | train_cmd=utils/run.pl |
| | | infer_cmd=utils/run.pl |
| | | |
| | | # general configuration |
| | | feats_dir="../DATA" # feature output directory |
| | | exp_dir="." |
| | | lang=en |
| | | token_type=bpe |
| | | type=sound |
| | | scp=wav.scp |
| | | speed_perturb="0.9 1.0 1.1" |
| | | stage=0 |
| | | stop_stage=5 |
| | | |
| | | # feature configuration |
| | | feats_dim=80 |
| | | nj=64 |
| | | |
| | | # data |
| | | raw_data= |
| | | data_url=www.openslr.org/resources/12 |
| | | |
| | | # bpe model |
| | | nbpe=5000 |
| | | bpemode=unigram |
| | | |
| | | # exp tag |
| | | tag="exp1" |
| | | |
| | | . utils/parse_options.sh || exit 1; |
| | | |
| | | # Set bash to strict mode; it will exit on: |
| | | # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline' (-x 'print commands' is not enabled here). |
| | | set -e |
| | | set -u |
| | | set -o pipefail |
| | | |
| | | train_set=train_960 |
| | | valid_set=dev |
| | | test_sets="test_clean test_other dev_clean dev_other" |
| | | |
| | | asr_config=conf/train_asr_branchformer.yaml |
| | | model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}" |
| | | |
| | | inference_config=conf/decode_asr_transformer_beam10_ctc0.3.yaml |
| | | inference_asr_model=valid.acc.ave_10best.pb |
| | | |
| | | # you can set gpu num for decoding here |
| | | gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default |
| | | ngpu=$(echo $gpuid_list | awk -F "," '{print NF}') |
| | | |
| | | # Pick the number of parallel decoding jobs and the per-job GPU count: |
| | | # GPU decoding runs njob jobs for each GPU listed in gpuid_list, CPU decoding runs njob jobs. |
| | | if ${gpu_inference}; then |
| | | # $(( )) is the standard arithmetic expansion; the older $[ ] form is deprecated in bash. |
| | | inference_nj=$((ngpu * njob)) |
| | | _ngpu=1 |
| | | else |
| | | inference_nj=$njob |
| | | _ngpu=0 |
| | | fi |
| | | |
| | | |
| | | if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then |
| | | echo "stage -1: Data Download" |
| | | for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do |
| | | local/download_and_untar.sh ${raw_data} ${data_url} ${part} |
| | | done |
| | | fi |
| | | |
| | | # Stage 0: build wav.scp/text for every LibriSpeech part, then merge the dev parts into |
| | | # $valid_set and the train parts into $train_set. |
| | | if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then |
| | | echo "stage 0: Data preparation" |
| | | # Data preparation: one output directory per corpus part, with '-' replaced by '_' in its name |
| | | for x in dev-clean dev-other test-clean test-other train-clean-100 train-clean-360 train-other-500; do |
| | | local/data_prep.sh ${raw_data}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_} |
| | | done |
| | | # -p keeps the stage re-runnable: a plain mkdir fails when the directory already exists, |
| | | # and with 'set -e' active that failure would abort the whole script. |
| | | mkdir -p $feats_dir/data/$valid_set |
| | | dev_sets="dev_clean dev_other" |
| | | for file in wav.scp text; do |
| | | ( for f in $dev_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$valid_set/$file || exit 1; |
| | | done |
| | | mkdir -p $feats_dir/data/$train_set |
| | | train_sets="train_clean_100 train_clean_360 train_other_500" |
| | | for file in wav.scp text; do |
| | | ( for f in $train_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$train_set/$file || exit 1; |
| | | done |
| | | fi |
| | | |
| | | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then |
| | | echo "stage 1: Feature and CMVN Generation" |
| | | utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0 |
| | | fi |
| | | |
| | | token_list=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt |
| | | bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe} |
| | | echo "dictionary: ${token_list}" |
| | | if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then |
| | | ### Task dependent. You have to check non-linguistic symbols used in the corpus. |
| | | echo "stage 2: Dictionary and Json Data Preparation" |
| | | mkdir -p ${feats_dir}/data/lang_char/ |
| | | echo "<blank>" > ${token_list} |
| | | echo "<s>" >> ${token_list} |
| | | echo "</s>" >> ${token_list} |
| | | cut -f 2- -d" " ${feats_dir}/data/${train_set}/text > ${feats_dir}/data/lang_char/input.txt |
| | | local/spm_train.py --input=${feats_dir}/data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 |
| | | local/spm_encode.py --model=${bpemodel}.model --output_format=piece < ${feats_dir}/data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0}' >> ${token_list} |
| | | echo "<unk>" >> ${token_list} |
| | | fi |
| | | |
| | | # LM Training Stage |
| | | world_size=$gpu_num # run on one machine |
| | | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then |
| | | echo "stage 3: LM Training" |
| | | fi |
| | | |
| | | # ASR Training Stage |
| | | world_size=$gpu_num # run on one machine |
| | | if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then |
| | | echo "stage 4: ASR Training" |
| | | mkdir -p ${exp_dir}/exp/${model_dir} |
| | | mkdir -p ${exp_dir}/exp/${model_dir}/log |
| | | INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init |
| | | if [ -f $INIT_FILE ];then |
| | | rm -f $INIT_FILE |
| | | fi |
| | | init_method=file://$(readlink -f $INIT_FILE) |
| | | echo "$0: init method is $init_method" |
| | | for ((i = 0; i < $gpu_num; ++i)); do |
| | | { |
| | | rank=$i |
| | | local_rank=$i |
| | | gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) |
| | | train.py \ |
| | | --task_name asr \ |
| | | --gpu_id $gpu_id \ |
| | | --use_preprocessor true \ |
| | | --split_with_space false \ |
| | | --bpemodel ${bpemodel}.model \ |
| | | --token_type $token_type \ |
| | | --token_list $token_list \ |
| | | --dataset_type large \ |
| | | --data_dir ${feats_dir}/data \ |
| | | --train_set ${train_set} \ |
| | | --valid_set ${valid_set} \ |
| | | --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \ |
| | | --speed_perturb ${speed_perturb} \ |
| | | --resume true \ |
| | | --output_dir ${exp_dir}/exp/${model_dir} \ |
| | | --config $asr_config \ |
| | | --ngpu $gpu_num \ |
| | | --num_worker_count $count \ |
| | | --multiprocessing_distributed true \ |
| | | --dist_init_method $init_method \ |
| | | --dist_world_size $world_size \ |
| | | --dist_rank $rank \ |
| | | --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1 |
| | | } & |
| | | done |
| | | wait |
| | | fi |
| | | |
| | | # Testing Stage |
| | | if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then |
| | | echo "stage 5: Inference" |
| | | for dset in ${test_sets}; do |
| | | asr_exp=${exp_dir}/exp/${model_dir} |
| | | inference_tag="$(basename "${inference_config}" .yaml)" |
| | | _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}" |
| | | _logdir="${_dir}/logdir" |
| | | if [ -d ${_dir} ]; then |
| | | echo "${_dir} is already exists. if you want to decode again, please delete this dir first." |
| | | exit 0 |
| | | fi |
| | | mkdir -p "${_logdir}" |
| | | _data="${feats_dir}/data/${dset}" |
| | | key_file=${_data}/${scp} |
| | | num_scp_file="$(<${key_file} wc -l)" |
| | | _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file") |
| | | split_scps= |
| | | for n in $(seq "${_nj}"); do |
| | | split_scps+=" ${_logdir}/keys.${n}.scp" |
| | | done |
| | | # shellcheck disable=SC2086 |
| | | utils/split_scp.pl "${key_file}" ${split_scps} |
| | | _opts= |
| | | if [ -n "${inference_config}" ]; then |
| | | _opts+="--config ${inference_config} " |
| | | fi |
| | | ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \ |
| | | python -m funasr.bin.asr_inference_launch \ |
| | | --batch_size 1 \ |
| | | --ngpu "${_ngpu}" \ |
| | | --njob ${njob} \ |
| | | --gpuid_list ${gpuid_list} \ |
| | | --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \ |
| | | --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \ |
| | | --key_file "${_logdir}"/keys.JOB.scp \ |
| | | --asr_train_config "${asr_exp}"/config.yaml \ |
| | | --asr_model_file "${asr_exp}"/"${inference_asr_model}" \ |
| | | --output_dir "${_logdir}"/output.JOB \ |
| | | --mode asr \ |
| | | ${_opts} |
| | | |
| | | for f in token token_int score text; do |
| | | if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then |
| | | for i in $(seq "${_nj}"); do |
| | | cat "${_logdir}/output.${i}/1best_recog/${f}" |
| | | done | sort -k1 >"${_dir}/${f}" |
| | | fi |
| | | done |
| | | python utils/compute_wer.py ${_data}/text ${_dir}/text ${_dir}/text.cer |
| | | tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt |
| | | cat ${_dir}/text.cer.txt |
| | | done |
| | | fi |
| New file |
| | |
| | | ../../aishell/transformer/utils |
| New file |
| | |
| | | beam_size: 10 |
| | | penalty: 0.0 |
| | | maxlenratio: 0.0 |
| | | minlenratio: 0.0 |
| | | ctc_weight: 0.3 |
| | | lm_weight: 0.0 |
| New file |
| | |
| | | # network architecture |
| | | # encoder related |
| | | encoder: e_branchformer |
| | | encoder_conf: |
| | | output_size: 512 |
| | | attention_heads: 8 |
| | | attention_layer_type: rel_selfattn |
| | | pos_enc_layer_type: rel_pos |
| | | rel_pos_type: latest |
| | | cgmlp_linear_units: 3072 |
| | | cgmlp_conv_kernel: 31 |
| | | use_linear_after_conv: false |
| | | gate_activation: identity |
| | | num_blocks: 17 |
| | | dropout_rate: 0.1 |
| | | positional_dropout_rate: 0.1 |
| | | attention_dropout_rate: 0.1 |
| | | input_layer: conv2d |
| | | layer_drop_rate: 0.1 |
| | | linear_units: 1024 |
| | | positionwise_layer_type: linear |
| | | macaron_ffn: true |
| | | use_ffn: true |
| | | merge_conv_kernel: 31 |
| | | |
| | | # decoder related |
| | | decoder: transformer |
| | | decoder_conf: |
| | | attention_heads: 8 |
| | | linear_units: 2048 |
| | | num_blocks: 6 |
| | | dropout_rate: 0.1 |
| | | positional_dropout_rate: 0.1 |
| | | self_attention_dropout_rate: 0.1 |
| | | src_attention_dropout_rate: 0.1 |
| | | layer_drop_rate: 0.2 |
| | | |
| | | # frontend related |
| | | frontend: wav_frontend |
| | | frontend_conf: |
| | | fs: 16000 |
| | | window: hamming |
| | | n_mels: 80 |
| | | frame_length: 25 |
| | | frame_shift: 10 |
| | | lfr_m: 1 |
| | | lfr_n: 1 |
| | | |
| | | # hybrid CTC/attention |
| | | model_conf: |
| | | ctc_weight: 0.3 |
| | | lsm_weight: 0.1 # label smoothing option |
| | | length_normalized_loss: false |
| | | |
| | | # optimization related |
| | | accum_grad: 2 |
| | | grad_clip: 5 |
| | | max_epoch: 240 |
| | | val_scheduler_criterion: |
| | | - valid |
| | | - acc |
| | | best_model_criterion: |
| | | - - valid |
| | | - acc |
| | | - max |
| | | keep_nbest_models: 10 |
| | | |
| | | optim: adam |
| | | optim_conf: |
| | | lr: 0.002 |
| | | weight_decay: 0.000001 |
| | | scheduler: warmuplr |
| | | scheduler_conf: |
| | | warmup_steps: 40000 |
| | | |
| | | specaug: specaug |
| | | specaug_conf: |
| | | apply_time_warp: true |
| | | time_warp_window: 5 |
| | | time_warp_mode: bicubic |
| | | apply_freq_mask: true |
| | | freq_mask_width_range: |
| | | - 0 |
| | | - 27 |
| | | num_freq_mask: 2 |
| | | apply_time_mask: true |
| | | time_mask_width_ratio_range: |
| | | - 0. |
| | | - 0.05 |
| | | num_time_mask: 10 |
| | | |
| | | dataset_conf: |
| | | data_names: speech,text |
| | | data_types: sound,text |
| | | shuffle: True |
| | | shuffle_conf: |
| | | shuffle_size: 2048 |
| | | sort_size: 500 |
| | | batch_conf: |
| | | batch_type: token |
| | | batch_size: 30000 |
| | | num_workers: 8 |
| | | |
| | | log_interval: 50 |
| | | normalize: None |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | # Copyright 2014 Vassil Panayotov |
| | | # 2014 Johns Hopkins University (author: Daniel Povey) |
| | | # Apache 2.0 |
| | | |
| | | if [ "$#" -ne 2 ]; then |
| | | echo "Usage: $0 <src-dir> <dst-dir>" |
| | | echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean" |
| | | exit 1 |
| | | fi |
| | | |
| | | src=$1 |
| | | dst=$2 |
| | | |
| | | # all utterances are FLAC compressed |
| | | if ! which flac >&/dev/null; then |
| | | echo "Please install 'flac' on ALL worker nodes!" |
| | | exit 1 |
| | | fi |
| | | |
| | | spk_file=$src/../SPEAKERS.TXT |
| | | |
| | | mkdir -p $dst || exit 1 |
| | | |
| | | [ ! -d $src ] && echo "$0: no such directory $src" && exit 1 |
| | | [ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1 |
| | | |
| | | |
| | | wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp |
| | | trans=$dst/text; [[ -f "$trans" ]] && rm $trans |
| | | |
| | | for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do |
| | | reader=$(basename $reader_dir) |
| | | if ! [ $reader -eq $reader ]; then # not integer. |
| | | echo "$0: unexpected subdirectory name $reader" |
| | | exit 1 |
| | | fi |
| | | |
| | | for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do |
| | | chapter=$(basename $chapter_dir) |
| | | if ! [ "$chapter" -eq "$chapter" ]; then |
| | | echo "$0: unexpected chapter-subdirectory name $chapter" |
| | | exit 1 |
| | | fi |
| | | |
| | | find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ |
| | | awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac \n", $0, dir, $0}' >>$wav_scp|| exit 1 |
| | | |
| | | chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt |
| | | [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 |
| | | cat $chapter_trans >>$trans |
| | | done |
| | | done |
| | | |
| | | echo "$0: successfully prepared data in $dst" |
| | | |
| | | exit 0 |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | # Copyright 2014 Johns Hopkins University (author: Daniel Povey) |
| | | # Apache 2.0 |
| | | |
| | | remove_archive=false |
| | | |
| | | if [ "$1" == --remove-archive ]; then |
| | | remove_archive=true |
| | | shift |
| | | fi |
| | | |
| | | if [ $# -ne 3 ]; then |
| | | echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>" |
| | | echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean" |
| | | echo "With --remove-archive it will remove the archive after successfully un-tarring it." |
| | | echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other," |
| | | echo " train-clean-100, train-clean-360, train-other-500." |
| | | exit 1 |
| | | fi |
| | | |
| | | data=$1 |
| | | url=$2 |
| | | part=$3 |
| | | |
| | | if [ ! -d "$data" ]; then |
| | | echo "$0: no such directory $data" |
| | | exit 1 |
| | | fi |
| | | |
| | | part_ok=false |
| | | list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500" |
| | | for x in $list; do |
| | | if [ "$part" == $x ]; then part_ok=true; fi |
| | | done |
| | | if ! $part_ok; then |
| | | echo "$0: expected <corpus-part> to be one of $list, but got '$part'" |
| | | exit 1 |
| | | fi |
| | | |
| | | if [ -z "$url" ]; then |
| | | echo "$0: empty URL base." |
| | | exit 1 |
| | | fi |
| | | |
| | | if [ -f $data/LibriSpeech/$part/.complete ]; then |
| | | echo "$0: data part $part was already successfully extracted, nothing to do." |
| | | exit 0 |
| | | fi |
| | | |
| | | |
| | | # sizes of the archive files in bytes. These are the sizes of some older versions. |
| | | sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128" |
| | | # sizes_new is the archive file sizes of the final release. Some of these sizes are of |
| | | # things we probably won't download. |
| | | sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606" |
| | | |
| | | if [ -f $data/$part.tar.gz ]; then |
| | | size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') |
| | | size_ok=false |
| | | for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done |
| | | if ! $size_ok; then |
| | | echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" |
| | | echo "does not equal the size of one of the archives." |
| | | rm $data/$part.tar.gz |
| | | else |
| | | echo "$data/$part.tar.gz exists and appears to be complete." |
| | | fi |
| | | fi |
| | | |
| | | if [ ! -f $data/$part.tar.gz ]; then |
| | | if ! which wget >/dev/null; then |
| | | echo "$0: wget is not installed." |
| | | exit 1 |
| | | fi |
| | | full_url=$url/$part.tar.gz |
| | | echo "$0: downloading data from $full_url. This may take some time, please be patient." |
| | | |
| | | if ! wget -P $data --no-check-certificate $full_url; then |
| | | echo "$0: error executing wget $full_url" |
| | | exit 1 |
| | | fi |
| | | fi |
| | | |
| | | if ! tar -C $data -xvzf $data/$part.tar.gz; then |
| | | echo "$0: error un-tarring archive $data/$part.tar.gz" |
| | | exit 1 |
| | | fi |
| | | |
| | | touch $data/LibriSpeech/$part/.complete |
| | | |
| | | echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" |
| | | |
| | | if $remove_archive; then |
| | | echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied." |
| | | rm $data/$part.tar.gz |
| | | fi |
| New file |
| | |
| | | #!/usr/bin/env python |
| | | # Copyright (c) Facebook, Inc. and its affiliates. |
| | | # All rights reserved. |
| | | # |
| | | # This source code is licensed under the license found in |
| | | # https://github.com/pytorch/fairseq/blob/master/LICENSE |
| | | |
| | | |
| | | import argparse |
| | | import contextlib |
| | | import sys |
| | | |
| | | import sentencepiece as spm |
| | | |
| | | |
def main():
    """Encode parallel text files line by line with a SentencePiece model.

    Command-line options:
      --model              SentencePiece model file (required).
      --inputs             one or more input paths; "-" selects stdin.
      --outputs            one output path per input; "-" selects stdout.
      --output_format      "piece" writes subword pieces, "id" writes ids.
      --min-len/--max-len  optional bounds on the encoded token count.

    The inputs are read in lockstep. A group of lines is written only when
    every line in it is non-empty and passes the length filter; otherwise the
    whole group is dropped. Progress and the empty/filtered counters are
    printed to stderr.

    Exits with status 2 (via ``parser.error``) when the number of input and
    output paths differs.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True,
                        help="sentencepiece model to use for encoding")
    parser.add_argument("--inputs", nargs="+", default=['-'],
                        help="input files to filter/encode")
    parser.add_argument("--outputs", nargs="+", default=['-'],
                        help="path to save encoded outputs")
    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
    parser.add_argument("--min-len", type=int, metavar="N",
                        help="filter sentence pairs with fewer than N tokens")
    parser.add_argument("--max-len", type=int, metavar="N",
                        help="filter sentence pairs with more than N tokens")
    args = parser.parse_args()

    # An explicit check instead of ``assert`` so it still runs under
    # ``python -O`` and reports a usage error rather than a traceback.
    if len(args.inputs) != len(args.outputs):
        parser.error("number of input and output paths should match")

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    if args.output_format == "piece":
        def encode(text):
            return sp.EncodeAsPieces(text)
    elif args.output_format == "id":
        def encode(text):
            return list(map(str, sp.EncodeAsIds(text)))
    else:
        raise NotImplementedError

    if args.min_len is not None or args.max_len is not None:
        def valid(tokens):
            return (
                (args.min_len is None or len(tokens) >= args.min_len) and
                (args.max_len is None or len(tokens) <= args.max_len)
            )
    else:
        def valid(tokens):
            return True

    with contextlib.ExitStack() as stack:
        # ``path`` rather than ``input``/``output`` to avoid shadowing builtins.
        inputs = [
            stack.enter_context(open(path, "r", encoding="utf-8"))
            if path != "-" else sys.stdin
            for path in args.inputs
        ]
        outputs = [
            stack.enter_context(open(path, "w", encoding="utf-8"))
            if path != "-" else sys.stdout
            for path in args.outputs
        ]

        stats = {
            "num_empty": 0,
            "num_filtered": 0,
        }

        def encode_line(line):
            """Return the encoded tokens, or None if the line is empty/filtered."""
            line = line.strip()
            if len(line) > 0:
                tokens = encode(line)
                if valid(tokens):
                    return tokens
                stats["num_filtered"] += 1
            else:
                stats["num_empty"] += 1
            return None

        # zip() stops at the shortest input, so surplus trailing lines in a
        # longer file are ignored.
        for i, lines in enumerate(zip(*inputs), start=1):
            enc_lines = list(map(encode_line, lines))
            if not any(enc_line is None for enc_line in enc_lines):
                for enc_line, output_h in zip(enc_lines, outputs):
                    print(" ".join(enc_line), file=output_h)
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

        print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
        print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)
| | | |
| | | |
| | | if __name__ == "__main__": |
| | | main() |
| New file |
| | |
| | | #!/usr/bin/env python3 |
| | | # Copyright (c) Facebook, Inc. and its affiliates. |
| | | # All rights reserved. |
| | | # |
| | | # This source code is licensed under the license found in the |
| | | # https://github.com/pytorch/fairseq/blob/master/LICENSE |
| | | import sys |
| | | |
| | | import sentencepiece as spm |
| | | |
if __name__ == "__main__":
    # Hand every command-line argument to SentencePiece as a single
    # space-joined flag string.
    trainer_flags = " ".join(sys.argv[1:])
    spm.SentencePieceTrainer.Train(trainer_flags)
| New file |
| | |
| | | export FUNASR_DIR=$PWD/../../.. |
| | | |
| | | # NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C |
| | | export PYTHONIOENCODING=UTF-8 |
| | | export PATH=$FUNASR_DIR/funasr/bin:$PATH |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | . ./path.sh || exit 1; |
| | | |
| | | # machines configuration |
| | | CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" |
| | | gpu_num=8 |
| | | count=1 |
| | | gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding |
| | | # for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob |
| | | njob=5 |
| | | train_cmd=utils/run.pl |
| | | infer_cmd=utils/run.pl |
| | | |
| | | # general configuration |
| | | feats_dir="../DATA" # feature output directory |
| | | exp_dir="." |
| | | lang=en |
| | | token_type=bpe |
| | | type=sound |
| | | scp=wav.scp |
| | | speed_perturb="0.9 1.0 1.1" |
| | | stage=0 |
| | | stop_stage=5 |
| | | |
| | | # feature configuration |
| | | feats_dim=80 |
| | | nj=64 |
| | | |
| | | # data |
| | | raw_data= |
| | | data_url=www.openslr.org/resources/12 |
| | | |
| | | # bpe model |
| | | nbpe=5000 |
| | | bpemode=unigram |
| | | |
| | | # exp tag |
| | | tag="exp1" |
| | | |
| | | . utils/parse_options.sh || exit 1; |
| | | |
| | | # Set bash to 'debug' mode, it will exit on : |
| | | # -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', |
| | | set -e |
| | | set -u |
| | | set -o pipefail |
| | | |
| | | train_set=train_960 |
| | | valid_set=dev |
| | | test_sets="test_clean test_other dev_clean dev_other" |
| | | |
| | | asr_config=conf/train_asr_e_branchformer.yaml |
| | | model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}" |
| | | |
| | | inference_config=conf/decode_asr_transformer_beam10_ctc0.3.yaml |
| | | inference_asr_model=valid.acc.ave_10best.pb |
| | | |
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
# Number of comma-separated entries in gpuid_list.
ngpu=$(echo "$gpuid_list" | awk -F "," '{print NF}')

if ${gpu_inference}; then
    # GPU decoding: njob jobs for every listed GPU.
    # $((...)) replaces the deprecated $[...] arithmetic form.
    inference_nj=$((ngpu * njob))
    _ngpu=1
else
    # CPU decoding: njob jobs.
    inference_nj=$njob
    _ngpu=0
fi
| | | |
| | | |
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
    for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
        # Quoted so an empty or space-containing raw_data stays a single
        # argument instead of shifting the positional parameters.
        local/download_and_untar.sh "${raw_data}" "${data_url}" "${part}"
    done
fi
| | | |
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: Data preparation"
    # Data preparation: one output directory per LibriSpeech part, with '-'
    # in the part name replaced by '_'.
    for x in dev-clean dev-other test-clean test-other train-clean-100 train-clean-360 train-other-500; do
        local/data_prep.sh "${raw_data}/LibriSpeech/${x}" "${feats_dir}/data/${x//-/_}"
    done

    # Merge the dev subsets into the validation set.
    # -p: re-running this stage must not abort (set -e) when the directory
    # is already there.
    mkdir -p "$feats_dir/data/$valid_set"
    dev_sets="dev_clean dev_other"
    for file in wav.scp text; do
        ( for f in $dev_sets; do cat "$feats_dir/data/$f/$file"; done ) | sort -k1 > "$feats_dir/data/$valid_set/$file" || exit 1;
    done

    # Merge the training subsets into the full training set.
    mkdir -p "$feats_dir/data/$train_set"
    train_sets="train_clean_100 train_clean_360 train_other_500"
    for file in wav.scp text; do
        ( for f in $train_sets; do cat "$feats_dir/data/$f/$file"; done ) | sort -k1 > "$feats_dir/data/$train_set/$file" || exit 1;
    done
fi
| | | |
| | | if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then |
| | | echo "stage 1: Feature and CMVN Generation" |
| | | utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0 |
| | | fi |
| | | |
token_list=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${token_list}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
    echo "stage 2: Dictionary and Json Data Preparation"
    mkdir -p "${feats_dir}/data/lang_char/"
    # Special symbols come first; the learned pieces are appended below and
    # <unk> is written last.
    echo "<blank>" > "${token_list}"
    echo "<s>" >> "${token_list}"
    echo "</s>" >> "${token_list}"
    # Keep everything after the first space-separated field of each line
    # (presumably the utterance id -- confirm against data_prep.sh).
    cut -f 2- -d" " "${feats_dir}/data/${train_set}/text" > "${feats_dir}/data/lang_char/input.txt"
    local/spm_train.py --input="${feats_dir}/data/lang_char/input.txt" --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix="${bpemodel}" --input_sentence_size=100000000
    # One piece per line, de-duplicated (the former trailing
    # "awk '{print \$0}'" stage was a no-op and has been dropped).
    local/spm_encode.py --model="${bpemodel}.model" --output_format=piece < "${feats_dir}/data/lang_char/input.txt" | tr ' ' '\n' | sort | uniq >> "${token_list}"
    echo "<unk>" >> "${token_list}"
fi
| | | |
| | | # LM Training Stage |
| | | world_size=$gpu_num # run on one machine |
| | | if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then |
| | | echo "stage 3: LM Training" |
| | | fi |
| | | |
# ASR Training Stage
world_size=$gpu_num  # run on one machine
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "stage 4: ASR Training"
    mkdir -p "${exp_dir}/exp/${model_dir}"
    mkdir -p "${exp_dir}/exp/${model_dir}/log"
    # File used for the file:// distributed init method; a copy left over
    # from an earlier run is removed first.
    INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
    if [ -f "$INIT_FILE" ]; then
        rm -f "$INIT_FILE"
    fi
    init_method=file://$(readlink -f "$INIT_FILE")
    echo "$0: init method is $init_method"
    # Launch one background training process per GPU; rank i uses the
    # (i+1)-th entry of CUDA_VISIBLE_DEVICES.
    for ((i = 0; i < $gpu_num; ++i)); do
        {
            rank=$i
            local_rank=$i
            # $((...)) replaces the deprecated $[...] arithmetic form.
            gpu_id=$(echo "$CUDA_VISIBLE_DEVICES" | cut -d',' -f$((i + 1)))
            # ${speed_perturb} is intentionally unquoted: it holds several
            # space-separated values that must become separate arguments.
            train.py \
                --task_name asr \
                --gpu_id $gpu_id \
                --use_preprocessor true \
                --split_with_space false \
                --bpemodel "${bpemodel}.model" \
                --token_type $token_type \
                --token_list "$token_list" \
                --dataset_type large \
                --data_dir "${feats_dir}/data" \
                --train_set ${train_set} \
                --valid_set ${valid_set} \
                --cmvn_file "${feats_dir}/data/${train_set}/cmvn/am.mvn" \
                --speed_perturb ${speed_perturb} \
                --resume true \
                --output_dir "${exp_dir}/exp/${model_dir}" \
                --config "$asr_config" \
                --ngpu $gpu_num \
                --num_worker_count $count \
                --multiprocessing_distributed true \
                --dist_init_method "$init_method" \
                --dist_world_size $world_size \
                --dist_rank $rank \
                --local_rank $local_rank 1> "${exp_dir}/exp/${model_dir}/log/train.log.$i" 2>&1
        } &
    done
    # NOTE: a bare `wait` returns 0 even when a background job failed, so a
    # crashed training process does not stop the script here; check the
    # per-rank train.log.* files.
    wait
fi
| | | |
# Testing Stage
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    echo "stage 5: Inference"
    for dset in ${test_sets}; do
        asr_exp=${exp_dir}/exp/${model_dir}
        inference_tag="$(basename "${inference_config}" .yaml)"
        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
        _logdir="${_dir}/logdir"
        # NOTE: this leaves the whole script, so the remaining test sets are
        # not decoded either until the existing directory is removed.
        if [ -d "${_dir}" ]; then
            echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
            exit 0
        fi
        mkdir -p "${_logdir}"
        _data="${feats_dir}/data/${dset}"
        key_file=${_data}/${scp}

        # Never start more jobs than there are lines in the key file.
        num_scp_file="$(wc -l < "${key_file}")"
        if [ $inference_nj -le $num_scp_file ]; then
            _nj=$inference_nj
        else
            _nj=$num_scp_file
        fi

        # Split the key file into one shard per job.
        split_scps=
        for n in $(seq "${_nj}"); do
            split_scps+=" ${_logdir}/keys.${n}.scp"
        done
        # shellcheck disable=SC2086
        utils/split_scp.pl "${key_file}" ${split_scps}

        _opts=
        if [ -n "${inference_config}" ]; then
            _opts+="--config ${inference_config} "
        fi
        # ${_opts} is intentionally unquoted so it expands to separate words.
        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
            python -m funasr.bin.asr_inference_launch \
                --batch_size 1 \
                --ngpu "${_ngpu}" \
                --njob ${njob} \
                --gpuid_list ${gpuid_list} \
                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
                --cmvn_file "${feats_dir}/data/${train_set}/cmvn/am.mvn" \
                --key_file "${_logdir}"/keys.JOB.scp \
                --asr_train_config "${asr_exp}"/config.yaml \
                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
                --output_dir "${_logdir}"/output.JOB \
                --mode asr \
                ${_opts}

        # Concatenate the per-job outputs into one sorted file per kind.
        for f in token token_int score text; do
            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
                for i in $(seq "${_nj}"); do
                    cat "${_logdir}/output.${i}/1best_recog/${f}"
                done | sort -k1 >"${_dir}/${f}"
            fi
        done

        # Score against the reference text and show the last three lines of the report.
        python utils/compute_wer.py "${_data}/text" "${_dir}/text" "${_dir}/text.cer"
        tail -n 3 "${_dir}/text.cer" > "${_dir}/text.cer.txt"
        cat "${_dir}/text.cer.txt"
    done
fi
| New file |
| | |
| | | ../../aishell/transformer/utils |
| | |
| | | default=None, |
| | | help="The number of input dimension of the feature", |
| | | ) |
| | | task_parser.add_argument( |
| | | "--cmvn_file", |
| | | type=str_or_none, |
| | | default=None, |
| | | help="The path of cmvn file.", |
| | | ) |
| | | |
| | | elif args.task_name == "lm": |
| | | from funasr.build_utils.build_lm_model import class_choices_list |
| | |
| | | from funasr.models.encoder.data2vec_encoder import Data2VecEncoder |
| | | from funasr.models.frontend.default import DefaultFrontend |
| | | from funasr.models.frontend.windowing import SlidingWindow |
| | | from funasr.models.frontend.wav_frontend import WavFrontend |
| | | from funasr.models.specaug.specaug import SpecAug |
| | | from funasr.torch_utils.initialize import initialize |
| | | from funasr.train.class_choices import ClassChoices |
| | | |
| | | frontend_choices = ClassChoices( |
| | | name="frontend", |
| | | classes=dict(default=DefaultFrontend, sliding_window=SlidingWindow), |
| | | classes=dict( |
| | | default=DefaultFrontend, |
| | | sliding_window=SlidingWindow, |
| | | wav_frontend=WavFrontend, |
| | | ), |
| | | default="default", |
| | | ) |
| | | specaug_choices = ClassChoices( |
| | |
| | | import torch |
| | | |
| | | from funasr.layers.abs_normalize import AbsNormalize |
| | | from funasr.models.base_model import FunASRModel |
| | | from funasr.models.encoder.abs_encoder import AbsEncoder |
| | | from funasr.models.frontend.abs_frontend import AbsFrontend |
| | | from funasr.models.preencoder.abs_preencoder import AbsPreEncoder |
| | | from funasr.models.specaug.abs_specaug import AbsSpecAug |
| | | from funasr.torch_utils.device_funcs import force_gatherable |
| | | from funasr.models.base_model import FunASRModel |
| | | |
| | | if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): |
| | | from torch.cuda.amp import autocast |
| | |
| | | frontend: Optional[AbsFrontend], |
| | | specaug: Optional[AbsSpecAug], |
| | | normalize: Optional[AbsNormalize], |
| | | preencoder: Optional[AbsPreEncoder], |
| | | encoder: AbsEncoder, |
| | | preencoder: Optional[AbsPreEncoder] = None, |
| | | ): |
| | | |
| | | super().__init__() |