From 5635bfec22948447387613b6c9d5a0c5dbbd5ac4 Mon Sep 17 00:00:00 2001 From: 嘉渊 <wangjiaming.wjm@alibaba-inc.com> Date: 星期四, 25 五月 2023 11:20:47 +0800 Subject: [PATCH] update repo --- egs/wenetspeech/conformer/conf/train_asr_conformer.yaml | 104 ++++++++++++++++++++++++++++++++++ egs/wenetspeech/conformer/run.sh | 13 ++++ egs/wenetspeech/conformer/path.sh | 5 + egs/wenetspeech/conformer/utils | 1 egs/wenetspeech/conformer/conf/decode_asr_transformer_5beam.yaml | 6 ++ 5 files changed, 129 insertions(+), 0 deletions(-) diff --git a/egs/wenetspeech/conformer/conf/decode_asr_transformer_5beam.yaml b/egs/wenetspeech/conformer/conf/decode_asr_transformer_5beam.yaml new file mode 100644 index 0000000..e35e820 --- /dev/null +++ b/egs/wenetspeech/conformer/conf/decode_asr_transformer_5beam.yaml @@ -0,0 +1,6 @@ +beam_size: 5 +penalty: 0.0 +maxlenratio: 0.0 +minlenratio: 0.0 +ctc_weight: 0.5 +lm_weight: 0.7 diff --git a/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml b/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml new file mode 100644 index 0000000..9842fa4 --- /dev/null +++ b/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml @@ -0,0 +1,104 @@ +# network architecture +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder architecture type + normalize_before: true + rel_pos_type: latest + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + activation_type: swish + macaron_style: true + use_cnn_module: true + cnn_module_kernel: 15 + +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + 
src_attention_dropout_rate: 0.0 + +# CTC related +ctc_conf: + ignore_nan_grad: true + +# frontend related +frontend: wav_frontend +frontend_conf: + fs: 16000 + window: hamming + n_mels: 80 + frame_length: 25 + frame_shift: 10 + lfr_m: 1 + lfr_n: 1 + +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + +# optimization related +accum_grad: 4 +grad_clip: 5 +patience: none +max_epoch: 30 +val_scheduler_criterion: + - valid + - acc +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 10 + +optim: adam +optim_conf: + lr: 0.0015 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 30000 + +specaug: specaug +specaug_conf: + apply_time_warp: true + time_warp_window: 5 + time_warp_mode: bicubic + apply_freq_mask: true + freq_mask_width_range: + - 0 + - 30 + num_freq_mask: 2 + apply_time_mask: true + time_mask_width_range: + - 0 + - 40 + num_time_mask: 2 + +dataset_conf: + data_names: speech,text + data_types: sound,text + shuffle: True + shuffle_conf: + shuffle_size: 2048 + sort_size: 500 + batch_conf: + batch_type: token + batch_size: 32000 + num_workers: 8 + +log_interval: 50 +normalize: None diff --git a/egs/wenetspeech/conformer/path.sh b/egs/wenetspeech/conformer/path.sh new file mode 100755 index 0000000..7972642 --- /dev/null +++ b/egs/wenetspeech/conformer/path.sh @@ -0,0 +1,5 @@ +export FUNASR_DIR=$PWD/../../.. + +# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PATH=$FUNASR_DIR/funasr/bin:$PATH diff --git a/egs/wenetspeech/conformer/run.sh b/egs/wenetspeech/conformer/run.sh new file mode 100644 index 0000000..4e9b36e --- /dev/null +++ b/egs/wenetspeech/conformer/run.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +. 
./path.sh || exit 1; + +# machines configuration +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" +gpu_num=8 +count=1 +gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding +# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob +njob=5 +train_cmd=utils/run.pl +infer_cmd=utils/run.pl \ No newline at end of file diff --git a/egs/wenetspeech/conformer/utils b/egs/wenetspeech/conformer/utils new file mode 120000 index 0000000..fe070dd --- /dev/null +++ b/egs/wenetspeech/conformer/utils @@ -0,0 +1 @@ +../../aishell/transformer/utils \ No newline at end of file -- Gitblit v1.9.1