| | |
| | | encoder: chunk_conformer |
| | | encoder_conf: |
| | | main_conf: |
| | | pos_wise_act_type: swish |
| | | pos_enc_dropout_rate: 0.3 |
| | | conv_mod_act_type: swish |
| | | activation_type: swish |
| | | positional_dropout_rate: 0.5 |
| | | time_reduction_factor: 2 |
| | | unified_model_training: true |
| | | default_chunk_size: 16 |
| | | jitter_range: 4 |
| | | left_chunk_size: 1 |
| | | input_conf: |
| | | block_type: conv2d |
| | | conv_size: 512 |
| | | embed_vgg_like: false |
| | | subsampling_factor: 4 |
| | | num_frame: 1 |
| | | body_conf: |
| | | - block_type: conformer |
| | | linear_size: 2048 |
| | | hidden_size: 512 |
| | | heads: 8 |
| | | dropout_rate: 0.3 |
| | | pos_wise_dropout_rate: 0.3 |
| | | att_dropout_rate: 0.3 |
| | | conv_mod_kernel_size: 15 |
| | | linear_units: 2048 |
| | | output_size: 512 |
| | | attention_heads: 8 |
| | | dropout_rate: 0.5 |
| | | positional_dropout_rate: 0.5 |
| | | attention_dropout_rate: 0.5 |
| | | cnn_module_kernel: 15 |
| | | num_blocks: 12 |
| | | |
| | | # decoder related |
| | | decoder: rnn |
| | | decoder_conf: |
| | | rnnt_decoder: rnnt |
| | | rnnt_decoder_conf: |
| | | embed_size: 512 |
| | | hidden_size: 512 |
| | | embed_dropout_rate: 0.2 |
| | | dropout_rate: 0.1 |
| | | |
| | | embed_dropout_rate: 0.5 |
| | | dropout_rate: 0.5 |
| | | joint_network_conf: |
| | | joint_space_size: 512 |
| | | |
| | | # frontend related |
| | | frontend: wav_frontend |
| | | frontend_conf: |
| | | fs: 16000 |
| | | window: hamming |
| | | n_mels: 80 |
| | | frame_length: 25 |
| | | frame_shift: 10 |
| | | lfr_m: 1 |
| | | lfr_n: 1 |
| | | |
| | | |
| | | # Auxiliary CTC |
| | | model: rnnt_unified |
| | | model_conf: |
| | | auxiliary_ctc_weight: 0.0 |
| | | |
| | | # minibatch related |
| | | use_amp: true |
| | | batch_type: numel |
| | | batch_bins: 1600000 |
| | | num_workers: 16 |
| | | |
| | | # optimization related |
| | | accum_grad: 1 |
| | | grad_clip: 5 |
| | | max_epoch: 80 |
| | | max_epoch: 120 |
| | | val_scheduler_criterion: |
| | | - valid |
| | | - loss |
| | |
| | | - - valid |
| | | - cer_transducer_chunk |
| | | - min |
| | | keep_nbest_models: 5 |
| | | keep_nbest_models: 10 |
| | | |
| | | optim: adam |
| | | optim_conf: |
| | | lr: 0.0003 |
| | | lr: 0.001 |
| | | scheduler: warmuplr |
| | | scheduler_conf: |
| | | warmup_steps: 25000 |
| | | |
| | | normalize: None |
| | | |
| | | specaug: specaug |
| | | specaug_conf: |
| | |
| | | apply_freq_mask: true |
| | | freq_mask_width_range: |
| | | - 0 |
| | | - 30 |
| | | - 40 |
| | | num_freq_mask: 2 |
| | | apply_time_mask: true |
| | | time_mask_width_range: |
| | | - 0 |
| | | - 40 |
| | | num_time_mask: 2 |
| | | - 50 |
| | | num_time_mask: 5 |
| | | |
| | | dataset_conf: |
| | | data_names: speech,text |
| | | data_types: sound,text |
| | | shuffle: True |
| | | shuffle_conf: |
| | | shuffle_size: 2048 |
| | | sort_size: 500 |
| | | batch_conf: |
| | | batch_type: token |
| | | batch_size: 16000 |
| | | num_workers: 8 |
| | | |
| | | log_interval: 50 |