| | |
| | | self_attention_dropout_rate: 0.0 |
| | | src_attention_dropout_rate: 0.0 |
| | | |
| | | # frontend related |
| | | frontend: wav_frontend |
| | | frontend_conf: |
| | | fs: 16000 |
| | | window: hamming |
| | | n_mels: 80 |
| | | frame_length: 25 |
| | | frame_shift: 10 |
| | | lfr_m: 1 |
| | | lfr_n: 1 |
| | | |
| | | # hybrid CTC/attention |
| | | model: paraformer_bert |
| | | model_conf: |
| | |
| | | lsm_weight: 0.1 # label smoothing option |
| | | length_normalized_loss: false |
| | | predictor_weight: 1.0 |
| | | glat_context_p: 0.4 |
| | | sampling_ratio: 0.4 |
| | | embeds_id: 3 |
| | | embed_dims: 768 |
| | | embeds_loss_weight: 2.0 |
| | |
| | | accum_grad: 2 |
| | | grad_clip: 5 |
| | | patience: none |
| | | max_epoch: 50 |
| | | max_epoch: 150 |
| | | val_scheduler_criterion: |
| | | - valid |
| | | - acc |
| | |
| | | - 40 |
| | | num_time_mask: 2 |
| | | |
| | | predictor: cif_predictor_sanm |
| | | predictor: cif_predictor |
| | | predictor_conf: |
| | | idim: 320 |
| | | threshold: 1.0 |
| | | l_order: 1 |
| | | r_order: 1 |
| | | |
| | | log_interval: 50 |
| | | normalize: None |
| | | |
| | | dataset_conf: |
| | | data_names: speech,text,embed |
| | | data_types: kaldi_ark,text,kaldi_ark |
| | | data_types: sound,text,kaldi_ark |
| | | shuffle: True |
| | | shuffle_conf: |
| | | shuffle_size: 10240 |
| | | shuffle_size: 2048 |
| | | sort_size: 500 |
| | | batch_conf: |
| | | batch_type: token |
| | | batch_size: 25000 |
| | | num_workers: 8 |
| | | num_workers: 8 |
| | | |
| | | log_interval: 50 |
| | | normalize: None |