1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
| # network architecture
| model: funasr.cli.models.paraformer:Paraformer
| model_conf:
| ctc_weight: 0.0
| lsm_weight: 0.1
| length_normalized_loss: true
| predictor_weight: 1.0
| predictor_bias: 1
| sampling_ratio: 0.75
|
| # encoder
| encoder: sanm
| encoder_conf:
| output_size: 512
| attention_heads: 4
| linear_units: 2048
| num_blocks: 50
| dropout_rate: 0.1
| positional_dropout_rate: 0.1
| attention_dropout_rate: 0.1
| input_layer: pe
| pos_enc_class: SinusoidalPositionEncoder
| normalize_before: true
| kernel_size: 11
| sanm_shfit: 0
| selfattention_layer_type: sanm
|
| # decoder
| decoder: paraformer_decoder_sanm
| decoder_conf:
| attention_heads: 4
| linear_units: 2048
| num_blocks: 16
| dropout_rate: 0.1
| positional_dropout_rate: 0.1
| self_attention_dropout_rate: 0.1
| src_attention_dropout_rate: 0.1
| att_layer_num: 16
| kernel_size: 11
| sanm_shfit: 0
|
| predictor: cif_predictor_v2
| predictor_conf:
| idim: 512
| threshold: 1.0
| l_order: 1
| r_order: 1
| tail_threshold: 0.45
|
| # frontend related
| frontend: wav_frontend
| frontend_conf:
| fs: 16000
| window: hamming
| n_mels: 80
| frame_length: 25
| frame_shift: 10
| lfr_m: 7
| lfr_n: 6
|
| specaug: specaug_lfr
| specaug_conf:
| apply_time_warp: false
| time_warp_window: 5
| time_warp_mode: bicubic
| apply_freq_mask: true
| freq_mask_width_range:
| - 0
| - 30
| lfr_rate: 6
| num_freq_mask: 1
| apply_time_mask: true
| time_mask_width_range:
| - 0
| - 12
| num_time_mask: 1
|
| train_conf:
| accum_grad: 1
| grad_clip: 5
| max_epoch: 150
| val_scheduler_criterion:
| - valid
| - acc
| best_model_criterion:
| - - valid
| - acc
| - max
| keep_nbest_models: 10
| log_interval: 50
|
| optim: adam
| optim_conf:
| lr: 0.0005
| scheduler: warmuplr
| scheduler_conf:
| warmup_steps: 30000
|
|
| dataset_conf:
| data_names: speech,text
| data_types: sound,text
| shuffle: True
| shuffle_conf:
| shuffle_size: 2048
| sort_size: 500
| batch_conf:
| batch_type: example
| batch_size: 2
| num_workers: 8
|
| split_with_space: true
| input_size: 560
| ctc_conf:
| dropout_rate: 0.0
| ctc_type: builtin
| reduce: true
| ignore_nan_grad: true
| normalize: null
|
|