游雁
2023-09-13 33d3d2084403fd34b79c835d2f2fe04f6cd8f738
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# network architecture
# Front end: the implementation is chosen by name, and `frontend_conf` holds
# the settings handed to it.
frontend: multichannelfrontend
frontend_conf:
  fs: 16000          # presumably the sampling rate in Hz -- TODO confirm
  window: hann
  n_fft: 400
  n_mels: 80
  frame_length: 25   # presumably milliseconds -- TODO confirm
  frame_shift: 10    # presumably milliseconds -- TODO confirm
  lfr_m: 1
  lfr_n: 1
  use_channel: 0
  mc: false          # canonical lowercase boolean (was `False`)
 
# encoder related
# ASR encoder: implementation name followed by its hyper-parameters.
asr_encoder: conformer
asr_encoder_conf:
  output_size: 256            # attention dimension
  attention_heads: 4
  linear_units: 2048          # unit count of the position-wise feed forward
  num_blocks: 12              # how many encoder blocks are stacked
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  attention_dropout_rate: 0.0
  input_layer: conv2d         # encoder architecture type
  normalize_before: true
  pos_enc_layer_type: rel_pos
  selfattention_layer_type: rel_selfattn
  activation_type: swish
  macaron_style: true
  use_cnn_module: true
  cnn_module_kernel: 15
 
# Speaker encoder: implementation name followed by its hyper-parameters.
spk_encoder: resnet34_diar
spk_encoder_conf:
  use_head_conv: true
  # Keep exactly one `batchnorm_momentum` entry: duplicate keys are invalid
  # YAML and most loaders silently keep only the last occurrence.
  batchnorm_momentum: 0.5
  use_head_maxpool: false
  num_nodes_pooling_layer: 256
  # Four entries each; presumably one per residual stage -- TODO confirm.
  layers_in_block: [3, 4, 6, 3]
  filters_in_block: [32, 64, 128, 256]
  pooling_type: statistic
  num_nodes_resnet1: 256
  num_nodes_last_layer: 256
 
# decoder related
# Decoder: implementation name followed by its hyper-parameters. Block counts
# are given separately for the `asr_` and `spk_` parts.
decoder: sa_decoder
decoder_conf:
  attention_heads: 4
  linear_units: 2048
  asr_num_blocks: 6
  spk_num_blocks: 3
  dropout_rate: 0.1
  positional_dropout_rate: 0.1
  self_attention_dropout_rate: 0.0
  src_attention_dropout_rate: 0.0
 
# hybrid CTC/attention
# Model-level weights and limits.
model_conf:
  spk_weight: 0.5
  ctc_weight: 0.3
  lsm_weight: 0.1               # label smoothing option
  length_normalized_loss: false
  max_spk_num: 4
 
# Options for the CTC component.
ctc_conf:
  ignore_nan_grad: true
 
# minibatch related
# Data loading: input streams, shuffling and batching.
dataset_conf:
  # Two comma-separated lists with four entries each; presumably paired by
  # position (name -> type) -- TODO confirm against the data loader.
  data_names: speech,text,profile,text_id
  data_types: sound,text,npy,text_int
  shuffle: true                 # canonical lowercase boolean (was `True`)
  shuffle_conf:
    shuffle_size: 2048
    sort_size: 500
  batch_conf:
    batch_type: token
    batch_size: 7000            # presumably counted in `batch_type` units
  num_workers: 8
 
# optimization related
# Training-loop limits and checkpoint-selection criteria.
accum_grad: 1
grad_clip: 5
max_epoch: 60
# Two-element list; presumably (phase, metric) -- TODO confirm.
val_scheduler_criterion: [valid, loss]
# List of three-element entries; presumably (phase, metric, direction)
# -- TODO confirm against the trainer.
best_model_criterion:
  - [valid, acc, max]
  - [valid, acc_spk, max]
  - [valid, loss, min]
keep_nbest_models: 10
 
# Optimizer and learning-rate scheduler, each chosen by name with its own
# settings mapping.
optim: adam
optim_conf:
  lr: 0.0005
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 8000
 
# Spectrogram augmentation: time warp, frequency masking and time masking can
# each be switched on independently.
specaug: specaug
specaug_conf:
  # time warp
  apply_time_warp: true
  time_warp_window: 5
  time_warp_mode: bicubic
  # frequency masking
  apply_freq_mask: true
  freq_mask_width_range: [0, 30]
  num_freq_mask: 2
  # time masking
  apply_time_mask: true
  time_mask_width_range: [0, 40]
  num_time_mask: 2