# python/FunASR-XL.git

# network architecture
# Front end selection (`frontend`) followed by its options (`frontend_conf`).
frontend: multichannelfrontend
frontend_conf:
    fs: 16000
    window: hann
    n_fft: 400
    n_mels: 80
    # NOTE(review): frame_length / frame_shift are presumably in
    # milliseconds - confirm against the front-end implementation.
    frame_length: 25
    frame_shift: 10
    lfr_m: 1
    lfr_n: 1
    use_channel: 0
    # Canonical lowercase boolean, matching the `true`/`false` spelling used
    # by the other sections of this file.
    mc: false
 
# encoder related
# ASR encoder selection (`asr_encoder`) followed by its options.
asr_encoder: conformer
asr_encoder_conf:
    # dimension of attention
    output_size: 256
    attention_heads: 4
    # the number of units of position-wise feed forward
    linear_units: 2048
    # the number of encoder blocks
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    # encoder architecture type
    input_layer: conv2d
    normalize_before: true
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    activation_type: swish
    macaron_style: true
    use_cnn_module: true
    cnn_module_kernel: 15
 
# Speaker encoder selection (`spk_encoder`) followed by its options.
spk_encoder: resnet34_diar
spk_encoder_conf:
    use_head_conv: true
    # Declared exactly once: duplicate mapping keys are invalid YAML and are
    # rejected by strict loaders.
    batchnorm_momentum: 0.5
    use_head_maxpool: false
    num_nodes_pooling_layer: 256
    # Four entries each; presumably one per block, matched by position
    # with filters_in_block - confirm against the encoder implementation.
    layers_in_block:
        - 3
        - 4
        - 6
        - 3
    filters_in_block:
        - 32
        - 64
        - 128
        - 256
    pooling_type: statistic
    num_nodes_resnet1: 256
    num_nodes_last_layer: 256
 
# decoder related
# Decoder selection (`decoder`) followed by its options.
decoder: sa_decoder
decoder_conf:
    attention_heads: 4
    linear_units: 2048

    # Block counts are configured separately for the ASR and speaker parts.
    asr_num_blocks: 6
    spk_num_blocks: 3

    # Dropout settings.
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0
 
# hybrid CTC/attention
# Model-level weights and limits.
model_conf:
    spk_weight: 0.5
    ctc_weight: 0.3
    # label smoothing option
    lsm_weight: 0.1
    length_normalized_loss: false
    max_spk_num: 4
 
# CTC options.
ctc_conf:
    ignore_nan_grad: true
 
# minibatch related
# Dataset loading, shuffling and batching options.
dataset_conf:
    # data_names and data_types are comma-separated lists with four entries
    # each; presumably matched by position - confirm against the data loader.
    data_names: speech,text,profile,text_id
    data_types: sound,text,npy,text_int
    # Canonical lowercase boolean, matching the `true`/`false` spelling used
    # by the other sections of this file.
    shuffle: true
    shuffle_conf:
        shuffle_size: 2048
        sort_size: 500
    batch_conf:
        batch_type: token
        batch_size: 7000
    num_workers: 8
 
# optimization related
accum_grad: 1
grad_clip: 5
max_epoch: 60
# Two-element list of strings.
val_scheduler_criterion: [valid, loss]
# List of three-element lists of strings; the third element of each entry
# is `max` or `min`.
best_model_criterion:
    - [valid, acc, max]
    - [valid, acc_spk, max]
    - [valid, loss, min]
keep_nbest_models: 10
 
# Optimizer selection and its options.
optim: adam
optim_conf:
    lr: 0.0005
# Learning-rate scheduler selection and its options.
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 8000
 
# Augmentation selection (`specaug`) followed by its options.
specaug: specaug
specaug_conf:
    # Time warping.
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic

    # Frequency masking.
    apply_freq_mask: true
    freq_mask_width_range: [0, 30]
    num_freq_mask: 2

    # Time masking.
    apply_time_mask: true
    time_mask_width_range: [0, 40]
    num_time_mask: 2