# Training configuration for the `sond` model: network architecture
# (encoder, speaker encoder, scorers, decoder), audio frontend, batching,
# optimization and augmentation settings.
# NOTE(review): each `<name>_conf` mapping is paired with the `<name>`
# selector directly above it; confirm the nesting against the trainer that
# consumes this file.

model: sond
model_conf:
  lsm_weight: 0.0
  length_normalized_loss: true
  max_spk_num: 16

# speech encoder
encoder: ecapa_tdnn
encoder_conf:
  # pass by model, equal to feature dim
  # input_size: 80
  pool_size: 20
  stride: 1

speaker_encoder: conv
speaker_encoder_conf:
  input_units: 256
  num_layers: 3
  num_units: 256
  kernel_size: 1
  dropout_rate: 0.0
  position_encoder: null
  out_units: 256
  out_norm: false
  auxiliary_states: false
  tf2torch_tensor_name_prefix_torch: speaker_encoder
  tf2torch_tensor_name_prefix_tf: EAND/speaker_encoder

ci_scorer: dot
ci_scorer_conf: {}

cd_scorer: san
cd_scorer_conf:
  input_size: 512
  output_size: 512
  out_units: 1
  attention_heads: 4
  linear_units: 1024
  num_blocks: 4
  dropout_rate: 0.0
  positional_dropout_rate: 0.0
  attention_dropout_rate: 0.0
  # use string "null" to remove input layer
  # (the quotes are intentional: this is the text "null", not a YAML null)
  input_layer: "null"
  pos_enc_class: null
  normalize_before: true
  tf2torch_tensor_name_prefix_torch: cd_scorer
  tf2torch_tensor_name_prefix_tf: EAND/compute_distance_layer

# post net
decoder: fsmn
decoder_conf:
  in_units: 32
  out_units: 2517
  filter_size: 31
  fsmn_num_layers: 6
  dnn_num_layers: 1
  num_memory_units: 512
  ffn_inner_dim: 512
  dropout_rate: 0.0
  tf2torch_tensor_name_prefix_torch: decoder
  tf2torch_tensor_name_prefix_tf: EAND/post_net

frontend: wav_frontend
frontend_conf:
  fs: 16000
  window: povey
  n_mels: 80
  frame_length: 25
  frame_shift: 10
  filter_length_min: -1
  filter_length_max: -1
  lfr_m: 1
  lfr_n: 1
  dither: 0.0
  snip_edges: false

# minibatch related
batch_type: length
# 16s * 16k * 16 samples
batch_bins: 4096000
num_workers: 8

# optimization related
accum_grad: 1
grad_clip: 5
max_epoch: 50
val_scheduler_criterion:
  - valid
  - acc
# Each entry is a three-item list.
# NOTE(review): presumably (phase, metric, min|max) -- confirm with the trainer.
best_model_criterion:
  - - valid
    - der
    - min
  - - valid
    - forward_steps
    - max
keep_nbest_models: 10

optim: adam
optim_conf:
  lr: 0.001
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 10000

# without spec aug
# NOTE(review): `specaug` is null, so `specaug_conf` below is presumably
# unused until an augmenter is selected -- confirm.
specaug: null
specaug_conf:
  apply_time_warp: true
  time_warp_window: 5
  time_warp_mode: bicubic
  apply_freq_mask: true
  freq_mask_width_range:
    - 0
    - 30
  num_freq_mask: 2
  apply_time_mask: true
  time_mask_width_range:
    - 0
    - 40
  num_time_mask: 2

log_interval: 50

# without normalize
# Written as YAML `null`: the bare word `None` is not a null in YAML and
# would load as the string "None".
normalize: null