# Model type and the options nested under it.
model: sond
model_conf:
  lsm_weight: 0.0
  length_normalized_loss: true
  # NOTE(review): presumably the maximum number of speakers handled per
  # input -- confirm against the consumer.
  max_spk_num: 16

# Speech encoder.
encoder: ecapa_tdnn
encoder_conf:
  # input_size is passed in by the model and equals the feature dimension,
  # so it stays commented out here.
  # input_size: 80
  pool_size: 20
  stride: 1
# Speaker encoder.
speaker_encoder: conv
speaker_encoder_conf:
  input_units: 256
  num_layers: 3
  num_units: 256
  kernel_size: 1
  dropout_rate: 0.0
  position_encoder: null
  out_units: 256
  out_norm: false
  auxiliary_states: false
  # Tensor-name prefixes for this module on the torch side and the TF side.
  # NOTE(review): presumably used to map TF checkpoint weights onto the
  # torch module -- confirm.
  tf2torch_tensor_name_prefix_torch: speaker_encoder
  tf2torch_tensor_name_prefix_tf: EAND/speaker_encoder
# CI scorer; it takes no options, so its conf is an explicit empty mapping.
# NOTE(review): "ci" presumably means context-independent -- confirm.
ci_scorer: dot
ci_scorer_conf: {}
# CD scorer and its options.
cd_scorer: san
cd_scorer_conf:
  input_size: 512
  output_size: 512
  out_units: 1
  attention_heads: 4
  linear_units: 1024
  num_blocks: 4
  dropout_rate: 0.0
  positional_dropout_rate: 0.0
  attention_dropout_rate: 0.0
  # Use the quoted string "null" (not a YAML null) to remove the input layer.
  input_layer: "null"
  pos_enc_class: null
  normalize_before: true
  # Tensor-name prefixes for this module on the torch side and the TF side.
  tf2torch_tensor_name_prefix_torch: cd_scorer
  tf2torch_tensor_name_prefix_tf: EAND/compute_distance_layer
# Post net (configured as the decoder).
decoder: fsmn
decoder_conf:
  in_units: 32
  out_units: 2517
  filter_size: 31
  fsmn_num_layers: 6
  dnn_num_layers: 1
  num_memory_units: 512
  ffn_inner_dim: 512
  dropout_rate: 0.0
  # Tensor-name prefixes for this module on the torch side and the TF side.
  tf2torch_tensor_name_prefix_torch: decoder
  tf2torch_tensor_name_prefix_tf: EAND/post_net
# Waveform frontend (feature extraction) and its options.
frontend: wav_frontend
frontend_conf:
  fs: 16000
  window: povey
  n_mels: 80
  # NOTE(review): frame_length / frame_shift are presumably in
  # milliseconds -- confirm against the frontend implementation.
  frame_length: 25
  frame_shift: 10
  filter_length_min: -1
  filter_length_max: -1
  lfr_m: 1
  lfr_n: 1
  dither: 0.0
  snip_edges: false

# Minibatch settings.
batch_type: length
# 16 s * 16k * 16 samples = 4096000
batch_bins: 4096000
num_workers: 8

# Optimization settings.
accum_grad: 1
grad_clip: 5
max_epoch: 50
# Two-item list: [valid, acc].
val_scheduler_criterion:
  - valid
  - acc
# List of three-item lists; each inner list's last entry is min or max.
best_model_criterion:
  - - valid
    - der
    - min
  - - valid
    - forward_steps
    - max
keep_nbest_models: 10

# Optimizer and learning-rate scheduler, each with its own options.
optim: adam
optim_conf:
  lr: 0.001
scheduler: warmuplr
scheduler_conf:
  warmup_steps: 10000

# Spec augmentation is turned off: specaug is null.
# NOTE(review): the specaug_conf values below presumably have no effect
# while specaug is null -- confirm against the consumer.
specaug: null
specaug_conf:
  apply_time_warp: true
  time_warp_window: 5
  time_warp_mode: bicubic
  apply_freq_mask: true
  freq_mask_width_range:
    - 0
    - 30
  num_freq_mask: 2
  apply_time_mask: true
  time_mask_width_range:
    - 0
    - 40
  num_time_mask: 2

log_interval: 50
# Without normalization. This must be a YAML null: a bare `None` is read by
# YAML as the string "None", not as a null value.
normalize: null