# network architecture
# encoder related
encoder: transformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder architecture type
    normalize_before: true

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing weight
    length_normalized_loss: false

# minibatch related
batch_type: length
batch_bins: 32000
num_workers: 8

# optimization related
accum_grad: 1
grad_clip: 5
patience: 3
max_epoch: 20
val_scheduler_criterion:
    - valid
    - acc
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10

# NoamLR is deprecated. Use WarmupLR.
# The following is the equivalent setting for NoamLR:
#
#    optim: adam
#    optim_conf:
#        lr: 10.
#    scheduler: noamlr
#    scheduler_conf:
#        model_size: 256
#        warmup_steps: 25000
#
optim: adam
optim_conf:
    lr: 0.002
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000

log_interval: 50
normalize: null         # YAML null (not the Python literal None): no feature normalization layer
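
# Note: with the hybrid CTC/attention model_conf above, training optimizes an
# interpolation of the two losses; a sketch of the standard espnet2
# formulation (assuming the usual multi-task objective) is:
#
#   loss = ctc_weight * loss_ctc + (1 - ctc_weight) * loss_att
#
# so ctc_weight: 0.3 weights the CTC branch at 0.3 and the attention decoder
# at 0.7, and loss_att is label-smoothed cross entropy with smoothing
# lsm_weight.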
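
# Note: with batch_type: length, mini-batches are packed by sequence length so
# that the summed lengths in a batch stay roughly within batch_bins (a sketch
# of the behavior described for espnet2's length-based batching, not an exact
# specification). Raising batch_bins grows the effective batch size, and
# accum_grad can multiply it further without extra memory per step.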
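
# Note: a sketch of the WarmupLR schedule, assuming the formula documented for
# espnet2's WarmupLR scheduler:
#
#   lrate = lr * warmup_steps**0.5 * min(step**-0.5, step * warmup_steps**-1.5)
#
# i.e. linear warmup to a peak of lr (0.002 here) at step 25000, then
# step**-0.5 decay. NoamLR differs only in the constant factor
# (model_size**-0.5 instead of warmup_steps**0.5), which is why the deprecated
# NoamLR block above carries a much larger nominal lr.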