# Language model: `lm` names the model type, `lm_conf` holds its settings.
# NOTE(review): the per-key notes below are inferred from the key names only;
# confirm them against the model class that consumes `lm_conf`.
lm: transformer
lm_conf:
    # null here presumably disables/defaults the positional encoding — confirm
    pos_enc: null
    embed_unit: 128
    att_unit: 512
    head: 8
    unit: 2048
    layer: 16
    dropout_rate: 0.1

# Optimization-related settings (gradient clipping, batching, epoch budget).
grad_clip: 5.0
# NOTE(review): `numel` presumably means batches are sized by element count,
# with `batch_bins` as the per-batch budget — confirm against the batch sampler.
batch_type: numel
batch_bins: 6000000
accum_grad: 1
max_epoch: 15  # 15 epochs is enough

# Optimizer and learning-rate scheduler, each with its own settings mapping.
# Nested keys use the same 4-space indentation as `lm_conf`.
optim: adam
optim_conf:
    # NOTE(review): with a warmup scheduler this is presumably the peak
    # learning rate rather than the initial one — confirm.
    lr: 0.001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000

# Checkpoint selection. `best_model_criterion` is a list of three-item
# criteria; the single entry here is valid / loss / min
# (presumably "lowest validation loss" — confirm against the trainer).
best_model_criterion:
    - [valid, loss, min]
keep_nbest_models: 10  # 10 is good.

# NOTE(review): the unit of this interval (e.g. iterations) is not shown in
# this file — confirm against the trainer.
log_interval: 50