From 824377d2aae11dc9ebbde871e3b23a0e0cadc7af Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: 星期三, 17 四月 2024 16:59:29 +0800
Subject: [PATCH] Dev gzf exp (#1626)

---
 funasr/models/sense_voice/template.yaml |   91 +++++++++++++++++++++++++++++++++------------
 1 files changed, 67 insertions(+), 24 deletions(-)

diff --git a/funasr/models/sense_voice/template.yaml b/funasr/models/sense_voice/template.yaml
index 5ebace9..4699c94 100644
--- a/funasr/models/sense_voice/template.yaml
+++ b/funasr/models/sense_voice/template.yaml
@@ -4,42 +4,85 @@
 # to print the register_table:
 # from funasr.register import tables
 # tables.print()
-
 # network architecture
 model: SenseVoice
 model_conf:
     lsm_weight: 0.1
     length_normalized_loss: true
-    hub: funasr
+    activation_checkpoint: true
+    sos: "<|startoftranscript|>"
+    eos: "<|endoftext|>"
+    downsample_rate: 4
+    use_padmask: true
 
-
-
-# only use for hub == funasr,
-#  if hub == openai, dims is automaticall download
-dims:
-    n_mels: 128
-    n_vocab: 51866
-    n_audio_ctx: 1500
-    n_audio_state: 1280
-    n_audio_head: 20
-    n_audio_layer: 32
-    n_text_ctx: 448
-    n_text_state: 1280
-    n_text_head: 20
-    n_text_layer: 32
+    dims:
+        n_mels: 128
+        n_vocab: 60515
+        n_audio_ctx: 1500
+        n_audio_state: 1280
+        n_audio_head: 20
+        n_audio_layer: 32
+        n_text_ctx: 448
+        n_text_state: 1280
+        n_text_head: 20
+        n_text_layer: 32
 
 # frontend related
 frontend: WhisperFrontend
 frontend_conf:
     fs: 16000
-    n_mels: ${dims.n_mels}
-    do_pad_trim: true
+    n_mels: ${model_conf.dims.n_mels}
+    do_pad_trim: false
 
-tokenizer: WhisperTokenizer
+tokenizer: SenseVoiceTokenizer
 tokenizer_conf:
-  language: null
-  task: transcribe
+  vocab_path: null
   is_multilingual: true
-  num_languages: 100
+  num_languages: 8749
 
-scope_map: [none, "model."]
\ No newline at end of file
+dataset: SenseVoiceDataset
+dataset_conf:
+    index_ds: IndexDSJsonl
+    batch_sampler: EspnetStyleBatchSampler
+    batch_type: length # example or length
+    batch_size: 7000 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
+    max_token_length: 2000 # filter samples if source_token_len+target_token_len > max_token_length,
+    min_token_length: 60
+    shuffle: True
+    num_workers: 4
+    sos: ${model_conf.sos}
+    eos: ${model_conf.eos}
+
+train_conf:
+  accum_grad: 2
+  grad_clip: 5
+  max_epoch: 20
+  keep_nbest_models: 20
+  avg_nbest_model: ${train_conf.keep_nbest_models}
+  log_interval: 50
+
+optim: adamw
+optim_conf:
+    lr: 0.00002
+
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 10000
+
+specaug: SpecAug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 40
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_ratio_range:
+    - 0.0
+    - 0.12
+    num_time_mask: 2
+
+scope_map: ['encoder.encoders', 'model.encoder', 'decoder.decoders', 'model.decoder']
\ No newline at end of file

--
Gitblit v1.9.1