From 6a7d34392c5cc3bdb512670aadb7847ae7916741 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Thu, 06 Jun 2024 15:43:44 +0800
Subject: [PATCH] auto frontend

---
 examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune2.sh     |   46 +++++++++++++++
 examples/industrial_data_pretraining/llm_asr/conf/whisper_qwen_linear2.yaml |   78 ++++++++++++++++++++++++++
 2 files changed, 124 insertions(+), 0 deletions(-)

diff --git a/examples/industrial_data_pretraining/llm_asr/conf/whisper_qwen_linear2.yaml b/examples/industrial_data_pretraining/llm_asr/conf/whisper_qwen_linear2.yaml
new file mode 100644
index 0000000..59e93a6
--- /dev/null
+++ b/examples/industrial_data_pretraining/llm_asr/conf/whisper_qwen_linear2.yaml
@@ -0,0 +1,78 @@
+# This is an example that demonstrates how to configure a model file.
+# You can modify the configuration according to your own requirements.
+
+# to print the register_table:
+# from funasr.register import tables
+# tables.print()
+
+# network architecture
+model: LLMASR2
+model_conf:
+  lsm_weight: 0.1  # label smoothing option
+  length_normalized_loss: true
+
+# encoder
+audio_encoder: "/nfs/zhifu.gzf/init_model/SenseVoiceModelscope"
+audio_encoder_conf:
+  hub: ms
+  freeze: true
+
+# LLM backbone (frozen; weights loaded from init_param_path)
+llm: Qwen1.5-7b-chat
+llm_conf:
+  hub: hf
+  freeze: true
+  init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat_raw"
+
+# adaptor projecting audio-encoder features (encoder_dim) into the LLM embedding space (llm_dim)
+audio_adaptor: Transformer
+audio_adaptor_conf:
+  downsample_rate: 2
+  llm_dim: 4096
+  encoder_dim: 1280
+  n_layer: 2
+
+# frontend related
+frontend: WhisperFrontend
+frontend_conf:
+  fs: 16000
+  whisper_model: large-v3
+  do_pad_trim: true
+  permute: false  # true: [bs, frames, dims]; false: [bs, dims, frames]
+  filters_path: "/nfs/zhifu.gzf/init_model/SenseVoiceModelscope/assets/mel_filters.npz"
+
+# training related
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 15
+  keep_nbest_models: 10
+  log_interval: 10
+
+optim: adamw
+optim_conf:
+  lr: 0.0001
+  weight_decay: 0.000000
+
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 1500
+
+# dataset related
+dataset: OpenAIDataset
+dataset_conf:
+  index_ds: OpenAIIndexDSJsonl
+  batch_sampler: CustomDistributedBatchSampler
+  batch_type: example # example or length
+  batch_size: 4 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
+  max_token_length: 3000 # filter samples if source_token_len+target_token_len > max_token_length,
+  shuffle: true
+  num_workers: 0
+  audio_adaptor_downsample_rate: ${audio_adaptor_conf.downsample_rate}
+  audio_encoder_downsample_rate: 2
+#  prompt: "<|startoftranscription|><|zh|><|transcribe|><|zh|><|notimestamps|><|wo_itn|>"
+
+tokenizer: HuggingfaceTokenizer
+tokenizer_conf:
+  init_param_path: "/nfs/zhifu.gzf/init_model/qwen/Qwen1___5-7B-Chat_raw"
+
diff --git a/examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune2.sh b/examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune2.sh
new file mode 100644
index 0000000..b3aac2b
--- /dev/null
+++ b/examples/industrial_data_pretraining/llm_asr/demo_train_or_finetune2.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+#  MIT License  (https://opensource.org/licenses/MIT)
+
+# which gpu to train or finetune
+export CUDA_VISIBLE_DEVICES="0"
+gpu_num=$(echo "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
+
+# data dir, which contains: train.json, val.json, tokens.jsonl/tokens.txt, am.mvn
+#data_dir="/Users/zhifu/funasr1.0/data/list"
+
+## generate jsonl from wav.scp and text.txt
+#python -m funasr.datasets.audio_datasets.scp2jsonl \
+#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
+#++data_type_list='["source", "target"]' \
+#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
+
+train_data="/nfs/beinian.lzr/workspace/tools/speech2speech_tools/speech2text/out_dir/tmp_wav.jsonl"
+val_data="/nfs/beinian.lzr/workspace/tools/speech2speech_tools/speech2text/out_dir/tmp_wav.jsonl"
+
+# exp output dir
+output_dir="/Users/zhifu/funasr1.0/test_local/data_tmp/"
+log_file="${output_dir}/log.txt"
+
+workspace=$(pwd)
+config="whisper_qwen_linear2.yaml"
+
+init_param="${output_dir}/model.pt"
+
+mkdir -p "${output_dir}"
+echo "log_file: ${log_file}"
+
+torchrun \
+--nnodes 1 \
+--nproc_per_node "${gpu_num}" \
+../../../funasr/bin/train.py \
+--config-path "${workspace}/conf" \
+--config-name "${config}" \
+++train_data_set_list="${train_data}" \
+++valid_data_set_list="${val_data}" \
+++dataset_conf.batch_size=1 \
+++dataset_conf.num_workers=0 \
+++train_conf.max_epoch=15 \
+++optim_conf.lr=0.0001 \
+++init_param="${init_param}" \
+++output_dir="${output_dir}" &> "${log_file}" &

--
Gitblit v1.9.1