From d80ac2fd2df4e7fb8a28acfa512bb11472b5cc99 Mon Sep 17 00:00:00 2001
From: liugz18 <57401541+liugz18@users.noreply.github.com>
Date: Thu, 18 Jul 2024 21:34:55 +0800
Subject: [PATCH] Rename 'res' in line 514 to avoid a naming conflict with line 365

---
 examples/industrial_data_pretraining/sense_voice/finetune.sh |   38 ++++++++++++--------------------------
 1 file changed, 12 insertions(+), 26 deletions(-)

diff --git a/examples/industrial_data_pretraining/sense_voice/finetune.sh b/examples/industrial_data_pretraining/sense_voice/finetune.sh
index 1191657..be6c53a 100644
--- a/examples/industrial_data_pretraining/sense_voice/finetune.sh
+++ b/examples/industrial_data_pretraining/sense_voice/finetune.sh
@@ -4,14 +4,14 @@
 workspace=`pwd`
 
 # which gpu to train or finetune
-export CUDA_VISIBLE_DEVICES="0"
+export CUDA_VISIBLE_DEVICES="0,1"
 gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 
 # model_name from model_hub, or model_dir in local path
 
 ## option 1, download model automatically
-model_name_or_model_dir="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-model_name_or_model_dir="/Users/zhifu/Downloads/modelscope_models/SenseVoiceModelscope"
+model_name_or_model_dir="iic/SenseVoiceCTC"
+
 ## option 2, download model by git
 #local_path_root=${workspace}/modelscope_models
 #mkdir -p ${local_path_root}/${model_name_or_model_dir}
@@ -20,50 +20,36 @@
 
 
 # data dir, which contains: train.json, val.json
-data_dir="../../../data/list"
-
-train_data="${data_dir}/train.jsonl"
-val_data="${data_dir}/val.jsonl"
-
-# generate train.jsonl and val.jsonl from wav.scp and text.txt
-scp2jsonl \
-++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
-++data_type_list='["source", "target"]' \
-++jsonl_file_out="${train_data}"
-
-scp2jsonl \
-++scp_file_list='["../../../data/list/val_wav.scp", "../../../data/list/val_text.txt"]' \
-++data_type_list='["source", "target"]' \
-++jsonl_file_out="${val_data}"
-
+train_data=${workspace}/data/train_example.jsonl
+val_data=${workspace}/data/val_example.jsonl
 
 # exp output dir
 output_dir="./outputs"
 log_file="${output_dir}/log.txt"
 
+deepspeed_config=${workspace}/../../ds_stage1.json
 
 mkdir -p ${output_dir}
 echo "log_file: ${log_file}"
-
-deepspeed_config=${workspace}../../ds_stage1.json
 
 DISTRIBUTED_ARGS="
     --nnodes ${WORLD_SIZE:-1} \
     --nproc_per_node $gpu_num \
     --node_rank ${RANK:-0} \
     --master_addr ${MASTER_ADDR:-127.0.0.1} \
-    --master_port ${MASTER_PORT: 26669}
+    --master_port ${MASTER_PORT:-26669}
 "
 
 echo $DISTRIBUTED_ARGS
 
+# funasr trainer path
+train_tool=`dirname $(which funasr)`/train_ds.py
+
 torchrun $DISTRIBUTED_ARGS \
-../../../funasr/bin/train_ds.py \
+${train_tool} \
 ++model="${model_name_or_model_dir}" \
 ++train_data_set_list="${train_data}" \
 ++valid_data_set_list="${val_data}" \
-++dataset="SenseVoiceDataset" \
-++dataset_conf.IndexDSJsonl="IndexDSJsonl" \
 ++dataset_conf.data_split_num=1 \
 ++dataset_conf.batch_sampler="BatchSampler" \
 ++dataset_conf.batch_size=6000  \
@@ -80,4 +66,4 @@
 ++train_conf.use_deepspeed=false \
 ++train_conf.deepspeed_config=${deepspeed_config} \
 ++optim_conf.lr=0.0002 \
-++output_dir="${output_dir}" #&> ${log_file}
\ No newline at end of file
+++output_dir="${output_dir}" &> ${log_file}
\ No newline at end of file

--
Gitblit v1.9.1