From 20c35cdbc7ab0962629c9d88fd0ca07ad3968050 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Thu, 22 Feb 2024 12:07:30 +0800
Subject: [PATCH] Dev gzf (#1379)
---
examples/industrial_data_pretraining/paraformer/finetune.sh | 49 +++++++++++++++++++++++++++++++++----------------
1 files changed, 33 insertions(+), 16 deletions(-)
diff --git a/examples/industrial_data_pretraining/paraformer/finetune.sh b/examples/industrial_data_pretraining/paraformer/finetune.sh
index 266346c..21b29b6 100644
--- a/examples/industrial_data_pretraining/paraformer/finetune.sh
+++ b/examples/industrial_data_pretraining/paraformer/finetune.sh
@@ -1,9 +1,14 @@
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+# MIT License (https://opensource.org/licenses/MIT)
-## download model
-#local_path_root=../modelscope_models
-#mkdir -p ${local_path_root}
-#local_path=${local_path_root}/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
-#git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path}
+# Method 1: fine-tune from the model hub
+
+# GPUs to use for training/fine-tuning (comma-separated device ids); gpu_num is the number of ids listed
+export CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=$(printf '%s\n' "${CUDA_VISIBLE_DEVICES}" | awk -F "," '{print NF}')
+
+# data dir, which contains: train.jsonl, val.jsonl
+data_dir="/Users/zhifu/funasr1.0/data/list"
## generate jsonl from wav.scp and text.txt
#python -m funasr.datasets.audio_datasets.scp2jsonl \
@@ -11,17 +16,29 @@
#++data_type_list='["source", "target"]' \
#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
+train_data="${data_dir}/train.jsonl"
+val_data="${data_dir}/val.jsonl"
-# torchrun \
-# --nnodes 1 \
-# --nproc_per_node 1 \
-python funasr/bin/train.py \
-+model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
-+model_revision="v2.0.4" \
-+train_data_set_list="/Users/zhifu/funasr_github/test_local/aishell2_dev_ios/asr_task_debug_len_10.jsonl" \
-+valid_data_set_list="/Users/zhifu/funasr_github/test_local/aishell2_dev_ios/asr_task_debug_len_10.jsonl" \
-++dataset_conf.batch_size=64 \
+
+# Experiment output directory; the training log file lives inside it.
+output_dir="/Users/zhifu/exp"
+log_file="${output_dir}/log.txt"
+
+# Create the directory up front so the log redirection on the training command can open its file.
+mkdir -p "${output_dir}"
+echo "log_file: ${log_file}"
+# Launch fine-tuning via torchrun with gpu_num processes on one node; stdout+stderr go to log_file (`> f 2>&1` instead of `&>` so it also works under POSIX sh).
+torchrun \
+--nnodes 1 \
+--nproc_per_node "${gpu_num}" \
+funasr/bin/train.py \
+++model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+++model_revision="v2.0.4" \
+++train_data_set_list="${train_data}" \
+++valid_data_set_list="${val_data}" \
+++dataset_conf.batch_size=32 \
++dataset_conf.batch_type="example" \
-++train_conf.max_epoch=2 \
++dataset_conf.num_workers=4 \
-+output_dir="outputs/debug/ckpt/funasr2/exp2"
\ No newline at end of file
+++train_conf.max_epoch=20 \
+++optim_conf.lr=0.0002 \
+++output_dir="${output_dir}" > "${log_file}" 2>&1
\ No newline at end of file
--
Gitblit v1.9.1