From 82e5ca37a8bd80f56c99f9d790a03b458ced716b Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期二, 25 二月 2025 14:28:34 +0800
Subject: [PATCH] Large-Scale Data Training

---
 docs/tutorial/README_zh.md |   27 +++++++++++++++++++++++----
 1 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/docs/tutorial/README_zh.md b/docs/tutorial/README_zh.md
index 5bf15f0..6cbd9c3 100644
--- a/docs/tutorial/README_zh.md
+++ b/docs/tutorial/README_zh.md
@@ -213,7 +213,7 @@
 ### 详细参数介绍
 
 ```shell
-funasr/bin/train.py \
+funasr/bin/train_ds.py \
 ++model="${model_name_or_model_dir}" \
 ++train_data_set_list="${train_data}" \
 ++valid_data_set_list="${val_data}" \
@@ -258,7 +258,7 @@
 gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 
 torchrun --nnodes 1 --nproc_per_node ${gpu_num} \
-../../../funasr/bin/train.py ${train_args}
+../../../funasr/bin/train_ds.py ${train_args}
 ```
 --nnodes 表示参与的节点总数，--nproc_per_node 表示每个节点上运行的进程数
 
@@ -270,7 +270,7 @@
 gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 
 torchrun --nnodes 2 --node_rank 0 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
-../../../funasr/bin/train.py ${train_args}
+../../../funasr/bin/train_ds.py ${train_args}
 ```
 在从节点上（假设IP为192.168.1.2），你需要确保MASTER_ADDR和MASTER_PORT环境变量与主节点设置的一致，并运行同样的命令：
 ```shell
@@ -278,7 +278,7 @@
 gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 
 torchrun --nnodes 2 --node_rank 1 --nproc_per_node ${gpu_num} --master_addr 192.168.1.1 --master_port 12345 \
-../../../funasr/bin/train.py ${train_args}
+../../../funasr/bin/train_ds.py ${train_args}
 ```
 
 --nnodes 表示参与的节点总数，--node_rank 表示当前节点id，--nproc_per_node 表示每个节点上运行的进程数（通常为gpu个数）
@@ -331,6 +331,25 @@
 ++jsonl_file_in="../../../data/list/train.jsonl"
 ```
 
+#### 大数据训练
+如果数据量很大，例如5万小时以上，这时候容易遇到内存不足的问题，特别是多gpu实验，这时候需要对jsonl文件进行切分成slice，然后写到txt里面，一行一个slice，然后设置`data_split_num`，例如：
+```shell
+train_data="/root/data/list/data.list"
+
+funasr/bin/train_ds.py \
+++train_data_set_list="${train_data}" \
+++dataset_conf.data_split_num=256
+```
+其中：
+`data.list`：为纯文本，内容是切割后的jsonl文件，例如，`data.list`的内容为：
+```bash
+data/list/train.0.jsonl
+data/list/train.1.jsonl
+...
+```
+`data_split_num`：表示切分slice分组个数，例如，data.list中共512行，data_split_num=256，表示分成256组，每组有2个jsonl文件，这样每次只load 2个jsonl数据进行训练，从而降低训练过程中内存使用。注意是按照顺序分组。
+如果是，非常大，并且数据类型差异比较大，建议切分时候进行数据均衡。
+
 #### 查看训练日志
 
 ##### 查看实验log

--
Gitblit v1.9.1