From 6ce4909ecc39b990e253d692afc4b28451cde33e Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期五, 22 三月 2024 21:08:55 +0800
Subject: [PATCH] update

---
 docs/tutorial/README_zh.md |  211 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 199 insertions(+), 12 deletions(-)

diff --git a/docs/tutorial/README_zh.md b/docs/tutorial/README_zh.md
index fad5893..563fe7c 100644
--- a/docs/tutorial/README_zh.md
+++ b/docs/tutorial/README_zh.md
@@ -2,11 +2,26 @@
 
 FunASR开源了大量在工业数据上预训练模型，您可以在 [模型许可协议](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE)下自由使用、复制、修改和分享FunASR模型，下面列举代表性的模型，更多模型请参考 [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo)。
 
+<div align="center">  
+<h4>
+ <a href="#模型推理"> 模型推理 </a>   
+｜<a href="#模型训练与测试"> 模型训练与测试 </a>
+｜<a href="#模型导出与测试"> 模型导出与测试 </a>
+</h4>
+</div>
 
-## 推理
+<a name="模型推理"></a>
+## 模型推理
 
 ### 快速使用
-#### [Paraformer 模型](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
+
+命令行方式调用：
+```shell
+funasr ++model=paraformer-zh ++vad_model="fsmn-vad" ++punc_model="ct-punc" ++input=asr_example_zh.wav
+```
+
+python代码调用（推荐）
+
 ```python
 from funasr import AutoModel
 
@@ -16,17 +31,20 @@
 print(res)
 ```
 
-### 详细用法介绍
-```python
-model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], **kwargs)
-```
+### 接口说明
+
 #### AutoModel 定义
+```python
+model = AutoModel(model=[str], device=[str], ncpu=[int], output_dir=[str], batch_size=[int], hub=[str], **kwargs)
+```
 - `model`(str): [模型仓库](https://github.com/alibaba-damo-academy/FunASR/tree/main/model_zoo) 中的模型名称，或本地磁盘中的模型路径
 - `device`(str): `cuda:0`（默认gpu0），使用 GPU 进行推理，指定。如果为`cpu`，则使用 CPU 进行推理
 - `ncpu`(int): `4` （默认），设置用于 CPU 内部操作并行性的线程数
 - `output_dir`(str): `None` （默认），如果设置，输出结果的输出路径
-- `batch_size`(int): `1` （默认），解码时的批处理大小
+- `batch_size`(int): `1` （默认），解码时的批处理，样本个数
+- `hub`(str)：`ms`（默认），从modelscope下载模型。如果为`hf`，从huggingface下载模型。
 - `**kwargs`(dict): 所有在`config.yaml`中参数，均可以直接在此处指定，例如，vad模型中最大切割长度 `max_single_segment_time=6000` （毫秒）。
+
 #### AutoModel 推理
 ```python
 res = model.generate(input=[str], output_dir=[str])
@@ -47,18 +65,143 @@
 - `output_dir`: None （默认），如果设置，输出结果的输出路径
 - `**kwargs`(dict): 与模型相关的推理参数，例如，`beam_size=10`，`decoding_ctc_weight=0.1`。
 
-### onnx与libtorch导出
+
+### 更多用法介绍
+
+
+#### 非实时语音识别
+```python
+from funasr import AutoModel
+# paraformer-zh is a multi-functional asr model
+# use vad, punc, spk or not as you need
+model = AutoModel(model="paraformer-zh",  
+                  vad_model="fsmn-vad", 
+                  vad_kwargs={"max_single_segment_time": 60000},
+                  punc_model="ct-punc", 
+                  # spk_model="cam++"
+                  )
+wav_file = f"{model.model_path}/example/asr_example.wav"
+res = model.generate(input=wav_file, batch_size_s=300, batch_size_threshold_s=60, hotword='魔搭')
+print(res)
+```
+注意：
+- 通常模型输入限制时长30s以下，组合`vad_model`后，支持任意时长音频输入，不局限于paraformer模型，所有音频输入模型均可以。
+- `model`相关的参数可以直接在`AutoModel`定义中直接指定；与`vad_model`相关参数可以通过`vad_kwargs`来指定，类型为dict；类似的有`punc_kwargs`，`spk_kwargs`；
+- `max_single_segment_time`: 表示`vad_model`最大切割音频时长, 单位是毫秒ms.
+- `batch_size_s` 表示采用动态batch，batch中总音频时长，单位为秒s。
+- `batch_size_threshold_s`: 表示`vad_model`切割后音频片段时长超过 `batch_size_threshold_s`阈值时，将batch_size数设置为1, 单位为秒s.
+
+建议：当您输入为长音频，遇到OOM问题时，因为显存占用与音频时长呈平方关系增加，分为3种情况：
+- a)推理起始阶段，显存主要取决于`batch_size_s`，适当减小该值，可以减少显存占用；
+- b)推理中间阶段，遇到VAD切割的长音频片段，总token数小于`batch_size_s`，仍然出现OOM，可以适当减小`batch_size_threshold_s`，超过阈值，强制batch为1; 
+- c)推理快结束阶段，遇到VAD切割的长音频片段，总token数小于`batch_size_s`，且超过阈值`batch_size_threshold_s`，强制batch为1，仍然出现OOM，可以适当减小`max_single_segment_time`，使得VAD切割音频时长变短。
+
+#### 实时语音识别
 
 ```python
-res = model.export(type="onnx", quantize=True)
+from funasr import AutoModel
+
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
+
+model = AutoModel(model="paraformer-zh-streaming")
+
+import soundfile
+import os
+
+wav_file = os.path.join(model.model_path, "example/asr_example.wav")
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = chunk_size[1] * 960 # 600ms
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size, encoder_chunk_look_back=encoder_chunk_look_back, decoder_chunk_look_back=decoder_chunk_look_back)
+    print(res)
 ```
-- `type`(str)：`onnx`(默认)，导出onnx格式。`torch`导出libtorch格式。
-- `quantize`(bool)：`False`（默认），是否做量化。
 
+娉細`chunk_size`涓烘祦寮忓欢鏃堕厤缃紝`[0,10,5]`琛ㄧず涓婂睆瀹炴椂鍑哄瓧绮掑害涓篳10*60=600ms`锛屾湭鏉ヤ俊鎭负`5*60=300ms`銆傛瘡娆℃帹鐞嗚緭鍏ヤ负`600ms`锛堥噰鏍风偣鏁颁负`16000*0.6=960`锛夛紝杈撳嚭涓哄搴旀枃瀛楋紝鏈�鍚庝竴涓闊崇墖娈佃緭鍏ラ渶瑕佽缃甡is_final=True`鏉ュ己鍒惰緭鍑烘渶鍚庝竴涓瓧銆�
 
-## 微调
+#### 语音端点检测（非实时）
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fsmn-vad")
+
+wav_file = f"{model.model_path}/example/asr_example.wav"
+res = model.generate(input=wav_file)
+print(res)
+```
+娉細VAD妯″瀷杈撳嚭鏍煎紡涓猴細`[[beg1, end1], [beg2, end2], .., [begN, endN]]`锛屽叾涓璥begN/endN`琛ㄧず绗琡N`涓湁鏁堥煶棰戠墖娈电殑璧峰鐐�/缁撴潫鐐癸紝
+鍗曚綅涓烘绉掋��
+
+#### 璇煶绔偣妫�娴嬶紙瀹炴椂锛�
+```python
+from funasr import AutoModel
+
+chunk_size = 200 # ms
+model = AutoModel(model="fsmn-vad")
+
+import soundfile
+
+wav_file = f"{model.model_path}/example/vad_example.wav"
+speech, sample_rate = soundfile.read(wav_file)
+chunk_stride = int(chunk_size * sample_rate / 1000)
+
+cache = {}
+total_chunk_num = int((len(speech)-1)/chunk_stride+1)
+for i in range(total_chunk_num):
+    speech_chunk = speech[i*chunk_stride:(i+1)*chunk_stride]
+    is_final = i == total_chunk_num - 1
+    res = model.generate(input=speech_chunk, cache=cache, is_final=is_final, chunk_size=chunk_size)
+    if len(res[0]["value"]):
+        print(res)
+```
+娉細娴佸紡VAD妯″瀷杈撳嚭鏍煎紡涓�4绉嶆儏鍐碉細
+- `[[beg1, end1], [beg2, end2], .., [begN, endN]]`锛氬悓涓婄绾縑AD杈撳嚭缁撴灉銆�
+- `[[beg, -1]]`锛氳〃绀哄彧妫�娴嬪埌璧峰鐐广��
+- `[[-1, end]]`锛氳〃绀哄彧妫�娴嬪埌缁撴潫鐐广��
+- `[]`锛氳〃绀烘棦娌℃湁妫�娴嬪埌璧峰鐐癸紝涔熸病鏈夋娴嬪埌缁撴潫鐐�
+杈撳嚭缁撴灉鍗曚綅涓烘绉掞紝浠庤捣濮嬬偣寮�濮嬬殑缁濆鏃堕棿銆�
+
+#### 标点恢复
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="ct-punc")
+
+res = model.generate(input="那今天的会就到这里吧 happy new year 明年见")
+print(res)
+```
+
+#### 时间戳预测
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="fa-zh")
+
+wav_file = f"{model.model_path}/example/asr_example.wav"
+text_file = f"{model.model_path}/example/text.txt"
+res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
+print(res)
+```
+更多（[示例](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining)）
+
+<a name="模型训练与测试"></a>
+## 模型训练与测试
 
 ### 快速开始
+
+命令行执行（用于快速测试，不推荐）：
+```shell
+funasr-train ++model=paraformer-zh ++train_data_set_list=data/list/train.jsonl ++valid_data_set_list=data/list/val.jsonl ++output_dir="./outputs" &> log.txt &
+```
+
+python代码执行（可以多机多卡，推荐）
+
 ```shell
 cd examples/industrial_data_pretraining/paraformer
 bash finetune.sh
@@ -83,6 +226,7 @@
 ++train_conf.validate_interval=2000 \
 ++train_conf.save_checkpoint_interval=2000 \
 ++train_conf.keep_nbest_models=20 \
+++train_conf.avg_nbest_model=5 \
 ++optim_conf.lr=0.0002 \
 ++output_dir="${output_dir}" &> ${log_file}
 ```
@@ -99,6 +243,7 @@
 - `train_conf.validate_interval`（int）：训练中做验证测试的间隔step数。
 - `train_conf.save_checkpoint_interval`（int）：训练中模型保存间隔step数。
 - `train_conf.keep_nbest_models`（int）：保留最大多少个模型参数，按照验证集acc排序，从高到底保留。
+- `train_conf.avg_nbest_model`（int）：对acc最高的n个模型取平均。
 - `optim_conf.lr`（float）：学习率。
 - `output_dir`（str）：模型保存路径。
 - `**kwargs`(dict): 所有在`config.yaml`中参数，均可以直接在此处指定，例如，过滤20s以上长音频：`dataset_conf.max_token_length=2000`，单位为音频fbank帧数（1帧10ms）或者文字token个数。
@@ -173,6 +318,16 @@
 ++jsonl_file_out="../../../data/list/train.jsonl"
 ```
 
+（可选，非必需）如果需要从jsonl解析成wav.scp与text.txt，可以使用指令：
+
+```shell
+# generate wav.scp and text.txt from train.jsonl and val.jsonl
+jsonl2scp \
+++scp_file_list='["../../../data/list/train_wav.scp", "../../../data/list/train_text.txt"]' \
+++data_type_list='["source", "target"]' \
+++jsonl_file_in="../../../data/list/train.jsonl"
+```
+
 #### 查看训练日志
 
 ##### 查看实验log
@@ -196,3 +351,35 @@
 tensorboard --logdir /xxxx/FunASR/examples/industrial_data_pretraining/paraformer/outputs/log/tensorboard
 ```
 浏览器中打开：http://localhost:6006/
+
+
+<a name="模型导出与测试"></a>
+## 模型导出与测试
+### 从命令行导出
+```shell
+funasr-export ++model=paraformer ++quantize=false
+```
+
+### 从Python导出
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="paraformer")
+
+res = model.export(quantize=False)
+```
+
+### 测试ONNX
+```python
+# pip3 install -U funasr-onnx
+from funasr_onnx import Paraformer
+model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model = Paraformer(model_dir, batch_size=1, quantize=True)
+
+wav_path = ['~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+result = model(wav_path)
+print(result)
+```
+
+更多例子请参考 [样例](../../runtime/python/onnxruntime)
\ No newline at end of file

--
Gitblit v1.9.1