From 9c9e02b2a4c54b8f9d1c198e0708ae1803adbd4c Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Tue, 23 Jan 2024 23:51:17 +0800
Subject: [PATCH] funasr1.0

---
 examples/industrial_data_pretraining/ct_transformer/demo.py            |    4 
 examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh        |    6 
 model_zoo/readme.md                                                    |   32 ++++++++
 examples/industrial_data_pretraining/paraformer/finetune.sh            |    2 
 examples/industrial_data_pretraining/bicif_paraformer/infer.sh         |    4 
 examples/industrial_data_pretraining/emotion2vec/demo.py               |    2 
 examples/industrial_data_pretraining/paraformer/demo.py                |    8 +-
 examples/industrial_data_pretraining/scama/demo.py                     |    2 
 examples/industrial_data_pretraining/emotion2vec/infer.sh              |    2 
 examples/industrial_data_pretraining/paraformer_streaming/demo.py      |    8 +-
 examples/industrial_data_pretraining/paraformer_streaming/infer.sh     |    2 
 examples/industrial_data_pretraining/paraformer/infer.sh               |    2 
 examples/industrial_data_pretraining/ct_transformer_streaming/demo.py  |    3 
 examples/industrial_data_pretraining/paraformer-zh-spk/demo.py         |    6 
 examples/industrial_data_pretraining/fsmn_vad_streaming/demo.py        |    2 
 examples/industrial_data_pretraining/ct_transformer/infer.sh           |    6 
 examples/industrial_data_pretraining/contextual_paraformer/demo.py     |    2 
 examples/industrial_data_pretraining/monotonic_aligner/demo.py         |    2 
 README_zh.md                                                           |   14 +-
 README.md                                                              |   16 ++--
 examples/industrial_data_pretraining/fsmn_vad_streaming/infer.sh       |    2 
 examples/industrial_data_pretraining/ct_transformer_streaming/infer.sh |    2 
 model_zoo/readme_zh.md                                                 |   28 +++++++
 examples/industrial_data_pretraining/contextual_paraformer/infer.sh    |    2 
 examples/industrial_data_pretraining/scama/infer.sh                    |    2 
 examples/industrial_data_pretraining/bicif_paraformer/demo.py          |    8 +-
 examples/industrial_data_pretraining/seaco_paraformer/infer.sh         |    6 
 examples/industrial_data_pretraining/seaco_paraformer/demo.py          |   12 +-
 examples/industrial_data_pretraining/monotonic_aligner/infer.sh        |    2 
 29 files changed, 124 insertions(+), 65 deletions(-)

diff --git a/README.md b/README.md
index 7500dd4..1e3707c 100644
--- a/README.md
+++ b/README.md
@@ -91,9 +91,9 @@
 from funasr import AutoModel
 # paraformer-zh is a multi-functional asr model
 # use vad, punc, spk or not as you need
-model = AutoModel(model="paraformer-zh", model_revision="v2.0.2",
-                  vad_model="fsmn-vad", vad_model_revision="v2.0.2",
-                  punc_model="ct-punc-c", punc_model_revision="v2.0.3",
+model = AutoModel(model="paraformer-zh", model_revision="v2.0.4",
+                  vad_model="fsmn-vad", vad_model_revision="v2.0.4",
+                  punc_model="ct-punc-c", punc_model_revision="v2.0.4",
                   # spk_model="cam++", spk_model_revision="v2.0.2",
                   )
 res = model.generate(input=f"{model.model_path}/example/asr_example.wav", 
@@ -111,7 +111,7 @@
 encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
 decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
 
-model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.2")
+model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
 
 import soundfile
 import os
@@ -134,7 +134,7 @@
 ```python
 from funasr import AutoModel
 
-model = AutoModel(model="fsmn-vad", model_revision="v2.0.2")
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
 wav_file = f"{model.model_path}/example/asr_example.wav"
 res = model.generate(input=wav_file)
 print(res)
@@ -144,7 +144,7 @@
 from funasr import AutoModel
 
 chunk_size = 200 # ms
-model = AutoModel(model="fsmn-vad", model_revision="v2.0.2")
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
 
 import soundfile
 
@@ -165,7 +165,7 @@
 ```python
 from funasr import AutoModel
 
-model = AutoModel(model="ct-punc", model_revision="v2.0.2")
+model = AutoModel(model="ct-punc", model_revision="v2.0.4")
 res = model.generate(input="閭d粖澶╃殑浼氬氨鍒拌繖閲屽惂 happy new year 鏄庡勾瑙�")
 print(res)
 ```
@@ -173,7 +173,7 @@
 ```python
 from funasr import AutoModel
 
-model = AutoModel(model="fa-zh", model_revision="v2.0.2")
+model = AutoModel(model="fa-zh", model_revision="v2.0.4")
 wav_file = f"{model.model_path}/example/asr_example.wav"
 text_file = f"{model.model_path}/example/text.txt"
 res = model.generate(input=(wav_file, text_file), data_type=("sound", "text"))
diff --git a/README_zh.md b/README_zh.md
index b19e7c2..552a5a0 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -87,9 +87,9 @@
 from funasr import AutoModel
 # paraformer-zh is a multi-functional asr model
 # use vad, punc, spk or not as you need
-model = AutoModel(model="paraformer-zh", model_revision="v2.0.2",
-                  vad_model="fsmn-vad", vad_model_revision="v2.0.2",
-                  punc_model="ct-punc-c", punc_model_revision="v2.0.3",
+model = AutoModel(model="paraformer-zh", model_revision="v2.0.4",
+                  vad_model="fsmn-vad", vad_model_revision="v2.0.4",
+                  punc_model="ct-punc-c", punc_model_revision="v2.0.4",
                   # spk_model="cam++", spk_model_revision="v2.0.2",
                   )
 res = model.generate(input=f"{model.model_path}/example/asr_example.wav", 
@@ -108,7 +108,7 @@
 encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
 decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
 
-model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.2")
+model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.4")
 
 import soundfile
 import os
@@ -132,7 +132,7 @@
 ```python
 from funasr import AutoModel
 
-model = AutoModel(model="fsmn-vad", model_revision="v2.0.2")
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
 
 wav_file = f"{model.model_path}/example/asr_example.wav"
 res = model.generate(input=wav_file)
@@ -144,7 +144,7 @@
 from funasr import AutoModel
 
 chunk_size = 200 # ms
-model = AutoModel(model="fsmn-vad", model_revision="v2.0.2")
+model = AutoModel(model="fsmn-vad", model_revision="v2.0.4")
 
 import soundfile
 
@@ -166,7 +166,7 @@
 ```python
 from funasr import AutoModel
 
-model = AutoModel(model="ct-punc", model_revision="v2.0.2")
+model = AutoModel(model="ct-punc", model_revision="v2.0.4")
 
 res = model.generate(input="閭d粖澶╃殑浼氬氨鍒拌繖閲屽惂 happy new year 鏄庡勾瑙�")
 print(res)
diff --git a/examples/industrial_data_pretraining/bicif_paraformer/demo.py b/examples/industrial_data_pretraining/bicif_paraformer/demo.py
index f1b1496..9010200 100644
--- a/examples/industrial_data_pretraining/bicif_paraformer/demo.py
+++ b/examples/industrial_data_pretraining/bicif_paraformer/demo.py
@@ -6,13 +6,13 @@
 from funasr import AutoModel
 
 model = AutoModel(model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
-                  model_revision="v2.0.2",
+                  model_revision="v2.0.4",
                   vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-                  vad_model_revision="v2.0.2",
+                  vad_model_revision="v2.0.4",
                   punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
-                  punc_model_revision="v2.0.3",
+                  punc_model_revision="v2.0.4",
                   spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
-                  spk_model_revision="v2.0.2",
+                  spk_model_revision="v2.0.4",
                   )
 
 res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav", batch_size_s=300, batch_size_threshold_s=60)
diff --git a/examples/industrial_data_pretraining/bicif_paraformer/infer.sh b/examples/industrial_data_pretraining/bicif_paraformer/infer.sh
index 55efdf2..9770138 100644
--- a/examples/industrial_data_pretraining/bicif_paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/bicif_paraformer/infer.sh
@@ -1,8 +1,8 @@
 
 model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-model_revision="v2.0.2"
+model_revision="v2.0.4"
 vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
-vad_model_revision="v2.0.2"
+vad_model_revision="v2.0.4"
 punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
 punc_model_revision="v2.0.3"
 spk_model="damo/speech_campplus_sv_zh-cn_16k-common"
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo.py b/examples/industrial_data_pretraining/contextual_paraformer/demo.py
index d1378ca..38d7fbf 100644
--- a/examples/industrial_data_pretraining/contextual_paraformer/demo.py
+++ b/examples/industrial_data_pretraining/contextual_paraformer/demo.py
@@ -5,7 +5,7 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404", model_revision="v2.0.2")
+model = AutoModel(model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404", model_revision="v2.0.4")
 
 res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
             hotword='杈炬懇闄� 榄旀惌')
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/infer.sh b/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
index b20742b..8fc66f3 100644
--- a/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
@@ -1,6 +1,6 @@
 
 model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
-model_revision="v2.0.2"
+model_revision="v2.0.4"
 
 python funasr/bin/inference.py \
 +model=${model} \
diff --git a/examples/industrial_data_pretraining/ct_transformer/demo.py b/examples/industrial_data_pretraining/ct_transformer/demo.py
index f547f03..1aed786 100644
--- a/examples/industrial_data_pretraining/ct_transformer/demo.py
+++ b/examples/industrial_data_pretraining/ct_transformer/demo.py
@@ -5,7 +5,7 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", model_revision="v2.0.2")
+model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", model_revision="v2.0.4")
 
 res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
 print(res)
@@ -13,7 +13,7 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="damo/punc_ct-transformer_cn-en-common-vocab471067-large", model_revision="v2.0.2")
+model = AutoModel(model="damo/punc_ct-transformer_cn-en-common-vocab471067-large", model_revision="v2.0.4")
 
 res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
 print(res)
\ No newline at end of file
diff --git a/examples/industrial_data_pretraining/ct_transformer/infer.sh b/examples/industrial_data_pretraining/ct_transformer/infer.sh
index 33bb5c1..fcfa07c 100644
--- a/examples/industrial_data_pretraining/ct_transformer/infer.sh
+++ b/examples/industrial_data_pretraining/ct_transformer/infer.sh
@@ -1,9 +1,9 @@
 
-model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-model_revision="v2.0.2"
+#model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+#model_revision="v2.0.4"
 
 model="damo/punc_ct-transformer_cn-en-common-vocab471067-large"
-model_revision="v2.0.2"
+model_revision="v2.0.4"
 
 python funasr/bin/inference.py \
 +model=${model} \
diff --git a/examples/industrial_data_pretraining/ct_transformer_streaming/demo.py b/examples/industrial_data_pretraining/ct_transformer_streaming/demo.py
index 081fd19..e653c1a 100644
--- a/examples/industrial_data_pretraining/ct_transformer_streaming/demo.py
+++ b/examples/industrial_data_pretraining/ct_transformer_streaming/demo.py
@@ -5,7 +5,7 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727", model_revision="v2.0.1")
+model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727", model_revision="v2.0.4")
 
 inputs = "璺ㄥ娌虫祦鏄吇鑲叉部宀竱浜烘皯鐨勭敓鍛戒箣婧愰暱鏈熶互鏉ヤ负甯姪涓嬫父鍦板尯闃茬伨鍑忕伨涓柟鎶�鏈汉鍛榺鍦ㄤ笂娓稿湴鍖烘瀬涓烘伓鍔g殑鑷劧鏉′欢涓嬪厠鏈嶅法澶у洶闅剧敋鑷冲啋鐫�鐢熷懡鍗遍櫓|鍚戝嵃鏂规彁渚涙睕鏈熸按鏂囪祫鏂欏鐞嗙揣鎬ヤ簨浠朵腑鏂归噸瑙嗗嵃鏂瑰湪璺ㄥ娌虫祦闂涓婄殑鍏冲垏|鎰挎剰杩涗竴姝ュ畬鍠勫弻鏂硅仈鍚堝伐浣滄満鍒秥鍑℃槸|涓柟鑳藉仛鐨勬垜浠瑋閮戒細鍘诲仛鑰屼笖浼氬仛寰楁洿濂芥垜璇峰嵃搴︽湅鍙嬩滑鏀惧績涓浗鍦ㄤ笂娓哥殑|浠讳綍寮�鍙戝埄鐢ㄩ兘浼氱粡杩囩瀛瑙勫垝鍜岃璇佸吋椤句笂涓嬫父鐨勫埄鐩�"
 vads = inputs.split("|")
@@ -13,7 +13,6 @@
 cache = {}
 for vad in vads:
     rec_result = model.generate(input=vad, cache=cache)
-    print(rec_result)
     rec_result_all += rec_result[0]['text']
 
 print(rec_result_all)
diff --git a/examples/industrial_data_pretraining/ct_transformer_streaming/infer.sh b/examples/industrial_data_pretraining/ct_transformer_streaming/infer.sh
index fa92a6e..b070852 100644
--- a/examples/industrial_data_pretraining/ct_transformer_streaming/infer.sh
+++ b/examples/industrial_data_pretraining/ct_transformer_streaming/infer.sh
@@ -1,6 +1,6 @@
 
 model="damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
-model_revision="v2.0.1"
+model_revision="v2.0.4"
 
 python funasr/bin/inference.py \
 +model=${model} \
diff --git a/examples/industrial_data_pretraining/emotion2vec/demo.py b/examples/industrial_data_pretraining/emotion2vec/demo.py
index a41641e..3841089 100644
--- a/examples/industrial_data_pretraining/emotion2vec/demo.py
+++ b/examples/industrial_data_pretraining/emotion2vec/demo.py
@@ -5,7 +5,7 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="damo/emotion2vec_base", model_revision="v2.0.1")
+model = AutoModel(model="damo/emotion2vec_base", model_revision="v2.0.4")
 
 wav_file = f"{model.model_path}/example/test.wav"
 res = model.generate(wav_file, output_dir="./outputs", granularity="utterance")
diff --git a/examples/industrial_data_pretraining/emotion2vec/infer.sh b/examples/industrial_data_pretraining/emotion2vec/infer.sh
index 9b98715..c46b819 100644
--- a/examples/industrial_data_pretraining/emotion2vec/infer.sh
+++ b/examples/industrial_data_pretraining/emotion2vec/infer.sh
@@ -1,6 +1,6 @@
 
 model="damo/emotion2vec_base"
-model_revision="v2.0.0"
+model_revision="v2.0.4"
 
 python funasr/bin/inference.py \
 +model=${model} \
diff --git a/examples/industrial_data_pretraining/fsmn_vad_streaming/demo.py b/examples/industrial_data_pretraining/fsmn_vad_streaming/demo.py
index 8084dec..ace3a94 100644
--- a/examples/industrial_data_pretraining/fsmn_vad_streaming/demo.py
+++ b/examples/industrial_data_pretraining/fsmn_vad_streaming/demo.py
@@ -7,7 +7,7 @@
 wav_file = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav"
 
 chunk_size = 60000 # ms
-model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.2")
+model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.4")
 
 res = model.generate(input=wav_file, chunk_size=chunk_size, )
 print(res)
diff --git a/examples/industrial_data_pretraining/fsmn_vad_streaming/infer.sh b/examples/industrial_data_pretraining/fsmn_vad_streaming/infer.sh
index 815c52a..a003fa5 100644
--- a/examples/industrial_data_pretraining/fsmn_vad_streaming/infer.sh
+++ b/examples/industrial_data_pretraining/fsmn_vad_streaming/infer.sh
@@ -1,7 +1,7 @@
 
 
 model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
-model_revision="v2.0.2"
+model_revision="v2.0.4"
 
 python funasr/bin/inference.py \
 +model=${model} \
diff --git a/examples/industrial_data_pretraining/monotonic_aligner/demo.py b/examples/industrial_data_pretraining/monotonic_aligner/demo.py
index cad9aab..8e71c90 100644
--- a/examples/industrial_data_pretraining/monotonic_aligner/demo.py
+++ b/examples/industrial_data_pretraining/monotonic_aligner/demo.py
@@ -5,7 +5,7 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="damo/speech_timestamp_prediction-v1-16k-offline", model_revision="v2.0.2")
+model = AutoModel(model="damo/speech_timestamp_prediction-v1-16k-offline", model_revision="v2.0.4")
 
 res = model.generate(input=("https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
                    "娆㈣繋澶у鏉ュ埌榄旀惌绀惧尯杩涜浣撻獙"),
diff --git a/examples/industrial_data_pretraining/monotonic_aligner/infer.sh b/examples/industrial_data_pretraining/monotonic_aligner/infer.sh
index dcc8722..0320922 100644
--- a/examples/industrial_data_pretraining/monotonic_aligner/infer.sh
+++ b/examples/industrial_data_pretraining/monotonic_aligner/infer.sh
@@ -1,6 +1,6 @@
 
 model="damo/speech_timestamp_prediction-v1-16k-offline"
-model_revision="v2.0.2"
+model_revision="v2.0.4"
 
 python funasr/bin/inference.py \
 +model=${model} \
diff --git a/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py b/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py
index e17a831..1149aad 100644
--- a/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py
+++ b/examples/industrial_data_pretraining/paraformer-zh-spk/demo.py
@@ -6,11 +6,11 @@
 from funasr import AutoModel
 
 model = AutoModel(model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
-                  model_revision="v2.0.2",
+                  model_revision="v2.0.4",
                   vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-                  vad_model_revision="v2.0.2",
+                  vad_model_revision="v2.0.4",
                   punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
-                  punc_model_revision="v2.0.3",
+                  punc_model_revision="v2.0.4",
                   spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
                   spk_model_revision="v2.0.2"
                   )
diff --git a/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh b/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh
index b8610cb..319a7c9 100644
--- a/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh
+++ b/examples/industrial_data_pretraining/paraformer-zh-spk/infer.sh
@@ -1,10 +1,10 @@
 
 model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-model_revision="v2.0.2"
+model_revision="v2.0.4"
 vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
-vad_model_revision="v2.0.2"
+vad_model_revision="v2.0.4"
 punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-punc_model_revision="v2.0.3"
+punc_model_revision="v2.0.4"
 spk_model="damo/speech_campplus_sv_zh-cn_16k-common"
 spk_model_revision="v2.0.2"
 
diff --git a/examples/industrial_data_pretraining/paraformer/demo.py b/examples/industrial_data_pretraining/paraformer/demo.py
index 724191d..a0c7406 100644
--- a/examples/industrial_data_pretraining/paraformer/demo.py
+++ b/examples/industrial_data_pretraining/paraformer/demo.py
@@ -5,11 +5,11 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.3",
+model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.4",
                   # vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-                  # vad_model_revision="v2.0.2",
+                  # vad_model_revision="v2.0.4",
                   # punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
-                  # punc_model_revision="v2.0.3",
+                  # punc_model_revision="v2.0.4",
                   )
 
 res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
@@ -19,7 +19,7 @@
 ''' can not use currently
 from funasr import AutoFrontend
 
-frontend = AutoFrontend(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.2")
+frontend = AutoFrontend(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.4")
 
 fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2)
 
diff --git a/examples/industrial_data_pretraining/paraformer/finetune.sh b/examples/industrial_data_pretraining/paraformer/finetune.sh
index 1aff068..7d4ea94 100644
--- a/examples/industrial_data_pretraining/paraformer/finetune.sh
+++ b/examples/industrial_data_pretraining/paraformer/finetune.sh
@@ -8,7 +8,7 @@
 
 python funasr/bin/train.py \
 +model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
-+model_revision="v2.0.2" \
++model_revision="v2.0.4" \
 +train_data_set_list="/Users/zhifu/funasr_github/test_local/aishell2_dev_ios/asr_task_debug_len_10.jsonl" \
 +valid_data_set_list="/Users/zhifu/funasr_github/test_local/aishell2_dev_ios/asr_task_debug_len_10.jsonl" \
 ++dataset_conf.batch_size=64 \
diff --git a/examples/industrial_data_pretraining/paraformer/infer.sh b/examples/industrial_data_pretraining/paraformer/infer.sh
index 6d3732f..7491e98 100644
--- a/examples/industrial_data_pretraining/paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/paraformer/infer.sh
@@ -1,6 +1,6 @@
 
 model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-model_revision="v2.0.2"
+model_revision="v2.0.4"
 
 python funasr/bin/inference.py \
 +model=${model} \
diff --git a/examples/industrial_data_pretraining/paraformer_streaming/demo.py b/examples/industrial_data_pretraining/paraformer_streaming/demo.py
index 6898030..455fe84 100644
--- a/examples/industrial_data_pretraining/paraformer_streaming/demo.py
+++ b/examples/industrial_data_pretraining/paraformer_streaming/demo.py
@@ -5,11 +5,11 @@
 
 from funasr import AutoModel
 
-chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
-encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention
-decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention
+chunk_size = [5, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+encoder_chunk_look_back = 0 #number of chunks to lookback for encoder self-attention
+decoder_chunk_look_back = 0 #number of encoder chunks to lookback for decoder cross-attention
 
-model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online", model_revision="v2.0.2")
+model = AutoModel(model="damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online", model_revision="v2.0.4")
 res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
             chunk_size=chunk_size,
             encoder_chunk_look_back=encoder_chunk_look_back,
diff --git a/examples/industrial_data_pretraining/paraformer_streaming/infer.sh b/examples/industrial_data_pretraining/paraformer_streaming/infer.sh
index 225f2a9..edb7196 100644
--- a/examples/industrial_data_pretraining/paraformer_streaming/infer.sh
+++ b/examples/industrial_data_pretraining/paraformer_streaming/infer.sh
@@ -1,6 +1,6 @@
 
 model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
-model_revision="v2.0.2"
+model_revision="v2.0.4"
 
 python funasr/bin/inference.py \
 +model=${model} \
diff --git a/examples/industrial_data_pretraining/scama/demo.py b/examples/industrial_data_pretraining/scama/demo.py
index c805993..1999623 100644
--- a/examples/industrial_data_pretraining/scama/demo.py
+++ b/examples/industrial_data_pretraining/scama/demo.py
@@ -9,7 +9,7 @@
 encoder_chunk_look_back = 0 #number of chunks to lookback for encoder self-attention
 decoder_chunk_look_back = 0 #number of encoder chunks to lookback for decoder cross-attention
 
-model = AutoModel(model="/Users/zhifu/Downloads/modelscope_models/speech_SCAMA_asr-zh-cn-16k-common-vocab8358-streaming", model_revision="v2.0.2")
+model = AutoModel(model="/Users/zhifu/Downloads/modelscope_models/speech_SCAMA_asr-zh-cn-16k-common-vocab8358-streaming", model_revision="v2.0.4")
 cache = {}
 res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
             chunk_size=chunk_size,
diff --git a/examples/industrial_data_pretraining/scama/infer.sh b/examples/industrial_data_pretraining/scama/infer.sh
index 225f2a9..edb7196 100644
--- a/examples/industrial_data_pretraining/scama/infer.sh
+++ b/examples/industrial_data_pretraining/scama/infer.sh
@@ -1,6 +1,6 @@
 
 model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
-model_revision="v2.0.2"
+model_revision="v2.0.4"
 
 python funasr/bin/inference.py \
 +model=${model} \
diff --git a/examples/industrial_data_pretraining/seaco_paraformer/demo.py b/examples/industrial_data_pretraining/seaco_paraformer/demo.py
index a202956..8385ccc 100644
--- a/examples/industrial_data_pretraining/seaco_paraformer/demo.py
+++ b/examples/industrial_data_pretraining/seaco_paraformer/demo.py
@@ -5,14 +5,14 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
-                  model_revision="v2.0.2",
+model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+                  model_revision="v2.0.4",
                   vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-                  vad_model_revision="v2.0.2",
+                  vad_model_revision="v2.0.4",
                   punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
-                  punc_model_revision="v2.0.3",
-                  spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
-                  spk_model_revision="v2.0.2",
+                  punc_model_revision="v2.0.4",
+                  # spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
+                  # spk_model_revision="v2.0.2",
                   )
 
 res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
diff --git a/examples/industrial_data_pretraining/seaco_paraformer/infer.sh b/examples/industrial_data_pretraining/seaco_paraformer/infer.sh
index f335684..76f52ef 100644
--- a/examples/industrial_data_pretraining/seaco_paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/seaco_paraformer/infer.sh
@@ -1,10 +1,10 @@
 
 model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-model_revision="v2.0.2"
+model_revision="v2.0.4"
 vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
-vad_model_revision="v2.0.2"
+vad_model_revision="v2.0.4"
 punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-punc_model_revision="v2.0.3"
+punc_model_revision="v2.0.4"
 
 python funasr/bin/inference.py \
 +model=${model} \
diff --git a/model_zoo/readme.md b/model_zoo/readme.md
new file mode 100644
index 0000000..133307b
--- /dev/null
+++ b/model_zoo/readme.md
@@ -0,0 +1,32 @@
+([绠�浣撲腑鏂嘳(./readme_zh.md)|English)
+
+# Model Zoo
+
+## Model License
+You are free to use, copy, modify, and share FunASR models under the conditions of this agreement. You should indicate the model source and author information when using, copying, modifying and sharing FunASR models. You should keep the relevant names of models in [FunASR software]. Full model license could see [license](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE)
+
+## Model Usage
+Ref to [docs](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/quick_start.html)
+
+## Model Zoo
+Here we provided several pretrained models on different datasets. The details of models and datasets can be found on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition).
+
+### Speech Recognition
+#### Paraformer
+
+
+FunASR has open-sourced a large number of pre-trained models on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](./MODEL_LICENSE). Below are some representative models, for more models please refer to the [Model Zoo]().
+
+(Note: 馃 represents the Huggingface model zoo link, 猸� represents the ModelScope model zoo link)
+
+
+|                                                                             Model Name                                                                             |                                Task Details                                 |          Training Data           | Parameters |
+|:------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------:|:--------------------------------:|:----------:|
+|    paraformer-zh <br> ([猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [馃]() )    |             speech recognition, with timestamps, non-streaming              |      60000 hours, Mandarin       |    220M    |
+|                paraformer-zh-spk <br> ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/summary)  [馃]() )                | speech recognition with speaker diarization, with timestamps, non-streaming |      60000 hours, Mandarin       |    220M    |
+| <nobr>paraformer-zh-online <br> ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [馃]() )</nobr> |                        speech recognition, streaming                        |      60000 hours, Mandarin       |    220M    |
+|         paraformer-en <br> ( [猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [馃]() )         |             speech recognition, with timestamps, non-streaming              |       50000 hours, English       |    220M    |
+|                     conformer-en <br> ( [猸怾(https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [馃]() )                      |                      speech recognition, non-streaming                      |       50000 hours, English       |    220M    |
+|                     ct-punc <br> ( [猸怾(https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [馃]() )                      |                           punctuation restoration                           |    100M, Mandarin and English    |    1.1G    | 
+|                          fsmn-vad <br> ( [猸怾(https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [馃]() )                          |                          voice activity detection                           | 5000 hours, Mandarin and English |    0.4M    | 
+|                          fa-zh <br> ( [猸怾(https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [馃]() )                           |                            timestamp prediction                             |       5000 hours, Mandarin       |    38M     | 
diff --git a/model_zoo/readme_zh.md b/model_zoo/readme_zh.md
new file mode 100644
index 0000000..657b4f1
--- /dev/null
+++ b/model_zoo/readme_zh.md
@@ -0,0 +1,28 @@
+(绠�浣撲腑鏂噟[English](./readme.md))
+
+# 妯″瀷浠撳簱
+
+## 妯″瀷璁稿彲鍗忚
+鎮ㄥ彲浠ュ湪鏈崗璁殑鏉′欢涓嬭嚜鐢变娇鐢ㄣ�佸鍒躲�佷慨鏀瑰拰鍒嗕韩FunASR妯″瀷銆傚湪浣跨敤銆佸鍒躲�佷慨鏀瑰拰鍒嗕韩FunASR妯″瀷鏃讹紝鎮ㄥ簲褰撴爣鏄庢ā鍨嬫潵婧愬拰浣滆�呬俊鎭�傛偍搴斿綋鍦╗FunASR杞欢]涓繚鐣欑浉鍏虫ā鍨嬬殑鍚嶇О銆傚畬鏁寸殑妯″瀷璁稿彲璇佽鍙傝 [妯″瀷璁稿彲鍗忚](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE)
+
+## 妯″瀷鐢ㄦ硶
+妯″瀷鐢ㄦ硶鍙傝�僛鏂囨。](funasr/quick_start_zh.md)
+
+## 妯″瀷浠撳簱
+杩欓噷鎴戜滑鎻愪緵浜嗗湪涓嶅悓鏁版嵁闆嗕笂棰勮缁冪殑妯″瀷銆傛ā鍨嬪拰鏁版嵁闆嗙殑璇︾粏淇℃伅鍙湪 [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition)涓壘鍒�.
+
+### 璇煶璇嗗埆妯″瀷
+#### Paraformer妯″瀷
+
+锛堟敞锛歔馃]()琛ㄧずHuggingface妯″瀷浠撳簱閾炬帴锛孾猸怾()琛ㄧずModelScope妯″瀷浠撳簱閾炬帴锛�
+
+|                                                                              妯″瀷鍚嶅瓧                                                                              |         浠诲姟璇︽儏          |     璁粌鏁版嵁     | 鍙傛暟閲�  |
+|:--------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------:|:------------:|:----:|
+|      paraformer-zh <br> ([猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [馃]() )       |    璇煶璇嗗埆锛屽甫鏃堕棿鎴宠緭鍑猴紝闈炲疄鏃�    |  60000灏忔椂锛屼腑鏂�  | 220M |
+| SeACoParaformer-zh <br> ( [猸怾(https://www.modelscope.cn/models/iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [馃]() ) | 甯︾儹璇嶅姛鑳界殑璇煶璇嗗埆锛屽甫鏃堕棿鎴宠緭鍑猴紝闈炲疄鏃� |  60000灏忔椂锛屼腑鏂�  | 220M |
+|              paraformer-zh-spk <br> ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/summary)  [馃]() )              |  鍒嗚鑹茶闊宠瘑鍒紝甯︽椂闂存埑杈撳嚭锛岄潪瀹炴椂   |  60000灏忔椂锛屼腑鏂�  | 220M |
+|    paraformer-zh-streaming <br> ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [馃]() )    |        璇煶璇嗗埆锛屽疄鏃�        |  60000灏忔椂锛屼腑鏂�  | 220M |
+| paraformer-zh-streaming-small <br> ( [猸怾(https://www.modelscope.cn/models/iic/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [馃]() ) |        璇煶璇嗗埆锛屽疄鏃�        |  60000灏忔椂锛屼腑鏂�  | 220M |
+
+|       paraformer-en <br> ( [猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [馃]() )       |       璇煶璇嗗埆锛岄潪瀹炴椂        |  50000灏忔椂锛岃嫳鏂�  | 220M |
+

--
Gitblit v1.9.1