From 8f41477f1e3f3e284cc4e4059aad08331bc1f53f Mon Sep 17 00:00:00 2001
From: lyblsgo <wucong.lyb@alibaba-inc.com>
Date: Thu, 09 Nov 2023 14:15:00 +0800
Subject: [PATCH] Merge remote-tracking branch 'origin/main'

---
 funasr/version.txt                                                |    2 
 runtime/onnxruntime/bin/funasr-onnx-2pass.cpp                     |    2 
 runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py          |   12 ++
 runtime/python/onnxruntime/setup.py                               |    2 
 README_zh.md                                                      |   27 ++--
 funasr/quick_start_zh.md                                          |    6 
 runtime/onnxruntime/src/paraformer-online.h                       |    2 
 runtime/onnxruntime/src/tokenizer.cpp                             |    8 +
 README.md                                                         |  101 ++++++++++++++++---
 runtime/onnxruntime/src/paraformer-online.cpp                     |    2 
 runtime/onnxruntime/src/tokenizer.h                               |    4 
 runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py |   51 ++++++++++
 funasr/bin/build_trainer.py                                       |    7 +
 funasr/quick_start.md                                             |    6 
 runtime/onnxruntime/src/funasrruntime.cpp                         |    4 
 funasr/__init__.py                                                |    5 
 runtime/onnxruntime/bin/funasr-onnx-2pass-rtf.cpp                 |    2 
 egs/aishell/transformer/utils/apply_cmvn.sh                       |   15 ++
 18 files changed, 200 insertions(+), 58 deletions(-)

diff --git a/README.md b/README.md
index 1a1b334..3f6b434 100644
--- a/README.md
+++ b/README.md
@@ -9,31 +9,32 @@
     <a href=""><img src="https://img.shields.io/badge/Pytorch-%3E%3D1.11-blue"></a>
 </p>
 
-<strong>FunASR</strong> hopes to build a bridge between academic research and industrial applications on speech recognition. By supporting the training & finetuning of the industrial-grade speech recognition model released on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition), researchers and developers can conduct research and production of speech recognition models more conveniently, and promote the development of speech recognition ecology. ASR for Fun锛�
+<strong>FunASR</strong> hopes to build a bridge between academic research and industrial applications on speech recognition. By supporting the training & finetuning of the industrial-grade speech recognition model, researchers and developers can conduct research and production of speech recognition models more conveniently, and promote the development of speech recognition ecology. ASR for Fun锛�
 
 [**Highlights**](#highlights)
 | [**News**](https://github.com/alibaba-damo-academy/FunASR#whats-new) 
 | [**Installation**](#installation)
 | [**Quick Start**](#quick-start)
 | [**Runtime**](./runtime/readme.md)
-| [**Model Zoo**](./docs/model_zoo/modelscope_models.md)
+| [**Model Zoo**](#model-zoo)
 | [**Contact**](#contact)
 
 
 <a name="highlights"></a>
 ## Highlights
 - FunASR is a fundamental speech recognition toolkit that offers a variety of features, including speech recognition (ASR), Voice Activity Detection (VAD), Punctuation Restoration, Language Models, Speaker Verification, Speaker Diarization and multi-talker ASR. FunASR provides convenient scripts and tutorials, supporting inference and fine-tuning of pre-trained models.
-- We have released a vast collection of academic and industrial pretrained models on the [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition), which can be accessed through our [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md). The representative [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), a non-autoregressive end-to-end speech recognition model, has the advantages of high accuracy, high efficiency, and convenient deployment, supporting the rapid construction of speech recognition services. For more details on service deployment, please refer to the [service deployment document](funasr/runtime/readme_cn.md). 
+- We have released a vast collection of academic and industrial pretrained models on the [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) and [huggingface](https://huggingface.co/FunASR), which can be accessed through our [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md). The representative [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), a non-autoregressive end-to-end speech recognition model, has the advantages of high accuracy, high efficiency, and convenient deployment, supporting the rapid construction of speech recognition services. For more details on service deployment, please refer to the [service deployment document](runtime/readme_cn.md). 
 
 
 <a name="whats-new"></a>
 ## What's new: 
-- 2023/10/17: The offline file transcription service (CPU) of English has been released. For more details, please refer to ([Deployment documentation](funasr/runtime/docs/SDK_tutorial_en.md)).
+- 2023/11/08: The offline file transcription service 3.0 (CPU) of Mandarin has been released, adding punctuation large model, Ngram language model, and wfst hot words. For detailed information, please refer to [docs](runtime#file-transcription-service-mandarin-cpu). 
+- 2023/10/17: The offline file transcription service (CPU) of English has been released. For more details, please refer to ([docs](runtime#file-transcription-service-english-cpu)).
 - 2023/10/13: [SlideSpeech](https://slidespeech.github.io/): A large scale multi-modal audio-visual corpus with a significant amount of real-time synchronized slides.
 - 2023/10/10: The ASR-SpeakersDiarization combined pipeline [Paraformer-VAD-SPK](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr_vad_spk/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/demo.py) is now released. Experience the model to get recognition results with speaker information.
 - 2023/10/07: [FunCodec](https://github.com/alibaba-damo-academy/FunCodec): A Fundamental, Reproducible and Integrable Open-source Toolkit for Neural Speech Codec.
-- 2023/09/01: The offline file transcription service 2.0 (CPU) of Mandarin has been released, with added support for ffmpeg, timestamp, and hotword models. For more details, please refer to ([Deployment documentation](funasr/runtime/docs/SDK_tutorial.md)).
-- 2023/08/07: The real-time transcription service (CPU) of Mandarin has been released. For more details, please refer to ([Deployment documentation](funasr/runtime/docs/SDK_tutorial_online.md)).
+- 2023/09/01: The offline file transcription service 2.0 (CPU) of Mandarin has been released, with added support for ffmpeg, timestamp, and hotword models. For more details, please refer to ([docs](runtime#file-transcription-service-mandarin-cpu)).
+- 2023/08/07: The real-time transcription service (CPU) of Mandarin has been released. For more details, please refer to ([docs](runtime#the-real-time-transcription-service-mandarin-cpu)).
 - 2023/07/17: BAT is released, which is a low-latency and low-memory-consumption RNN-T model. For more details, please refer to ([BAT](egs/aishell/bat)).
 - 2023/06/26: ASRU2023 Multi-Channel Multi-Party Meeting Transcription Challenge 2.0 completed the competition and announced the results. For more details, please refer to ([M2MeT2.0](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)).
 
@@ -43,19 +44,89 @@
 
 Please ref to [installation docs](https://alibaba-damo-academy.github.io/FunASR/en/installation/installation.html)
 
-## Deployment Service
+## Model Zoo
+FunASR has open-sourced a large number of pre-trained models on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](./MODEL_LICENSE). Below are some representative models, for more models please refer to the [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md).
 
-FunASR supports pre-trained or further fine-tuned models for deployment as a service. The CPU version of the Chinese offline file conversion service has been released, details can be found in [docs](funasr/runtime/docs/SDK_tutorial.md). More detailed information about service deployment can be found in the [deployment roadmap](funasr/runtime/readme_cn.md).
+(Note: 馃 represents the Huggingface model zoo link, 猸� represents the ModelScope model zoo link)
+
+
+|                                                                              Model Name                                                                              |                                Task Details                                 |          Training Date           | Parameters |
+|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------:|:--------------------------------:|:----------:|
+| <nobr>paraformer-zh ([猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [馃]() )</nobr> |             speech recognition, with timestamps, non-streaming              |      60000 hours, Mandarin       |    220M    |
+|             <nobr>paraformer-zh-spk ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/summary)  [馃]() )</nobr>             | speech recognition with speaker diarization, with timestamps, non-streaming |      60000 hours, Mandarin       |    220M    |
+|    <nobr>paraformer-zh-online ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [馃]() )</nobr>     |                      speech recognition, non-streaming                      |      60000 hours, Mandarin       |    220M    |
+|      <nobr>paraformer-en ( [猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [馃]() )</nobr>      |             speech recognition, with timestamps, non-streaming              |       50000 hours, English       |    220M    |
+|                                                            <nobr>paraformer-en-spk ([馃]() [猸怾() )</nobr>                                                            |         speech recognition with speaker diarization, non-streaming          |       50000 hours, English       |    220M    |
+|                  <nobr>conformer-en ( [猸怾(https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [馃]() )</nobr>                   |                      speech recognition, non-streaming                      |       50000 hours, English       |    220M    |
+|                  <nobr>ct-punc ( [猸怾(https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [馃]() )</nobr>                   |                           punctuation restoration                           |    100M, Mandarin and English    |    1.1G    | 
+|                       <nobr>fsmn-vad ( [猸怾(https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [馃]() )</nobr>                       |                          voice activity detection                           | 5000 hours, Mandarin and English |    0.4M    | 
+|                       <nobr>fa-zh ( [猸怾(https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [馃]() )</nobr>                        |                            timestamp prediction                             |       5000 hours, Mandarin       |    38M     | 
+
+
+
+
+[//]: # ()
+[//]: # (FunASR supports pre-trained or further fine-tuned models for deployment as a service. The CPU version of the Chinese offline file conversion service has been released, details can be found in [docs]&#40;funasr/runtime/docs/SDK_tutorial.md&#41;. More detailed information about service deployment can be found in the [deployment roadmap]&#40;funasr/runtime/readme_cn.md&#41;.)
 
 
 <a name="quick-start"></a>
 ## Quick Start
 Quick start for new users锛圼tutorial](https://alibaba-damo-academy.github.io/FunASR/en/funasr/quick_start.html)锛�
 
+FunASR supports inference and fine-tuning of models trained on industrial data for tens of thousands of hours. For more details, please refer to [modelscope_egs](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/quick_start.html). It also supports training and fine-tuning of models on academic standard datasets. For more information, please refer to [egs](https://alibaba-damo-academy.github.io/FunASR/en/academic_recipe/asr_recipe.html).
 
-FunASR supports inference and fine-tuning of models trained on industrial datasets of tens of thousands of hours. For more details, please refer to ([modelscope_egs](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/quick_start.html)). It also supports training and fine-tuning of models on academic standard datasets. For more details, please refer to([egs](https://alibaba-damo-academy.github.io/FunASR/en/academic_recipe/asr_recipe.html)). The models include speech recognition (ASR), speech activity detection (VAD), punctuation recovery, language model, speaker verification, speaker separation, and multi-party conversation speech recognition. For a detailed list of models, please refer to the [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md):
+Below is a quick start tutorial. Test audio files ([Mandarin](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav), [English]()).
+### Speech Recognition (Non-streaming)
+```python
+from funasr import infer
 
-<a name="Community Communication"></a>
+p = infer(model="paraformer-zh", vad_model="fsmn-vad", punc_model="ct-punc", model_hub="ms")
+
+res = p("asr_example_zh.wav", batch_size_token=5000)
+print(res)
+```
+Note: `model_hub`: represents the model repository, `ms` stands for selecting ModelScope download, `hf` stands for selecting Huggingface download.
+
+### Speech Recognition (Streaming)
+```python
+from funasr import infer
+
+p = infer(model="paraformer-zh-streaming", model_hub="ms")
+
+chunk_size = [0, 10, 5] #[0, 10, 5] 600ms, [0, 8, 4] 480ms
+param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size, "encoder_chunk_look_back": 4, "decoder_chunk_look_back": 1}
+
+import torchaudio
+speech = torchaudio.load("asr_example_zh.wav")[0][0]
+speech_length = speech.shape[0]
+
+stride_size = chunk_size[1] * 960
+sample_offset = 0
+for sample_offset in range(0, speech_length, stride_size):
+    param_dict["is_final"] = True if sample_offset + stride_size >= speech_length - 1 else False
+    input = speech[sample_offset: sample_offset + stride_size]
+    rec_result = p(input=input, param_dict=param_dict)
+    print(rec_result)
+```
+Note: `chunk_size` is the configuration for streaming latency.` [0,10,5]` indicates that the real-time display granularity is `10*60=600ms`, and the lookahead information is `5*60=300ms`. Each inference input is `600ms` (sample points are `16000*0.6=960`), and the output is the corresponding text. For the last speech segment input, `is_final=True` needs to be set to force the output of the last word.
+
+Quick start for new users can be found in [docs](https://alibaba-damo-academy.github.io/FunASR/en/funasr/quick_start.html)
+
+
+[//]: # (FunASR supports inference and fine-tuning of models trained on industrial datasets of tens of thousands of hours. For more details, please refer to &#40;[modelscope_egs]&#40;https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/quick_start.html&#41;&#41;. It also supports training and fine-tuning of models on academic standard datasets. For more details, please refer to&#40;[egs]&#40;https://alibaba-damo-academy.github.io/FunASR/en/academic_recipe/asr_recipe.html&#41;&#41;. The models include speech recognition &#40;ASR&#41;, speech activity detection &#40;VAD&#41;, punctuation recovery, language model, speaker verification, speaker separation, and multi-party conversation speech recognition. For a detailed list of models, please refer to the [Model Zoo]&#40;https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md&#41;:)
+
+## Deployment Service
+FunASR supports deploying pre-trained or further fine-tuned models for service. Currently, it supports the following types of service deployment:
+- File transcription service, Mandarin, CPU version, done
+- The real-time transcription service, Mandarin (CPU), done
+- File transcription service, English, CPU version, done
+- File transcription service, Mandarin, GPU version, in progress
+- and more.
+
+For more detailed information, please refer to the [service deployment documentation](runtime/readme.md).
+
+
+<a name="contact"></a>
 ## Community Communication
 If you encounter problems in use, you can directly raise Issues on the github page.
 
@@ -67,8 +138,8 @@
 
 ## Contributors
 
-| <div align="left"><img src="docs/images/damo.png" width="180"/> | <div align="left"><img src="docs/images/nwpu.png" width="260"/> | <img src="docs/images/China_Telecom.png" width="200"/> </div>  | <img src="docs/images/RapidAI.png" width="200"/> </div> | <img src="docs/images/aihealthx.png" width="200"/> </div> | <img src="docs/images/XVERSE.png" width="250"/> </div> |
-|:---------------------------------------------------------------:|:---------------------------------------------------------------:|:--------------------------------------------------------------:|:-------------------------------------------------------:|:-----------------------------------------------------------:|:------------------------------------------------------:|
+| <div align="left"><img src="docs/images/nwpu.png" width="260"/> | <img src="docs/images/China_Telecom.png" width="200"/> </div>  | <img src="docs/images/RapidAI.png" width="200"/> </div> | <img src="docs/images/aihealthx.png" width="200"/> </div> | <img src="docs/images/XVERSE.png" width="250"/> </div> |
+|:---------------------------------------------------------------:|:--------------------------------------------------------------:|:-------------------------------------------------------:|:-----------------------------------------------------------:|:------------------------------------------------------:|
 
 The contributors can be found in [contributors list](./Acknowledge.md)
 
@@ -90,12 +161,6 @@
   title={BAT: Boundary aware transducer for memory-efficient and low-latency ASR},
   year={2023},
   booktitle={INTERSPEECH},
-}
-@inproceedings{wang2023told,
-  author={Jiaming Wang and Zhihao Du and Shiliang Zhang},
-  title={{TOLD:} {A} Novel Two-Stage Overlap-Aware Framework for Speaker Diarization},
-  year={2023},
-  booktitle={ICASSP},
 }
 @inproceedings{gao22b_interspeech,
   author={Zhifu Gao and ShiLiang Zhang and Ian McLoughlin and Zhijie Yan},
diff --git a/README_zh.md b/README_zh.md
index c71b984..554c0b6 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -18,7 +18,7 @@
 锝�<a href="#瀹夎鏁欑▼"> 瀹夎 </a>
 锝�<a href="#蹇�熷紑濮�"> 蹇�熷紑濮� </a>
 锝�<a href="https://alibaba-damo-academy.github.io/FunASR/en/index.html"> 鏁欑▼鏂囨。 </a>
-锝�<a href="./docs/model_zoo/modelscope_models.md"> 妯″瀷浠撳簱 </a>
+锝�<a href="#妯″瀷浠撳簱"> 妯″瀷浠撳簱 </a>
 锝�<a href="#鏈嶅姟閮ㄧ讲"> 鏈嶅姟閮ㄧ讲 </a>
 锝�<a href="#鑱旂郴鎴戜滑"> 鑱旂郴鎴戜滑 </a>
 </h4>
@@ -27,10 +27,11 @@
 <a name="鏍稿績鍔熻兘"></a>
 ## 鏍稿績鍔熻兘
 - FunASR鏄竴涓熀纭�璇煶璇嗗埆宸ュ叿鍖咃紝鎻愪緵澶氱鍔熻兘锛屽寘鎷闊宠瘑鍒紙ASR锛夈�佽闊崇鐐规娴嬶紙VAD锛夈�佹爣鐐规仮澶嶃�佽瑷�妯″瀷銆佽璇濅汉楠岃瘉銆佽璇濅汉鍒嗙鍜屽浜哄璇濊闊宠瘑鍒瓑銆侳unASR鎻愪緵浜嗕究鎹风殑鑴氭湰鍜屾暀绋嬶紝鏀寔棰勮缁冨ソ鐨勬ā鍨嬬殑鎺ㄧ悊涓庡井璋冦��
-- 鎴戜滑鍦╗ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition)涓嶽huggingface](https://huggingface.co/FunAudio)涓婂彂甯冧簡澶ч噺寮�婧愭暟鎹泦鎴栬�呮捣閲忓伐涓氭暟鎹缁冪殑妯″瀷锛屽彲浠ラ�氳繃鎴戜滑鐨刐妯″瀷浠撳簱](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md)浜嗚В妯″瀷鐨勮缁嗕俊鎭�備唬琛ㄦ�х殑[Paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)闈炶嚜鍥炲綊绔埌绔闊宠瘑鍒ā鍨嬪叿鏈夐珮绮惧害銆侀珮鏁堢巼銆佷究鎹烽儴缃茬殑浼樼偣锛屾敮鎸佸揩閫熸瀯寤鸿闊宠瘑鍒湇鍔★紝璇︾粏淇℃伅鍙互闃呰([鏈嶅姟閮ㄧ讲鏂囨。](funasr/runtime/readme_cn.md))銆�
+- 鎴戜滑鍦╗ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition)涓嶽huggingface](https://huggingface.co/FunASR)涓婂彂甯冧簡澶ч噺寮�婧愭暟鎹泦鎴栬�呮捣閲忓伐涓氭暟鎹缁冪殑妯″瀷锛屽彲浠ラ�氳繃鎴戜滑鐨刐妯″瀷浠撳簱](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md)浜嗚В妯″瀷鐨勮缁嗕俊鎭�備唬琛ㄦ�х殑[Paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)闈炶嚜鍥炲綊绔埌绔闊宠瘑鍒ā鍨嬪叿鏈夐珮绮惧害銆侀珮鏁堢巼銆佷究鎹烽儴缃茬殑浼樼偣锛屾敮鎸佸揩閫熸瀯寤鸿闊宠瘑鍒湇鍔★紝璇︾粏淇℃伅鍙互闃呰([鏈嶅姟閮ㄧ讲鏂囨。](runtime/readme_cn.md))銆�
 
 <a name="鏈�鏂板姩鎬�"></a>
 ## 鏈�鏂板姩鎬�
+- 2023/11/08锛氫腑鏂囩绾挎枃浠惰浆鍐欐湇鍔�3.0 CPU鐗堟湰鍙戝竷锛屾柊澧炴爣鐐瑰ぇ妯″瀷銆丯gram璇█妯″瀷涓巜fst鐑瘝锛岃缁嗕俊鎭弬闃�([涓�閿儴缃叉枃妗(runtime/readme_cn.md#涓枃绂荤嚎鏂囦欢杞啓鏈嶅姟cpu鐗堟湰))
 - 2023/10/17: 鑻辨枃绂荤嚎鏂囦欢杞啓鏈嶅姟涓�閿儴缃茬殑CPU鐗堟湰鍙戝竷锛岃缁嗕俊鎭弬闃�([涓�閿儴缃叉枃妗(runtime/readme_cn.md#鑻辨枃绂荤嚎鏂囦欢杞啓鏈嶅姟cpu鐗堟湰))
 - 2023/10/13: [SlideSpeech](https://slidespeech.github.io/): 涓�涓ぇ瑙勬ā鐨勫妯℃�侀煶瑙嗛璇枡搴擄紝涓昏鏄湪绾夸細璁垨鑰呭湪绾胯绋嬪満鏅紝鍖呭惈浜嗗ぇ閲忎笌鍙戣█浜鸿璇濆疄鏃跺悓姝ョ殑骞荤伅鐗囥��
 - 2023.10.10: [Paraformer-long-Spk](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr_vad_spk/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/demo.py)妯″瀷鍙戝竷锛屾敮鎸佸湪闀胯闊宠瘑鍒殑鍩虹涓婅幏鍙栨瘡鍙ヨ瘽鐨勮璇濅汉鏍囩銆�
@@ -51,17 +52,17 @@
 锛堟敞锛歔馃]()琛ㄧずHuggingface妯″瀷浠撳簱閾炬帴锛孾猸怾()琛ㄧずModelScope妯″瀷浠撳簱閾炬帴锛�
 
 
-|                                                                          妯″瀷鍚嶅瓧                                                                           |        浠诲姟璇︽儏        |     璁粌鏁版嵁     | 鍙傛暟閲�  |
-|:-------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------:|:------------:|:----:|
-| paraformer-zh ([猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [馃]() ) |  璇煶璇嗗埆锛屽甫鏃堕棿鎴宠緭鍑猴紝闈炲疄鏃�   |  60000灏忔椂锛屼腑鏂�  | 220M |
-|                 paraformer-zh-spk ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/summary) )                 | 鍒嗚鑹茶闊宠瘑鍒紝甯︽椂闂存埑杈撳嚭锛岄潪瀹炴椂 |  60000灏忔椂锛屼腑鏂�  | 220M |
-|    paraformer-zh-online ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [馃]() )     |      璇煶璇嗗埆锛屽疄鏃�       |  60000灏忔椂锛屼腑鏂�  | 220M |
-|      paraformer-en ( [猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [馃]() )      | 鍒嗚鑹茶闊宠瘑鍒紝甯︽椂闂存埑杈撳嚭锛岄潪瀹炴椂 |  50000灏忔椂锛岃嫳鏂�  | 220M |
-|                                                            paraformer-en-spk ([馃]() [猸怾() )                                                            |      璇煶璇嗗埆锛岄潪瀹炴椂      |  50000灏忔椂锛岃嫳鏂�  | 220M |
-|                  conformer-en ( [猸怾(https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [馃]() )                   |      璇煶璇嗗埆锛岄潪瀹炴椂      |  50000灏忔椂锛岃嫳鏂�  | 220M |
-|                  ct-punc ( [猸怾(https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [馃]() )                   |      鏍囩偣鎭㈠锛岄潪瀹炴椂      |  100M锛屼腑鏂囦笌鑻辨枃  | 1.1G | 
-|                       fsmn-vad ( [猸怾(https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [馃]() )                       |     璇煶绔偣妫�娴嬶紝瀹炴椂      | 5000灏忔椂锛屼腑鏂囦笌鑻辨枃 | 0.4M | 
-|                       fa-zh ( [猸怾(https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [馃]() )                        |   瀛楃骇鍒椂闂存埑棰勬祴         |  50000灏忔椂锛屼腑鏂�  | 38M  | 
+|                                                                              妯″瀷鍚嶅瓧                                                                               |        浠诲姟璇︽儏        |     璁粌鏁版嵁     | 鍙傛暟閲�  |
+|:---------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------:|:------------:|:----:|
+|     paraformer-zh ([猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [馃]() )     |  璇煶璇嗗埆锛屽甫鏃堕棿鎴宠緭鍑猴紝闈炲疄鏃�   |  60000灏忔椂锛屼腑鏂�  | 220M |
+|                 paraformer-zh-spk ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/summary)  [馃]() )                 | 鍒嗚鑹茶闊宠瘑鍒紝甯︽椂闂存埑杈撳嚭锛岄潪瀹炴椂 |  60000灏忔椂锛屼腑鏂�  | 220M |
+|        paraformer-zh-online ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [馃]() )         |      璇煶璇嗗埆锛屽疄鏃�       |  60000灏忔椂锛屼腑鏂�  | 220M |
+|          paraformer-en ( [猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [馃]() )          | 璇煶璇嗗埆锛岄潪瀹炴椂 |  50000灏忔椂锛岃嫳鏂�  | 220M |
+|                                                                paraformer-en-spk ([馃]() [猸怾() )                                                                |      璇煶璇嗗埆锛岄潪瀹炴椂      |  50000灏忔椂锛岃嫳鏂�  | 220M |
+|                      conformer-en ( [猸怾(https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [馃]() )                       |      璇煶璇嗗埆锛岄潪瀹炴椂      |  50000灏忔椂锛岃嫳鏂�  | 220M |
+|                      ct-punc ( [猸怾(https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [馃]() )                       |      鏍囩偣鎭㈠      |  100M锛屼腑鏂囦笌鑻辨枃  | 1.1G | 
+|                           fsmn-vad ( [猸怾(https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [馃]() )                           |     璇煶绔偣妫�娴嬶紝瀹炴椂      | 5000灏忔椂锛屼腑鏂囦笌鑻辨枃 | 0.4M | 
+|                           fa-zh ( [猸怾(https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [馃]() )                            |   瀛楃骇鍒椂闂存埑棰勬祴         |  50000灏忔椂锛屼腑鏂�  | 38M  |
 
 
 <a name="蹇�熷紑濮�"></a>
diff --git a/egs/aishell/transformer/utils/apply_cmvn.sh b/egs/aishell/transformer/utils/apply_cmvn.sh
index f8fd1d1..525c993 100755
--- a/egs/aishell/transformer/utils/apply_cmvn.sh
+++ b/egs/aishell/transformer/utils/apply_cmvn.sh
@@ -17,10 +17,17 @@
 dump_dir=${output_dir}/ark; mkdir -p ${dump_dir}
 mkdir -p ${logdir}
 
-$cmd JOB=1:$nj $logdir/apply_cmvn.JOB.log \
-    python utils/apply_cmvn.py -a $fbankdir/ark/feats.JOB.ark \
-        -c $cmvn_file -i JOB -o ${dump_dir} \
-        || exit 1;
+#$cmd JOB=1:$nj $logdir/apply_cmvn.JOB.log \
+#    python utils/apply_cmvn.py -a $fbankdir/ark/feats.JOB.ark \
+#        -c $cmvn_file -i JOB -o ${dump_dir} \
+#        || exit 1;
+
+for JOB in `seq 1 $nj`;do
+  {
+      python utils/apply_cmvn.py -a $fbankdir/ark/feats.${JOB}.ark \
+    -c $cmvn_file -i ${JOB} -o ${dump_dir} || exit 1;
+  } &> $logdir/apply_cmvn.${JOB}.log &
+done
 
 for n in $(seq $nj); do
     cat ${dump_dir}/feats.$n.scp || exit 1
diff --git a/funasr/__init__.py b/funasr/__init__.py
index 1f31505..aab4289 100644
--- a/funasr/__init__.py
+++ b/funasr/__init__.py
@@ -129,4 +129,7 @@
         
         return inference_pipeline(data_path_and_name_and_type, raw_inputs=raw_inputs, **kwargs)
     
-    return _infer_fn
\ No newline at end of file
+    return _infer_fn
+
+if __name__ == '__main__':
+    pass
\ No newline at end of file
diff --git a/funasr/bin/build_trainer.py b/funasr/bin/build_trainer.py
index 61af766..bda83ec 100644
--- a/funasr/bin/build_trainer.py
+++ b/funasr/bin/build_trainer.py
@@ -548,7 +548,10 @@
     init_param = modelscope_dict['init_model']
     cmvn_file = modelscope_dict['cmvn_file']
     seg_dict_file = modelscope_dict['seg_dict']
-    bpemodel = modelscope_dict['bpemodel']
+    if 'bpemodel' in modelscope_dict:
+        bpemodel = modelscope_dict['bpemodel']
+    else:
+        bpemodel = None
 
     # overwrite parameters
     with open(config) as f:
@@ -582,7 +585,7 @@
         args.seg_dict_file = seg_dict_file
     else:
         args.seg_dict_file = None
-    if os.path.exists(bpemodel):
+    if bpemodel is not None and os.path.exists(bpemodel):
         args.bpemodel = bpemodel
     else:
         args.bpemodel = None
diff --git a/funasr/quick_start.md b/funasr/quick_start.md
index 202c709..6108f02 100644
--- a/funasr/quick_start.md
+++ b/funasr/quick_start.md
@@ -26,7 +26,7 @@
 python funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode 2pass --chunk_size "5,10,5"
 ```
 
-For more examples, please refer to [docs](runtime/python/websocket/README.md).
+For more examples, please refer to [docs](../runtime/python/websocket/README.md).
 
 ### C++ version Example
 
@@ -47,7 +47,7 @@
 ```shell
 python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode 2pass
 ```
-For more examples, please refer to [docs](runtime/docs/SDK_tutorial_online_zh.md)
+For more examples, please refer to [docs](../runtime/docs/SDK_tutorial_online_zh.md)
 
 
 #### File Transcription Service, Mandarin (CPU)
@@ -68,7 +68,7 @@
 python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode offline --audio_in "../audio/asr_example.wav"
 ```
 
-For more examples, please refer to [docs](runtime/docs/SDK_tutorial_zh.md)
+For more examples, please refer to [docs](../runtime/docs/SDK_tutorial_zh.md)
 
 
 ## Industrial Model Egs
diff --git a/funasr/quick_start_zh.md b/funasr/quick_start_zh.md
index a8d20a2..9a3c2c9 100644
--- a/funasr/quick_start_zh.md
+++ b/funasr/quick_start_zh.md
@@ -26,7 +26,7 @@
 python funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode 2pass --chunk_size "5,10,5"
 #python funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode 2pass --chunk_size "8,8,4" --audio_in "./data/wav.scp"
 ```
-鏇村渚嬪瓙鍙互鍙傝�冿紙[鐐瑰嚮姝ゅ](runtime/python/websocket/README.md)锛�
+鏇村渚嬪瓙鍙互鍙傝�冿紙[鐐瑰嚮姝ゅ](../runtime/python/websocket/README.md)锛�
 
 <a name="cpp鐗堟湰绀轰緥"></a>
 #### c++鐗堟湰绀轰緥
@@ -46,7 +46,7 @@
 ```shell
 python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode 2pass
 ```
-鏇村渚嬪瓙鍙傝�冿紙[鐐瑰嚮姝ゅ](runtime/docs/SDK_tutorial_online_zh.md)锛�
+鏇村渚嬪瓙鍙傝�冿紙[鐐瑰嚮姝ゅ](../runtime/docs/SDK_tutorial_online_zh.md)锛�
 
 ##### 绂荤嚎鏂囦欢杞啓鏈嶅姟閮ㄧ讲
 ###### 鏈嶅姟绔儴缃�
@@ -59,7 +59,7 @@
 ```shell
 python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode offline --audio_in "../audio/asr_example.wav"
 ```
-鏇村渚嬪瓙鍙傝�冿紙[鐐瑰嚮姝ゅ](runtime/docs/SDK_tutorial_zh.md)锛�
+鏇村渚嬪瓙鍙傝�冿紙[鐐瑰嚮姝ゅ](../runtime/docs/SDK_tutorial_zh.md)锛�
 
 
 
diff --git a/funasr/version.txt b/funasr/version.txt
index ee94dd8..b60d719 100644
--- a/funasr/version.txt
+++ b/funasr/version.txt
@@ -1 +1 @@
-0.8.3
+0.8.4
diff --git a/runtime/onnxruntime/bin/funasr-onnx-2pass-rtf.cpp b/runtime/onnxruntime/bin/funasr-onnx-2pass-rtf.cpp
index 5bf8f18..c55c888 100644
--- a/runtime/onnxruntime/bin/funasr-onnx-2pass-rtf.cpp
+++ b/runtime/onnxruntime/bin/funasr-onnx-2pass-rtf.cpp
@@ -279,7 +279,7 @@
     // hotword file
     unordered_map<string, int> hws_map;
     std::string nn_hotwords_ = "";
-    std::string hotword_path = model_path.at(HOTWORD);
+    std::string hotword_path = hotword.getValue();
     LOG(INFO) << "hotword path: " << hotword_path;
     funasr::ExtractHws(hotword_path, hws_map, nn_hotwords_);
 
diff --git a/runtime/onnxruntime/bin/funasr-onnx-2pass.cpp b/runtime/onnxruntime/bin/funasr-onnx-2pass.cpp
index d015499..5af0b41 100644
--- a/runtime/onnxruntime/bin/funasr-onnx-2pass.cpp
+++ b/runtime/onnxruntime/bin/funasr-onnx-2pass.cpp
@@ -113,7 +113,7 @@
     // hotword file
     unordered_map<string, int> hws_map;
     std::string nn_hotwords_ = "";
-    std::string hotword_path = model_path.at(HOTWORD);
+    std::string hotword_path = hotword.getValue();
     LOG(INFO) << "hotword path: " << hotword_path;
     funasr::ExtractHws(hotword_path, hws_map, nn_hotwords_);
 
diff --git a/runtime/onnxruntime/src/funasrruntime.cpp b/runtime/onnxruntime/src/funasrruntime.cpp
index f16851f..dd6bb17 100644
--- a/runtime/onnxruntime/src/funasrruntime.cpp
+++ b/runtime/onnxruntime/src/funasrruntime.cpp
@@ -463,7 +463,7 @@
 
 		funasr::AudioFrame* frame = NULL;
 		while(audio->FetchChunck(frame) > 0){
-			string msg = asr_online_handle->Forward(frame->data, frame->len, frame->is_final);
+			string msg = ((funasr::ParaformerOnline*)asr_online_handle)->Forward(frame->data, frame->len, frame->is_final);
 			if(mode == ASR_ONLINE){
 				((funasr::ParaformerOnline*)asr_online_handle)->online_res += msg;
 				if(frame->is_final){
@@ -494,7 +494,7 @@
 		// timestamp
 		std::string cur_stamp = "[";		
 		while(audio->FetchTpass(frame) > 0){
-			string msg = asr_handle->Forward(frame->data, frame->len, frame->is_final, hw_emb);
+			string msg = ((funasr::Paraformer*)asr_handle)->Forward(frame->data, frame->len, frame->is_final, hw_emb);
 
 			std::vector<std::string> msg_vec = funasr::split(msg, '|');  // split with timestamp
 			if(msg_vec.size()==0){
diff --git a/runtime/onnxruntime/src/paraformer-online.cpp b/runtime/onnxruntime/src/paraformer-online.cpp
index ed7a35a..3b629c5 100644
--- a/runtime/onnxruntime/src/paraformer-online.cpp
+++ b/runtime/onnxruntime/src/paraformer-online.cpp
@@ -469,7 +469,7 @@
     return result;
 }
 
-string ParaformerOnline::Forward(float* din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb)
+string ParaformerOnline::Forward(float* din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb, void* wfst_decoder)
 {
     std::vector<std::vector<float>> wav_feats;
     std::vector<float> waves(din, din+len);
diff --git a/runtime/onnxruntime/src/paraformer-online.h b/runtime/onnxruntime/src/paraformer-online.h
index 932785c..138c77c 100644
--- a/runtime/onnxruntime/src/paraformer-online.h
+++ b/runtime/onnxruntime/src/paraformer-online.h
@@ -109,7 +109,7 @@
         void AddOverlapChunk(std::vector<std::vector<float>> &wav_feats, bool input_finished);
         
         string ForwardChunk(std::vector<std::vector<float>> &wav_feats, bool input_finished);
-        string Forward(float* din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb={{0.0}});
+        string Forward(float* din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb={{0.0}}, void* wfst_decoder=nullptr);
         string Rescoring();
         // 2pass
         std::string online_res;
diff --git a/runtime/onnxruntime/src/tokenizer.cpp b/runtime/onnxruntime/src/tokenizer.cpp
index a111b91..f56601a 100644
--- a/runtime/onnxruntime/src/tokenizer.cpp
+++ b/runtime/onnxruntime/src/tokenizer.cpp
@@ -17,8 +17,12 @@
 
 CTokenizer::~CTokenizer()
 {
-	delete jieba_dict_trie_;
-    delete jieba_model_;
+	if (jieba_dict_trie_){
+		delete jieba_dict_trie_;
+	}
+	if (jieba_model_){
+    	delete jieba_model_;
+	}
 }
 
 void CTokenizer::SetJiebaRes(cppjieba::DictTrie *dict, cppjieba::HMMModel *hmm) {
diff --git a/runtime/onnxruntime/src/tokenizer.h b/runtime/onnxruntime/src/tokenizer.h
index 149161b..166061b 100644
--- a/runtime/onnxruntime/src/tokenizer.h
+++ b/runtime/onnxruntime/src/tokenizer.h
@@ -17,8 +17,8 @@
 	vector<string>   m_id2token,m_id2punc;
 	map<string, int>  m_token2id,m_punc2id;
 
-	cppjieba::DictTrie *jieba_dict_trie_;
-    cppjieba::HMMModel *jieba_model_;
+	cppjieba::DictTrie *jieba_dict_trie_=nullptr;
+    cppjieba::HMMModel *jieba_model_=nullptr;
 	cppjieba::Jieba jieba_processor_;
 
 public:
diff --git a/runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py b/runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py
index 7b13654..c4c558e 100644
--- a/runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py
+++ b/runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py
@@ -14,7 +14,8 @@
 from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
                           OrtInferSession, TokenIDConverter, get_logger,
                           read_yaml)
-from .utils.postprocess_utils import sentence_postprocess
+from .utils.postprocess_utils import (sentence_postprocess,
+                                      sentence_postprocess_sentencepiece)
 from .utils.frontend import WavFrontend
 from .utils.timestamp_utils import time_stamp_lfr6_onnx
 from .utils.utils import pad_list, make_pad_mask
@@ -86,6 +87,10 @@
             self.pred_bias = config['model_conf']['predictor_bias']
         else:
             self.pred_bias = 0
+        if "lang" in config:
+            self.language = config['lang']
+        else:
+            self.language = None
 
     def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
         waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
@@ -111,7 +116,10 @@
                 preds = self.decode(am_scores, valid_token_lens)
                 if us_peaks is None:
                     for pred in preds:
-                        pred = sentence_postprocess(pred)
+                        if self.language == "en-bpe":
+                            pred = sentence_postprocess_sentencepiece(pred)
+                        else:
+                            pred = sentence_postprocess(pred)
                         asr_res.append({'preds': pred})
                 else:
                     for pred, us_peaks_ in zip(preds, us_peaks):
diff --git a/runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py b/runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py
index c005fc9..14d6c76 100644
--- a/runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py
+++ b/runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py
@@ -240,3 +240,54 @@
                 real_word_lists.append(ch)
         sentence = ''.join(word_lists).strip()
         return sentence, real_word_lists
+
+def sentence_postprocess_sentencepiece(words):
+    middle_lists = []
+    word_lists = []
+    word_item = ''
+
+    # wash words lists
+    for i in words:
+        word = ''
+        if isinstance(i, str):
+            word = i
+        else:
+            word = i.decode('utf-8')
+
+        if word in ['<s>', '</s>', '<unk>', '<OOV>']:
+            continue
+        else:
+            middle_lists.append(word)
+
+    # all alpha characters
+    for i, ch in enumerate(middle_lists):
+        word = ''
+        if '\u2581' in ch and i == 0:
+            word_item = ''
+            word = ch.replace('\u2581', '')
+            word_item += word
+        elif '\u2581' in ch and i != 0:
+            word_lists.append(word_item)
+            word_lists.append(' ')
+            word_item = ''
+            word = ch.replace('\u2581', '')
+            word_item += word
+        else:
+            word_item += ch
+    if word_item is not None:
+        word_lists.append(word_item)
+    #word_lists = abbr_dispose(word_lists)
+    real_word_lists = []
+    for ch in word_lists:
+        if ch != ' ':
+            if ch == "i":
+                ch = ch.replace("i", "I")
+            elif ch == "i'm":
+                ch = ch.replace("i'm", "I'm")
+            elif ch == "i've":
+                ch = ch.replace("i've", "I've")
+            elif ch == "i'll":
+                ch = ch.replace("i'll", "I'll")
+            real_word_lists.append(ch)
+    sentence = ''.join(word_lists)
+    return sentence, real_word_lists
\ No newline at end of file
diff --git a/runtime/python/onnxruntime/setup.py b/runtime/python/onnxruntime/setup.py
index 88d4b3a..b8dc3e1 100644
--- a/runtime/python/onnxruntime/setup.py
+++ b/runtime/python/onnxruntime/setup.py
@@ -13,7 +13,7 @@
 
 
 MODULE_NAME = 'funasr_onnx'
-VERSION_NUM = '0.2.3'
+VERSION_NUM = '0.2.4'
 
 setuptools.setup(
     name=MODULE_NAME,

--
Gitblit v1.9.1