From 3aea0e27ec912e2f761d11c75e84ad6e5f1a2b29 Mon Sep 17 00:00:00 2001
From: 雾聪 <wucong.lyb@alibaba-inc.com>
Date: 星期三, 09 八月 2023 10:59:13 +0800
Subject: [PATCH] Merge branch 'main' of https://github.com/alibaba-damo-academy/FunASR into main
---
funasr/runtime/python/onnxruntime/setup.py | 2
Acknowledge | 1
docs/index.rst | 4 -
funasr/runtime/readme_cn.md | 2
README_zh.md | 4
funasr/runtime/readme.md | 2
funasr/runtime/docs/websocket_protocol.md | 88 +++++++++++++++++++++++++++++
README.md | 4
docs/images/XVERSE.png | 0
funasr/runtime/docs/SDK_tutorial_online.md | 2
funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py | 7 +
funasr/runtime/docs/websocket_protocol_zh.md | 3
funasr/runtime/docs/SDK_tutorial_online_zh.md | 2
13 files changed, 106 insertions(+), 15 deletions(-)
diff --git a/Acknowledge b/Acknowledge
index 92d6e5e..b1e4e56 100644
--- a/Acknowledge
+++ b/Acknowledge
@@ -6,3 +6,4 @@
4. We acknowledge [ChinaTelecom](https://github.com/zhuzizyf/damo-fsmn-vad-infer-httpserver) for contributing the VAD runtime.
5. We acknowledge [RapidAI](https://github.com/RapidAI) for contributing the Paraformer and CT_Transformer-punc runtime.
6. We acknowledge [AiHealthx](http://www.aihealthx.com/) for contributing the websocket service and html5.
+7. We acknowledge [XVERSE](http://www.xverse.cn/index.html) for contributing the grpc service.
diff --git a/README.md b/README.md
index 5b62d3d..ebac5a0 100644
--- a/README.md
+++ b/README.md
@@ -63,8 +63,8 @@
## Contributors
-| <div align="left"><img src="docs/images/damo.png" width="180"/> | <div align="left"><img src="docs/images/nwpu.png" width="260"/> | <img src="docs/images/China_Telecom.png" width="200"/> </div> | <img src="docs/images/RapidAI.png" width="200"/> </div> | <img src="docs/images/aihealthx.png" width="200"/> </div> |
-|:---------------------------------------------------------------:|:---------------------------------------------------------------:|:--------------------------------------------------------------:|:-------------------------------------------------------:|:-----------------------------------------------------------:|
+| <div align="left"><img src="docs/images/damo.png" width="180"/> | <div align="left"><img src="docs/images/nwpu.png" width="260"/> | <img src="docs/images/China_Telecom.png" width="200"/> </div> | <img src="docs/images/RapidAI.png" width="200"/> </div> | <img src="docs/images/aihealthx.png" width="200"/> </div> | <img src="docs/images/XVERSE.png" width="250"/> </div> |
+|:---------------------------------------------------------------:|:---------------------------------------------------------------:|:--------------------------------------------------------------:|:-------------------------------------------------------:|:-----------------------------------------------------------:|:------------------------------------------------------:|
The contributors can be found in [contributors list]((./Acknowledge))
diff --git a/README_zh.md b/README_zh.md
index c86d3dc..c1ecfd3 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -60,8 +60,8 @@
## 绀惧尯璐$尞鑰�
-| <div align="left"><img src="docs/images/damo.png" width="180"/> | <div align="left"><img src="docs/images/nwpu.png" width="260"/> | <img src="docs/images/China_Telecom.png" width="200"/> </div> | <img src="docs/images/RapidAI.png" width="200"/> </div> | <img src="docs/images/aihealthx.png" width="200"/> </div> |
-|:---------------------------------------------------------------:|:---------------------------------------------------------------:|:--------------------------------------------------------------:|:-------------------------------------------------------:|:-----------------------------------------------------------:|
+| <div align="left"><img src="docs/images/damo.png" width="180"/> | <div align="left"><img src="docs/images/nwpu.png" width="260"/> | <img src="docs/images/China_Telecom.png" width="200"/> </div> | <img src="docs/images/RapidAI.png" width="200"/> </div> | <img src="docs/images/aihealthx.png" width="200"/> </div> | <img src="docs/images/XVERSE.png" width="250"/> </div> |
+|:---------------------------------------------------------------:|:---------------------------------------------------------------:|:--------------------------------------------------------------:|:-------------------------------------------------------:|:-----------------------------------------------------------:|:------------------------------------------------------:|
璐$尞鑰呭悕鍗曡鍙傝�冿紙[鑷磋阿鍚嶅崟](./Acknowledge)锛�
diff --git a/docs/images/XVERSE.png b/docs/images/XVERSE.png
new file mode 100644
index 0000000..12e6e49
--- /dev/null
+++ b/docs/images/XVERSE.png
Binary files differ
diff --git a/docs/index.rst b/docs/index.rst
index eb3016b..b79aee0 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -71,11 +71,9 @@
:maxdepth: 1
:caption: Runtime and Service
-
+ ./funasr/runtime/readme.md
./funasr/runtime/docs/SDK_tutorial_online.md
./funasr/runtime/docs/SDK_tutorial.md
- ./funasr/runtime/python/websocket/README.md
- ./funasr/runtime/websocket/readme.md
./funasr/runtime/html5/readme.md
diff --git a/funasr/runtime/docs/SDK_tutorial_online.md b/funasr/runtime/docs/SDK_tutorial_online.md
index b4800bb..bc02176 100644
--- a/funasr/runtime/docs/SDK_tutorial_online.md
+++ b/funasr/runtime/docs/SDK_tutorial_online.md
@@ -29,7 +29,7 @@
# curl -O https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/shell/funasr-runtime-deploy-online-cpu-en.sh;
```
-Execute the deployment tool and press the Enter key at the prompt to complete the installation and deployment of the server. Currently, the convenient deployment tool only supports Linux environments. For other environments, please refer to the development guide ([docs](./SDK_advanced_guide_offline.md)).
+Execute the deployment tool and press the Enter key at the prompt to complete the installation and deployment of the server. Currently, the convenient deployment tool only supports Linux environments. For other environments, please refer to the development guide ([docs](./SDK_advanced_guide_online.md)).
```shell
sudo bash funasr-runtime-deploy-online-cpu-zh.sh install --workspace ./funasr-runtime-resources
```
diff --git a/funasr/runtime/docs/SDK_tutorial_online_zh.md b/funasr/runtime/docs/SDK_tutorial_online_zh.md
index 89ba6f3..159b0d7 100644
--- a/funasr/runtime/docs/SDK_tutorial_online_zh.md
+++ b/funasr/runtime/docs/SDK_tutorial_online_zh.md
@@ -30,7 +30,7 @@
# curl -O https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/shell/funasr-runtime-deploy-online-cpu-zh.sh;
```
-鎵ц閮ㄧ讲宸ュ叿锛屽湪鎻愮ず澶勮緭鍏ュ洖杞﹂敭鍗冲彲瀹屾垚鏈嶅姟绔畨瑁呬笌閮ㄧ讲銆傜洰鍓嶄究鎹烽儴缃插伐鍏锋殏鏃朵粎鏀寔Linux鐜锛屽叾浠栫幆澧冮儴缃插弬鑰冨紑鍙戞寚鍗楋紙[鐐瑰嚮姝ゅ](#瀹㈡埛绔敤娉曡瑙�)锛�
+鎵ц閮ㄧ讲宸ュ叿锛屽湪鎻愮ず澶勮緭鍏ュ洖杞﹂敭鍗冲彲瀹屾垚鏈嶅姟绔畨瑁呬笌閮ㄧ讲銆傜洰鍓嶄究鎹烽儴缃插伐鍏锋殏鏃朵粎鏀寔Linux鐜锛屽叾浠栫幆澧冮儴缃插弬鑰冨紑鍙戞寚鍗楋紙[鐐瑰嚮姝ゅ](./SDK_advanced_guide_online_zh.md)锛�
```shell
sudo bash funasr-runtime-deploy-online-cpu-zh.sh install --workspace ./funasr-runtime-resources
```
diff --git a/funasr/runtime/docs/websocket_protocol.md b/funasr/runtime/docs/websocket_protocol.md
new file mode 100644
index 0000000..94823a2
--- /dev/null
+++ b/funasr/runtime/docs/websocket_protocol.md
@@ -0,0 +1,88 @@
+([简体中文](./websocket_protocol_zh.md)|English)
+
+# WebSocket/gRPC Communication Protocol
+## Offline File Transcription
+### Sending Data from Client to Server
+#### Message Format
+Configuration parameters and meta information are in JSON format, while audio data is in bytes.
+#### Initial Communication
+The message (which needs to be serialized in JSON) is:
+```text
+{"mode": "offline", "wav_name": "wav_name", "is_speaking": True,"wav_format":"pcm"}
+```
+Parameter explanation:
+```text
+`mode`: `offline`, indicating the inference mode for offline file transcription
+`wav_name`: the name of the audio file to be transcribed
+`wav_format`: the audio and video file extension, such as pcm, mp3, mp4, etc.
+`is_speaking`: False indicates the end of a sentence, such as a VAD segmentation point or the end of a WAV file
+`audio_fs`: when the input audio is in PCM format, the audio sampling rate parameter needs to be added
+```
+
+#### Sending Audio Data
+For PCM format, directly send the audio data. For other audio formats, send the header information and audio and video bytes data together. Multiple sampling rates and audio and video formats are supported.
+
+#### Sending End of Audio Flag
+After sending the audio data, an end-of-audio flag needs to be sent (which needs to be serialized in JSON):
+```text
+{"is_speaking": False}
+```
+
+### Sending Data from Server to Client
+#### Sending Recognition Results
+The message (serialized in JSON) is:
+```text
+{"mode": "offline", "wav_name": "wav_name", "text": "asr outputs", "is_final": True}
+```
+Parameter explanation:
+```text
+`mode`: `offline`, indicating the inference mode for offline file transcription
+`wav_name`: the name of the audio file to be transcribed
+`text`: the text output of speech recognition
+`is_final`: indicating the end of recognition
+```
+
+## Real-time Speech Recognition
+### System Architecture Diagram
+
+<div align="left"><img src="images/2pass.jpg" width="400"/></div>
+
+### Sending Data from Client to Server
+#### Message Format
+Configuration parameters and meta information are in JSON format, while audio data is in bytes.
+
+#### Initial Communication
+The message (which needs to be serialized in JSON) is:
+```text
+{"mode": "2pass", "wav_name": "wav_name", "is_speaking": True, "wav_format":"pcm", "chunk_size":[5,10,5]}
+```
+Parameter explanation:
+```text
+`mode`: `offline` indicates the inference mode for single-sentence recognition; `online` indicates the inference mode for real-time speech recognition; `2pass` indicates real-time speech recognition and offline model correction for sentence endings.
+`wav_name`: the name of the audio file to be transcribed
+`wav_format`: the audio and video file extension, such as pcm, mp3, mp4, etc. (Note: only PCM audio streams are supported in version 1.0)
+`is_speaking`: False indicates the end of a sentence, such as a VAD segmentation point or the end of a WAV file
+`chunk_size`: indicates the latency configuration of the streaming model, `[5,10,5]` indicates that the current audio is 600ms long, with a 300ms look-ahead and look-back time.
+`audio_fs`: when the input audio is in PCM format, the audio sampling rate parameter needs to be added
+```
+#### Sending Audio Data
+Directly send the audio data, removing the header information and sending only the bytes data. Supported audio sampling rates are 8000 (which needs to be specified as audio_fs in message), and 16000.
+#### Sending End of Audio Flag
+After sending the audio data, an end-of-audio flag needs to be sent (which needs to be serialized in JSON):
+```text
+{"is_speaking": False}
+```
+### Sending Data from Server to Client
+#### Sending Recognition Results
+The message (serialized in JSON) is:
+
+```text
+{"mode": "2pass-online", "wav_name": "wav_name", "text": "asr outputs", "is_final": True}
+```
+Parameter explanation:
+```text
+`mode`: indicates the inference mode, divided into `2pass-online` for real-time recognition results and `2pass-offline` for 2-pass corrected recognition results.
+`wav_name`: the name of the audio file to be transcribed
+`text`: the text output of speech recognition
+`is_final`: indicating the end of recognition
+```
\ No newline at end of file
diff --git a/funasr/runtime/docs/websocket_protocol_zh.md b/funasr/runtime/docs/websocket_protocol_zh.md
index 4b07a45..38ab3c4 100644
--- a/funasr/runtime/docs/websocket_protocol_zh.md
+++ b/funasr/runtime/docs/websocket_protocol_zh.md
@@ -1,3 +1,4 @@
+(简体中文|[English](./websocket_protocol.md))
# websocket/grpc閫氫俊鍗忚
## 绂荤嚎鏂囦欢杞啓
### 浠庡鎴风寰�鏈嶅姟绔彂閫佹暟鎹�
@@ -64,7 +65,7 @@
`audio_fs`锛氬綋杈撳叆闊抽涓簆cm鏁版嵁鏄紝闇�瑕佸姞涓婇煶棰戦噰鏍风巼鍙傛暟
```
#### 鍙戦�侀煶棰戞暟鎹�
-鐩存帴灏嗛煶棰戞暟鎹紝绉婚櫎澶撮儴淇℃伅鍚庣殑bytes鏁版嵁鍙戦�侊紝鏀寔闊抽閲囨牱鐜囦负80000锛�16000
+鐩存帴灏嗛煶棰戞暟鎹紝绉婚櫎澶撮儴淇℃伅鍚庣殑bytes鏁版嵁鍙戦�侊紝鏀寔闊抽閲囨牱鐜囦负8000锛坄message`涓渶瑕佹寚瀹歚audio_fs`涓�8000锛夛紝16000
#### 鍙戦�佺粨鏉熸爣蹇�
闊抽鏁版嵁鍙戦�佺粨鏉熷悗锛岄渶瑕佸彂閫佺粨鏉熸爣蹇楋紙闇�瑕佺敤json搴忓垪鍖栵級锛�
```text
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
index 170126d..f1fc9a0 100644
--- a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
@@ -9,8 +9,11 @@
import re
import numpy as np
import yaml
-from onnxruntime import (GraphOptimizationLevel, InferenceSession,
- SessionOptions, get_available_providers, get_device)
+try:
+ from onnxruntime import (GraphOptimizationLevel, InferenceSession,
+ SessionOptions, get_available_providers, get_device)
+except:
+ print("please pip3 install onnxruntime")
import jieba
import warnings
diff --git a/funasr/runtime/python/onnxruntime/setup.py b/funasr/runtime/python/onnxruntime/setup.py
index 6cc1989..8f71815 100644
--- a/funasr/runtime/python/onnxruntime/setup.py
+++ b/funasr/runtime/python/onnxruntime/setup.py
@@ -13,7 +13,7 @@
MODULE_NAME = 'funasr_onnx'
-VERSION_NUM = '0.1.2'
+VERSION_NUM = '0.2.0'
setuptools.setup(
name=MODULE_NAME,
diff --git a/funasr/runtime/readme.md b/funasr/runtime/readme.md
index 7980029..f330f0e 100644
--- a/funasr/runtime/readme.md
+++ b/funasr/runtime/readme.md
@@ -1,4 +1,4 @@
-# FunASR runtime-SDK
+# FunASR Runtime Roadmap
涓枃鏂囨。锛圼鐐瑰嚮姝ゅ](./readme_cn.md)锛�
FunASR is a speech recognition framework developed by the Speech Lab of DAMO Academy, which integrates industrial-level models in the fields of speech endpoint detection, speech recognition, punctuation segmentation, and more.
diff --git a/funasr/runtime/readme_cn.md b/funasr/runtime/readme_cn.md
index c91dae7..8ee29cd 100644
--- a/funasr/runtime/readme_cn.md
+++ b/funasr/runtime/readme_cn.md
@@ -1,4 +1,4 @@
-# FunASR runtime-SDK
+# FunASR杞欢鍖呰矾绾垮浘
English Version锛圼docs](./readme.md)锛�
--
Gitblit v1.9.1