Merge branch 'main' of github.com:alibaba-damo-academy/FunASR
add
### Punctuation Restoration

| Model Name | Language | Training Data | Parameters | Vocab Size | Offline/Online | Notes |
|:----------:|:---------|:-------------:|:----------:|:----------:|:--------------:|:------|
| [CT-Transformer-Large](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) | CN & EN | Alibaba Text Data (100M) | 1.1G | 471067 | Offline | large offline punctuation model |
| [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) | CN & EN | Alibaba Text Data (70M) | 291M | 272727 | Offline | offline punctuation model |
| [CT-Transformer-Realtime](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727/summary) | CN & EN | Alibaba Text Data (70M) | 288M | 272727 | Online | online punctuation model |
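These models plug into the same ModelScope `pipeline` API that the serving code later in this PR relies on. Below is a minimal sketch of offline punctuation restoration with the CT-Transformer model from the table; the sample transcript is illustrative:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# load the offline CT-Transformer punctuation model listed above
inference_pipeline = pipeline(
    task=Tasks.punctuation,
    model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")

# restore punctuation for a raw, unpunctuated ASR transcript (sample text is illustrative)
rec_result = inference_pipeline(text_in="跨境河流是养育沿岸人民的生命之源")
print(rec_result)  # a dict whose 'text' field holds the punctuated string
```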
### Language Models

### Punctuation Restoration Models

| Model Name | Language | Training Data | Parameters | Vocab Size | Offline/Online | Notes |
|:----------:|:--------:|:-------------:|:----------:|:----------:|:--------------:|:------|
| [CT-Transformer-Large](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) | Chinese & English | Alibaba Text Data (100M) | 1.1G | 471067 | Offline | large model, supports Chinese and English punctuation |
| [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) | Chinese & English | Alibaba Text Data (70M) | 291M | 272727 | Offline | supports Chinese and English punctuation |
| [CT-Transformer-Realtime](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727/summary) | Chinese & English | Alibaba Text Data (70M) | 288M | 272727 | Online | real-time punctuation at VAD segment points |

### Language Models

quantize_dtype: str = "qint8",
nbest: int = 1,
streaming: bool = False,
fake_streaming: bool = False,
full_utt: bool = False,
chunk_size: int = 16,
left_context: int = 32,

self.beam_search = beam_search
self.streaming = streaming
self.fake_streaming = fake_streaming
self.full_utt = full_utt
self.chunk_size = max(chunk_size, 0)
self.left_context = left_context

self.streaming = False
self.asr_model.encoder.dynamic_chunk_training = False

if not fake_streaming or chunk_size == 0:
    self.fake_streaming = False
    self.asr_model.encoder.dynamic_chunk_training = False

self.frontend = frontend
return nbest_hyps

@torch.no_grad()
def fake_streaming_decode(self, speech: Union[torch.Tensor, np.ndarray]) -> List[HypothesisTransducer]:
    """Speech2Text call.

    Args:
        speech: Speech data. (S)
else:
    text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
item = {'key': key, 'value': text_postprocessed}
if timestamp_postprocessed != "":
    item['timestamp'] = timestamp_postprocessed
asr_result_list.append(item)
finish_count += 1

item = {'key': key, 'value': text_postprocessed_punc}
if text_postprocessed != "":
    item['text_postprocessed'] = text_postprocessed
if time_stamp_postprocessed != "":
    item['time_stamp'] = time_stamp_postprocessed

item['sentences'] = time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocessed)
quantize_modules: Optional[List[str]] = None,
quantize_dtype: Optional[str] = "float16",
streaming: Optional[bool] = False,
fake_streaming: Optional[bool] = False,
full_utt: Optional[bool] = False,
chunk_size: Optional[int] = 16,
left_context: Optional[int] = 16,

quantize_modules=quantize_modules,
quantize_dtype=quantize_dtype,
streaming=streaming,
fake_streaming=fake_streaming,
full_utt=full_utt,
chunk_size=chunk_size,
left_context=left_context,

final_hyps = speech2text.streaming_decode(
    speech[_end: len(speech)], is_final=True
)
elif speech2text.fake_streaming:
    final_hyps = speech2text.fake_streaming_decode(**batch)
elif speech2text.full_utt:
    final_hyps = speech2text.full_utt_decode(**batch)
else:

group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
group.add_argument("--streaming", type=str2bool, default=False)
group.add_argument("--fake_streaming", type=str2bool, default=False)
group.add_argument("--full_utt", type=str2bool, default=False)
group.add_argument("--chunk_size", type=int, default=16)
group.add_argument("--left_context", type=int, default=16)
self.seg_dict = None
if seg_dict_file is not None:
    self.seg_dict = {}
    with open(seg_dict_file, "r", encoding="utf8") as f:
        lines = f.readlines()
        for line in lines:
            s = line.strip().split()
--host [host ip] \
--port [server port] \
--asr_model [asr model name] \
--vad_model [vad model name] \
--punc_model [punc model name] \
--ngpu [0 or 1] \
--ncpu [1 or 4] \
--hotword_path [path of hotword txt] \
--certfile [path of certfile for ssl] \
--keyfile [path of keyfile for ssl] \
--temp_dir [upload file temp dir]

--add_pun [whether to add punctuation to the result] \
--audio_path [path of the audio file]
```

## Multi-process Support

Start multiple instances of `server.py` and distribute requests across them with Nginx load balancing, so that multiple users can connect at the same time. The steps are as follows, assuming Nginx is already installed; if not, see the [official installation guide](https://nginx.org/en/linux_packages.html#Ubuntu).

Configure Nginx:
```shell
sudo cp -f asr_nginx.conf /etc/nginx/nginx.conf
sudo service nginx reload
```

Then use the script to start multiple services, each listening on a different port:
```shell
sudo chmod +x start_server.sh
./start_server.sh
```

**Note:** Three processes are started by default. To change this, first edit the last part of `start_server.sh` to add more launch commands, then add the newly started services to the `upstream backend` section of `asr_nginx.conf`; these may also be services running on other machines.
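To check that the load balancer is wired up, a request can be sent through the Nginx entry port instead of an individual service port. A minimal sketch, assuming the `/recognition` endpoint and form fields from `server.py` below and the port 8000 configured in `asr_nginx.conf`; the audio file name is illustrative:

```python
import requests

# post through Nginx (port 8000); it forwards the request to one of the
# server.py instances listening on ports 8001-8003
url = 'http://localhost:8000/recognition'
files = [('audio', ('test.wav', open('test.wav', 'rb'), 'application/octet-stream'))]
response = requests.post(url, data={'add_pun': 1}, files=files)
print(response.text)
```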
New file: asr_nginx.conf

user nginx;
worker_processes auto;

error_log /var/log/nginx/error.log notice;
pid /var/run/nginx.pid;

events {
    worker_connections 1024;
}


http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;

    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
                    '$status $body_bytes_sent "$http_referer" '
                    '"$http_user_agent" "$http_x_forwarded_for"';

    access_log /var/log/nginx/access.log main;

    sendfile on;
    keepalive_timeout 65;

    upstream backend {
        # least-connections balancing
        least_conn;
        # addresses of the started services
        server localhost:8001;
        server localhost:8002;
        server localhost:8003;
    }

    server {
        # the port actually exposed to clients
        listen 8000;

        location / {
            proxy_pass http://backend;
        }
    }

    include /etc/nginx/conf.d/*.conf;
}
import argparse
import os

import requests

                    required=False,
                    help="path of the audio file")
args = parser.parse_args()
print("----------- Configuration Arguments -----------")
for arg, value in vars(args).items():
    print("%s: %s" % (arg, value))
print("------------------------------------------------")


url = f'http://{args.host}:{args.port}/recognition'
data = {'add_pun': args.add_pun}
headers = {}
files = [('audio', (os.path.basename(args.audio_path), open(args.audio_path, 'rb'), 'application/octet-stream'))]

response = requests.post(url, headers=headers, data=data, files=files)
print(response.text)
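Since `server.py` below replies with a small JSON object of the form `{"results": ..., "code": 0}`, the script above can decode the reply instead of printing raw text. A short continuation sketch of the client script:

```python
# decode the JSON reply from server.py: {"results": <text>, "code": 0}
result = response.json()
if result.get('code') == 0:
    print('recognized text:', result['results'])
else:
    print('unexpected reply:', result)
```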
import argparse
import logging
import os
import random
import time
import uuid

import aiofiles
import ffmpeg

parser.add_argument("--asr_model",
                    type=str,
                    default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                    help="offline asr model from modelscope")
parser.add_argument("--vad_model",
                    type=str,
                    default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                    help="vad model from modelscope")
parser.add_argument("--punc_model",
                    type=str,
                    default="damo/punc_ct-transformer_cn-en-common-vocab471067-large",
                    help="punc model from modelscope")
parser.add_argument("--ngpu",
                    type=int,
                    default=1,

                    type=int,
                    default=4,
                    help="cpu cores")
parser.add_argument("--hotword_path",
                    type=str,
                    default=None,
                    help="hotword txt path; only effective for hotword models")
parser.add_argument("--certfile",
                    type=str,
                    default=None,

                    required=False,
                    help="temp dir")
args = parser.parse_args()
print("----------- Configuration Arguments -----------")
for arg, value in vars(args).items():
    print("%s: %s" % (arg, value))
print("------------------------------------------------")


os.makedirs(args.temp_dir, exist_ok=True)

print("model loading")
param_dict = {}
if args.hotword_path is not None and os.path.exists(args.hotword_path):
    param_dict['hotword'] = args.hotword_path
# asr
inference_pipeline_asr = pipeline(task=Tasks.auto_speech_recognition,
                                  model=args.asr_model,
                                  vad_model=args.vad_model,
                                  ngpu=args.ngpu,
                                  ncpu=args.ncpu,
                                  param_dict=param_dict)
print('loaded asr models.')

if args.punc_model != "":
    inference_pipeline_punc = pipeline(task=Tasks.punctuation,
                                       model=args.punc_model,
                                       model_revision="v1.0.2",
                                       ngpu=args.ngpu,
                                       ncpu=args.ncpu)
    print('loaded punc models.')

async def api_recognition(audio: UploadFile = File(..., description="audio file"),
                          add_pun: int = Body(1, description="add punctuation", embed=True)):
    suffix = audio.filename.split('.')[-1]
    # use a uuid so concurrent uploads cannot collide on the temp file name
    audio_path = f'{args.temp_dir}/{str(uuid.uuid1())}.{suffix}'
    async with aiofiles.open(audio_path, 'wb') as out_file:
        content = await audio.read()
        await out_file.write(content)

    if add_pun:
        rec_result = inference_pipeline_punc(text_in=rec_result['text'], param_dict={'cache': list()})
    ret = {"results": rec_result['text'], "code": 0}
    print(ret)
    return ret
New file: start_server.sh

#!/bin/bash

# create the log directory
if [ ! -d "log/" ];then
  mkdir log
fi

# kill any previously started server processes
server_id=`ps -ef | grep server.py | grep -v "grep" | awk '{print $2}'`
echo $server_id

for id in $server_id
do
  kill -9 $id
  echo "killed $id"
done

# start multiple services; each line can be pinned to a different GPU
CUDA_VISIBLE_DEVICES=0 nohup python -u server.py --host=localhost --port=8001 >> log/output1.log 2>&1 &
CUDA_VISIBLE_DEVICES=0 nohup python -u server.py --host=localhost --port=8002 >> log/output2.log 2>&1 &
CUDA_VISIBLE_DEVICES=0 nohup python -u server.py --host=localhost --port=8003 >> log/output3.log 2>&1 &
# --chunk_size: "5,10,5" = 600 ms, "8,8,4" = 480 ms
python funasr_wss_client.py --host "0.0.0.0" --port 10095 --mode 2pass --chunk_size "8,8,4" --audio_in "./data/wav.scp" --output_dir "./results"
```
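The millisecond values in the comment follow from the client's frame math: each unit of the middle `chunk_size` entry stands for 60 ms of audio, the same constant used in the stride formula of the recognizer class at the end of this PR. A small sketch of the arithmetic:

```python
# 60 ms per frame unit; the middle chunk_size entry sets the chunk length
def chunk_duration_ms(chunk_size: str) -> int:
    sizes = [int(x) for x in chunk_size.split(",")]
    return 60 * sizes[1]

assert chunk_duration_ms("5,10,5") == 600
assert chunk_duration_ms("8,8,4") == 480
```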

#### WebSocket API
```python
# Funasr_websocket_recognizer example in three steps
# 1. create a recognizer
rcg = Funasr_websocket_recognizer(host="127.0.0.1", port="30035", is_ssl=True, mode="2pass")
# 2. send pcm data to the asr engine and get the asr result
text = rcg.feed_chunk(data)
print("text", text)
# 3. get the last result, waiting up to timeout=3 seconds
text = rcg.close(timeout=3)
print("text", text)
```

## Acknowledgements
1. This project is maintained by the [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
2. We thank [zhaoming](https://github.com/zhaomingwork/FunASR/tree/fix_bug_for_python_websocket) for contributing the websocket service.
New file

'''
Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
Reserved. MIT License (https://opensource.org/licenses/MIT)

2022-2023 by zhaomingwork@qq.com
'''
# pip install websocket-client
import ssl
from websocket import ABNF
from websocket import create_connection
from queue import Queue
import threading
import traceback
import json
import time
import numpy as np


# class for recognizer in websocket
class Funasr_websocket_recognizer():
    '''
    python asr recognizer lib
    '''

    def __init__(self, host="127.0.0.1", port="30035", is_ssl=True, chunk_size="5, 10, 5",
                 chunk_interval=10, mode="offline", wav_name="default"):
        '''
        host: server host ip
        port: server port
        is_ssl: True for wss protocol, False for ws
        '''
        try:
            if is_ssl:
                ssl_context = ssl.SSLContext()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                uri = "wss://{}:{}".format(host, port)
                ssl_opt = {"cert_reqs": ssl.CERT_NONE}
            else:
                uri = "ws://{}:{}".format(host, port)
                ssl_context = None
                ssl_opt = None
            self.host = host
            self.port = port

            self.msg_queue = Queue()  # used for recognized result text

            print("connect to url", uri)
            self.websocket = create_connection(uri, ssl=ssl_context, sslopt=ssl_opt)

            # receiver thread that pushes every server message into msg_queue
            self.thread_msg = threading.Thread(target=Funasr_websocket_recognizer.thread_rec_msg,
                                               args=(self,))
            self.thread_msg.start()
            chunk_size = [int(x) for x in chunk_size.split(",")]

            # open the stream with the negotiated decoding parameters
            message = json.dumps({"mode": mode, "chunk_size": chunk_size, "chunk_interval": chunk_interval,
                                  "wav_name": wav_name, "is_speaking": True})
            self.websocket.send(message)
            print("send json", message)
        except Exception as e:
            print("Exception:", e)
            traceback.print_exc()

    # thread for receiving messages
    def thread_rec_msg(self):
        try:
            while True:
                msg = self.websocket.recv()
                if msg is None or len(msg) == 0:
                    continue
                msg = json.loads(msg)
                self.msg_queue.put(msg)
        except Exception:
            print("client closed")

    # feed data to the asr engine; wait_time is how long to wait for a result before timing out
    def feed_chunk(self, chunk, wait_time=0.01):
        try:
            self.websocket.send(chunk, ABNF.OPCODE_BINARY)
            # drain the queue and keep only the latest message; get() times out after wait_time
            while True:
                msg = self.msg_queue.get(timeout=wait_time)
                if self.msg_queue.empty():
                    break
            return msg
        except Exception:
            return ""

    def close(self, timeout=1):
        message = json.dumps({"is_speaking": False})
        self.websocket.send(message)
        # sleep for timeout seconds to wait for the final result
        time.sleep(timeout)
        msg = ""
        while not self.msg_queue.empty():
            msg = self.msg_queue.get()
        self.websocket.close()
        # only return the last msg
        return msg


if __name__ == '__main__':
    print('example for Funasr_websocket_recognizer')
    import wave
    wav_path = "asr_example.wav"
    with wave.open(wav_path, "rb") as wav_file:
        params = wav_file.getparams()
        frames = wav_file.readframes(wav_file.getnframes())
        audio_bytes = bytes(frames)

    # 60 ms of 16 kHz, 16-bit mono audio per send
    stride = int(60 * 10 / 10 / 1000 * 16000 * 2)
    chunk_num = (len(audio_bytes) - 1) // stride + 1
    # create a recognizer
    rcg = Funasr_websocket_recognizer(host="127.0.0.1", port="30035", is_ssl=True, mode="2pass")
    # loop to send chunks
    for i in range(chunk_num):
        beg = i * stride
        data = audio_bytes[beg:beg + stride]
        text = rcg.feed_chunk(data, wait_time=0.02)
        if len(text) > 0:
            print("text", text)
        time.sleep(0.05)

    # get the last message
    text = rcg.close(timeout=3)
    print("text", text)