Merge branch 'main' of github.com:alibaba-damo-academy/FunASR
add
### Punctuation Restoration

| Model Name | Language | Training Data | Parameters | Vocab Size | Offline/Online | Notes |
|:----------:|:---------|:-------------:|:----------:|:----------:|:--------------:|:------|
| [CT-Transformer-Large](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) | CN & EN | Alibaba Text Data (100M) | 1.1G | 471067 | Offline | large offline punctuation model |
| [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) | CN & EN | Alibaba Text Data (70M) | 291M | 272727 | Offline | offline punctuation model |
| [CT-Transformer-Realtime](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727/summary) | CN & EN | Alibaba Text Data (70M) | 288M | 272727 | Online | online punctuation model |
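These models plug into the same ModelScope `pipeline` API that the serving code later in this PR relies on. Below is a minimal sketch of offline punctuation restoration with the CT-Transformer model from the table; the sample transcript is illustrative:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# load the offline CT-Transformer punctuation model listed above
inference_pipeline = pipeline(
    task=Tasks.punctuation,
    model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")

# restore punctuation for a raw, unpunctuated ASR transcript (sample text is illustrative)
rec_result = inference_pipeline(text_in="跨境河流是养育沿岸人民的生命之源")
print(rec_result)  # a dict whose 'text' field holds the punctuated string
```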
### Language Models

### Punctuation Restoration Models

| Model Name | Language | Training Data | Parameters | Vocab Size | Offline/Online | Notes |
|:----------:|:--------:|:-------------:|:----------:|:----------:|:--------------:|:------|
| [CT-Transformer-Large](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) | Chinese & English | Alibaba Text Data (100M) | 1.1G | 471067 | Offline | large model, supports Chinese and English punctuation |
| [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) | Chinese & English | Alibaba Text Data (70M) | 291M | 272727 | Offline | supports Chinese and English punctuation |
| [CT-Transformer-Realtime](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727/summary) | Chinese & English | Alibaba Text Data (70M) | 288M | 272727 | Online | real-time punctuation at VAD segment points |

### Language Models

quantize_dtype: str = "qint8",
nbest: int = 1,
streaming: bool = False,
fake_streaming: bool = False,
full_utt: bool = False,
chunk_size: int = 16,
left_context: int = 32,

self.beam_search = beam_search
self.streaming = streaming
self.fake_streaming = fake_streaming
self.full_utt = full_utt
self.chunk_size = max(chunk_size, 0)
self.left_context = left_context

self.streaming = False
self.asr_model.encoder.dynamic_chunk_training = False

if not fake_streaming or chunk_size == 0:
    self.fake_streaming = False
    self.asr_model.encoder.dynamic_chunk_training = False

self.frontend = frontend
return nbest_hyps

@torch.no_grad()
def fake_streaming_decode(self, speech: Union[torch.Tensor, np.ndarray]) -> List[HypothesisTransducer]:
    """Speech2Text call.

    Args:
        speech: Speech data. (S)
else:
    text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
item = {'key': key, 'value': text_postprocessed}
if timestamp_postprocessed != "":
    item['timestamp'] = timestamp_postprocessed
asr_result_list.append(item)
finish_count += 1

item = {'key': key, 'value': text_postprocessed_punc}
if text_postprocessed != "":
    item['text_postprocessed'] = text_postprocessed
if time_stamp_postprocessed != "":
    item['time_stamp'] = time_stamp_postprocessed

item['sentences'] = time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocessed)
quantize_modules: Optional[List[str]] = None,
quantize_dtype: Optional[str] = "float16",
streaming: Optional[bool] = False,
fake_streaming: Optional[bool] = False,
full_utt: Optional[bool] = False,
chunk_size: Optional[int] = 16,
left_context: Optional[int] = 16,

quantize_modules=quantize_modules,
quantize_dtype=quantize_dtype,
streaming=streaming,
fake_streaming=fake_streaming,
full_utt=full_utt,
chunk_size=chunk_size,
left_context=left_context,

final_hyps = speech2text.streaming_decode(
    speech[_end: len(speech)], is_final=True
)
elif speech2text.fake_streaming:
    final_hyps = speech2text.fake_streaming_decode(**batch)
elif speech2text.full_utt:
    final_hyps = speech2text.full_utt_decode(**batch)
else:

group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
group.add_argument("--streaming", type=str2bool, default=False)
group.add_argument("--fake_streaming", type=str2bool, default=False)
group.add_argument("--full_utt", type=str2bool, default=False)
group.add_argument("--chunk_size", type=int, default=16)
group.add_argument("--left_context", type=int, default=16)
self.seg_dict = None
if seg_dict_file is not None:
    self.seg_dict = {}
    with open(seg_dict_file, "r", encoding="utf8") as f:
        lines = f.readlines()
        for line in lines:
            s = line.strip().split()
--host [host ip] \
--port [server port] \
--asr_model [asr model name] \
--vad_model [vad model name] \
--punc_model [punc model name] \
--ngpu [0 or 1] \
--ncpu [1 or 4] \
--hotword_path [path of hotword txt] \
--certfile [path of certfile for ssl] \
--keyfile [path of keyfile for ssl] \
--temp_dir [upload file temp dir]

--add_pun [whether to add punctuation to the result] \
--audio_path [path of the audio file]
```

## Multi-process Support

Start multiple instances of `server.py` and distribute requests across them with Nginx load balancing, so that multiple users can connect at the same time. The steps are as follows, assuming Nginx is already installed; if not, see the [official installation guide](https://nginx.org/en/linux_packages.html#Ubuntu).

Configure Nginx:
```shell
sudo cp -f asr_nginx.conf /etc/nginx/nginx.conf
sudo service nginx reload
```

Then use the script to start multiple services, each listening on a different port:
```shell
sudo chmod +x start_server.sh
./start_server.sh
```

**Note:** Three processes are started by default. To change this, first edit the last part of `start_server.sh` to add more launch commands, then add the newly started services to the `upstream backend` section of `asr_nginx.conf`; these may also be services running on other machines.
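To check that the load balancer is wired up, a request can be sent through the Nginx entry port instead of an individual service port. A minimal sketch, assuming the `/recognition` endpoint and form fields from `server.py` below and the port 8000 configured in `asr_nginx.conf`; the audio file name is illustrative:

```python
import requests

# post through Nginx (port 8000); it forwards the request to one of the
# server.py instances listening on ports 8001-8003
url = 'http://localhost:8000/recognition'
files = [('audio', ('test.wav', open('test.wav', 'rb'), 'application/octet-stream'))]
response = requests.post(url, data={'add_pun': 1}, files=files)
print(response.text)
```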
New file: asr_nginx.conf

user nginx;
worker_processes auto;

error_log /var/log/nginx/error.log notice;
pid /var/run/nginx.pid;

events {
    worker_connections 1024;
}


http {
    include /etc/nginx/mime.types;
    default_type application/octet-stream;

    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
                    '$status $body_bytes_sent "$http_referer" '
                    '"$http_user_agent" "$http_x_forwarded_for"';

    access_log /var/log/nginx/access.log main;

    sendfile on;
    keepalive_timeout 65;

    upstream backend {
        # least-connections balancing
        least_conn;
        # addresses of the started services
        server localhost:8001;
        server localhost:8002;
        server localhost:8003;
    }

    server {
        # the port actually exposed to clients
        listen 8000;

        location / {
            proxy_pass http://backend;
        }
    }

    include /etc/nginx/conf.d/*.conf;
}
import argparse
import os

import requests

                    required=False,
                    help="path of the audio file")
args = parser.parse_args()
print("----------- Configuration Arguments -----------")
for arg, value in vars(args).items():
    print("%s: %s" % (arg, value))
print("------------------------------------------------")


url = f'http://{args.host}:{args.port}/recognition'
data = {'add_pun': args.add_pun}
headers = {}
files = [('audio', (os.path.basename(args.audio_path), open(args.audio_path, 'rb'), 'application/octet-stream'))]

response = requests.post(url, headers=headers, data=data, files=files)
print(response.text)
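Since `server.py` below replies with a small JSON object of the form `{"results": ..., "code": 0}`, the script above can decode the reply instead of printing raw text. A short continuation sketch of the client script:

```python
# decode the JSON reply from server.py: {"results": <text>, "code": 0}
result = response.json()
if result.get('code') == 0:
    print('recognized text:', result['results'])
else:
    print('unexpected reply:', result)
```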
import argparse
import logging
import os
import random
import time
import uuid

import aiofiles
import ffmpeg

parser.add_argument("--asr_model",
                    type=str,
                    default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                    help="offline asr model from modelscope")
parser.add_argument("--vad_model",
                    type=str,
                    default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                    help="vad model from modelscope")
parser.add_argument("--punc_model",
                    type=str,
                    default="damo/punc_ct-transformer_cn-en-common-vocab471067-large",
                    help="punc model from modelscope")
parser.add_argument("--ngpu",
                    type=int,
                    default=1,

                    type=int,
                    default=4,
                    help="cpu cores")
parser.add_argument("--hotword_path",
                    type=str,
                    default=None,
                    help="hotword txt path; only effective for hotword models")
parser.add_argument("--certfile",
                    type=str,
                    default=None,

                    required=False,
                    help="temp dir")
args = parser.parse_args()
print("----------- Configuration Arguments -----------")
for arg, value in vars(args).items():
    print("%s: %s" % (arg, value))
print("------------------------------------------------")


os.makedirs(args.temp_dir, exist_ok=True)

print("model loading")
param_dict = {}
if args.hotword_path is not None and os.path.exists(args.hotword_path):
    param_dict['hotword'] = args.hotword_path
# asr
inference_pipeline_asr = pipeline(task=Tasks.auto_speech_recognition,
                                  model=args.asr_model,
                                  vad_model=args.vad_model,
                                  ngpu=args.ngpu,
                                  ncpu=args.ncpu,
                                  param_dict=param_dict)
print('loaded asr models.')

if args.punc_model != "":
    inference_pipeline_punc = pipeline(task=Tasks.punctuation,
                                       model=args.punc_model,
                                       model_revision="v1.0.2",
                                       ngpu=args.ngpu,
                                       ncpu=args.ncpu)
    print('loaded punc models.')

async def api_recognition(audio: UploadFile = File(..., description="audio file"),
                          add_pun: int = Body(1, description="add punctuation", embed=True)):
    suffix = audio.filename.split('.')[-1]
    # use a uuid so concurrent uploads cannot collide on the temp file name
    audio_path = f'{args.temp_dir}/{str(uuid.uuid1())}.{suffix}'
    async with aiofiles.open(audio_path, 'wb') as out_file:
        content = await audio.read()
        await out_file.write(content)

    if add_pun:
        rec_result = inference_pipeline_punc(text_in=rec_result['text'], param_dict={'cache': list()})
    ret = {"results": rec_result['text'], "code": 0}
    print(ret)
    return ret
New file: start_server.sh

#!/bin/bash

# create the log directory
if [ ! -d "log/" ];then
  mkdir log
fi

# kill any previously started server processes
server_id=`ps -ef | grep server.py | grep -v "grep" | awk '{print $2}'`
echo $server_id

for id in $server_id
do
  kill -9 $id
  echo "killed $id"
done

# start multiple services; each line can be pinned to a different GPU
CUDA_VISIBLE_DEVICES=0 nohup python -u server.py --host=localhost --port=8001 >> log/output1.log 2>&1 &
CUDA_VISIBLE_DEVICES=0 nohup python -u server.py --host=localhost --port=8002 >> log/output2.log 2>&1 &
CUDA_VISIBLE_DEVICES=0 nohup python -u server.py --host=localhost --port=8003 >> log/output3.log 2>&1 &
# --chunk_size: "5,10,5" = 600 ms, "8,8,4" = 480 ms
python funasr_wss_client.py --host "0.0.0.0" --port 10095 --mode 2pass --chunk_size "8,8,4" --audio_in "./data/wav.scp" --output_dir "./results"
```
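The millisecond values in the comment follow from the client's frame math: each unit of the middle `chunk_size` entry stands for 60 ms of audio, the same constant used in the stride formula of the recognizer class at the end of this PR. A small sketch of the arithmetic:

```python
# 60 ms per frame unit; the middle chunk_size entry sets the chunk length
def chunk_duration_ms(chunk_size: str) -> int:
    sizes = [int(x) for x in chunk_size.split(",")]
    return 60 * sizes[1]

assert chunk_duration_ms("5,10,5") == 600
assert chunk_duration_ms("8,8,4") == 480
```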

#### WebSocket API
```python
# Funasr_websocket_recognizer example in three steps
# 1. create a recognizer
rcg = Funasr_websocket_recognizer(host="127.0.0.1", port="30035", is_ssl=True, mode="2pass")
# 2. send pcm data to the asr engine and get the asr result
text = rcg.feed_chunk(data)
print("text", text)
# 3. get the last result, waiting up to timeout=3 seconds
text = rcg.close(timeout=3)
print("text", text)
```

## Acknowledgements
1. This project is maintained by the [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
2. We thank [zhaoming](https://github.com/zhaomingwork/FunASR/tree/fix_bug_for_python_websocket) for contributing the websocket service.
New file

'''
Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
Reserved. MIT License (https://opensource.org/licenses/MIT)

2022-2023 by zhaomingwork@qq.com
'''
# pip install websocket-client
import ssl
from websocket import ABNF
from websocket import create_connection
from queue import Queue
import threading
import traceback
import json
import time
import numpy as np


# class for recognizer in websocket
class Funasr_websocket_recognizer():
    '''
    python asr recognizer lib
    '''

    def __init__(self, host="127.0.0.1", port="30035", is_ssl=True, chunk_size="5, 10, 5",
                 chunk_interval=10, mode="offline", wav_name="default"):
        '''
        host: server host ip
        port: server port
        is_ssl: True for wss protocol, False for ws
        '''
        try:
            if is_ssl:
                ssl_context = ssl.SSLContext()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE
                uri = "wss://{}:{}".format(host, port)
                ssl_opt = {"cert_reqs": ssl.CERT_NONE}
            else:
                uri = "ws://{}:{}".format(host, port)
                ssl_context = None
                ssl_opt = None
            self.host = host
            self.port = port

            self.msg_queue = Queue()  # used for recognized result text

            print("connect to url", uri)
            self.websocket = create_connection(uri, ssl=ssl_context, sslopt=ssl_opt)

            # receiver thread that pushes every server message into msg_queue
            self.thread_msg = threading.Thread(target=Funasr_websocket_recognizer.thread_rec_msg,
                                               args=(self,))
            self.thread_msg.start()
            chunk_size = [int(x) for x in chunk_size.split(",")]

            # open the stream with the negotiated decoding parameters
            message = json.dumps({"mode": mode, "chunk_size": chunk_size, "chunk_interval": chunk_interval,
                                  "wav_name": wav_name, "is_speaking": True})
            self.websocket.send(message)
            print("send json", message)
        except Exception as e:
            print("Exception:", e)
            traceback.print_exc()

    # thread for receiving messages
    def thread_rec_msg(self):
        try:
            while True:
                msg = self.websocket.recv()
                if msg is None or len(msg) == 0:
                    continue
                msg = json.loads(msg)
                self.msg_queue.put(msg)
        except Exception:
            print("client closed")

    # feed data to the asr engine; wait_time is how long to wait for a result before timing out
    def feed_chunk(self, chunk, wait_time=0.01):
        try:
            self.websocket.send(chunk, ABNF.OPCODE_BINARY)
            # drain the queue and keep only the latest message; get() times out after wait_time
            while True:
                msg = self.msg_queue.get(timeout=wait_time)
                if self.msg_queue.empty():
                    break
            return msg
        except Exception:
            return ""

    def close(self, timeout=1):
        message = json.dumps({"is_speaking": False})
        self.websocket.send(message)
        # sleep for timeout seconds to wait for the final result
        time.sleep(timeout)
        msg = ""
        while not self.msg_queue.empty():
            msg = self.msg_queue.get()
        self.websocket.close()
        # only return the last msg
        return msg


if __name__ == '__main__':
    print('example for Funasr_websocket_recognizer')
    import wave
    wav_path = "asr_example.wav"
    with wave.open(wav_path, "rb") as wav_file:
        params = wav_file.getparams()
        frames = wav_file.readframes(wav_file.getnframes())
        audio_bytes = bytes(frames)

    # 60 ms of 16 kHz, 16-bit mono audio per send
    stride = int(60 * 10 / 10 / 1000 * 16000 * 2)
    chunk_num = (len(audio_bytes) - 1) // stride + 1
    # create a recognizer
    rcg = Funasr_websocket_recognizer(host="127.0.0.1", port="30035", is_ssl=True, mode="2pass")
    # loop to send chunks
    for i in range(chunk_num):
        beg = i * stride
        data = audio_bytes[beg:beg + stride]
        text = rcg.feed_chunk(data, wait_time=0.02)
        if len(text) > 0:
            print("text", text)
        time.sleep(0.05)

    # get the last message
    text = rcg.close(timeout=3)
    print("text", text)