| | |
| | | # server.py NOTE: this example only handles audio sent by a single client; multiple client connections are not detected or handled. |
| | | import asyncio |
| | | import websockets |
| | | import time |
| | | from queue import Queue |
| | | import threading |
| | | import argparse |
| | | |
| | | from modelscope.pipelines import pipeline |
| | | from modelscope.utils.constant import Tasks |
| | | from modelscope.utils.logger import get_logger |
| | |
| | | logger = get_logger(log_level=logging.CRITICAL) |
| | | logger.setLevel(logging.CRITICAL) |
| | | |
| | | import asyncio |
| | | import websockets |
| | | import time |
| | | from queue import Queue |
| | | import threading |
| | | import argparse |
| | | |
| | | parser = argparse.ArgumentParser() |
| | | parser.add_argument("--host", |
| | |
| | | |
| | | parser.add_argument("--punc_model", |
| | | type=str, |
| | | default="", |
| | | default="damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727", |
| | | help="model from modelscope") |
| | | parser.add_argument("--ngpu", |
| | | type=int, |
| | |
# Module-level work queues.
# `speek` is drained by asr(); `voices` is presumably filled by the websocket
# handler with raw audio chunks -- TODO confirm against the handler code.
voices = Queue()
speek = Queue()

# vad: streaming ("online") voice-activity-detection pipeline.
inference_pipeline_vad = pipeline(
    task=Tasks.voice_activity_detection,
    model=args.vad_model,
    model_revision=None,
    output_dir=None,
    batch_size=1,
    mode='online',
    ngpu=args.ngpu,
)
# Passed to every VAD call by vad(); the same dict object is reused across
# calls (presumably so the pipeline can keep streaming state in 'in_cache').
param_dict_vad = {'in_cache': dict(), "is_final": False}

# asr: speech-recognition pipeline.
param_dict_asr = {}
# To enable hotwords, set a space-separated string before building the
# pipeline, e.g.: param_dict_asr["hotword"] = "word1 word2"
inference_pipeline_asr = pipeline(
    task=Tasks.auto_speech_recognition,
    model=args.asr_model,
    param_dict=param_dict_asr,
    ngpu=args.ngpu,
)

# punc: optional punctuation-restoration pipeline.
# The parameter dict is always defined so that code referencing it never hits
# a NameError, even when the punctuation model is disabled.
param_dict_punc = {'cache': list()}
# Truthiness (rather than `is not None`) so that an empty --punc_model string
# disables punctuation instead of trying to load a model named "".
if args.punc_model:
    inference_pipeline_punc = pipeline(
        task=Tasks.punctuation,
        model=args.punc_model,
        model_revision=None,
        ngpu=args.ngpu,
    )
else:
    inference_pipeline_punc = None

print("model loaded")
| | | |
| | | |
| | |
| | | |
| | | |
def vad(data):  # inference
    """Run one audio chunk through the VAD pipeline and report boundaries.

    The chunk is passed to ``inference_pipeline_vad`` together with the
    module-level ``param_dict_vad``, which is reused across calls.

    Args:
        data: Audio chunk, forwarded unchanged as ``audio_in``.

    Returns:
        A ``(speech_start, speech_end)`` tuple of booleans.
        ``speech_start`` is True when the single reported segment has a
        start value other than -1; ``speech_end`` is True when its end value
        is other than -1. Both are False when the pipeline returns nothing
        or does not report exactly one segment.
    """
    segments_result = inference_pipeline_vad(audio_in=data, param_dict=param_dict_vad)

    # Nothing detected in this chunk.
    if not segments_result:
        return False, False

    segments = segments_result["text"]
    # Only a single segment per chunk is interpreted; an empty list (which
    # would otherwise raise IndexError below) or several segments yield
    # "no boundary".
    if len(segments) != 1:
        return False, False

    seg_start, seg_end = segments[0][0], segments[0][1]
    # -1 marks a boundary that was not reported for this chunk.
    speech_start = seg_start != -1
    speech_end = seg_end != -1
    return speech_start, speech_end
| | | |
def asr():  # inference
    """Worker loop: recognise queued utterances and print the resulting text.

    Runs forever. Each item taken from the module-level ``speek`` queue is
    passed to ``inference_pipeline_asr``; when a punctuation pipeline is
    configured and the result contains ``'text'``, that text is passed
    through ``inference_pipeline_punc`` with ``param_dict_punc``. The final
    text is printed. Empty items are skipped.
    """
    while True:
        # Blocking get: sleeps until an utterance is available instead of
        # polling the queue with time.sleep().
        audio_in = speek.get()
        try:
            if len(audio_in) == 0:
                continue
            rec_result = inference_pipeline_asr(audio_in=audio_in)
            if inference_pipeline_punc is not None and 'text' in rec_result:
                rec_result = inference_pipeline_punc(text_in=rec_result['text'], param_dict=param_dict_punc)
            # A result without 'text' would raise KeyError here and end the
            # worker loop, so only print when text is present.
            if 'text' in rec_result:
                print(rec_result["text"])
        finally:
            # Mark the item done only after it has been processed, so that
            # speek.join() reflects completed work.
            speek.task_done()
| | | |
| | |
| | | frames.append(data) |
| | | RECORD_NUM += 1 |
| | | speech_start_i, speech_end_i = vad(data) |
| | | # print(speech_start_i, speech_end_i) |
| | | if speech_start_i: |
| | | speech_start = speech_start_i |
| | | # if not speech_detected: |
| | | print("检测到人声...") |
| | | # print("检测到人声...") |
| | | # speech_detected = True # 标记为检测到语音 |
| | | frames = [] |
| | | frames.extend(buffer) # 把之前2个语音数据快加入 |
| | | # silence_count = 0 # 重置静音次数 |
| | | elif speech_end_i or RECORD_NUM > 300: |
| | | if speech_end_i or RECORD_NUM > 300: |
| | | # silence_count += 1 # 增加静音次数 |
| | | # speech_end = speech_end_i |
| | | speech_start = False |
| | | # if RECORD_NUM > 300: #这里 50 可根据需求改为合适的数据快数量 |
| | | print("说话结束或者超过设置最长时间...") |
| | | # print("说话结束或者超过设置最长时间...") |
| | | audio_in = b"".join(frames) |
| | | #asrt = threading.Thread(target=asr,args=(audio_in,)) |
| | | #asrt.start() |
| | |
# Start the background thread `s` (presumably the asr() worker created just
# above this point -- TODO confirm), then bring up the websocket server and
# keep the event loop running.
s.start()

asyncio.get_event_loop().run_until_complete(start_server)
# A single run_forever() is enough: it only returns once the loop is stopped.
asyncio.get_event_loop().run_forever()