Merge branch 'alibaba-damo-academy:main' into main
20个文件已修改
2个文件已添加
14 文件已重命名
2个文件已删除
1 文件已复制
| | |
| | | import os |
| | | import logging |
| | | import torch |
| | | import torchaudio |
| | | import soundfile |
| | | |
| | | from modelscope.pipelines import pipeline |
| | | from modelscope.utils.constant import Tasks |
| | | |
| | | from modelscope.utils.logger import get_logger |
| | | import logging |
| | | |
| | | logger = get_logger(log_level=logging.CRITICAL) |
| | | logger.setLevel(logging.CRITICAL) |
| | | |
| | | os.environ["MODELSCOPE_CACHE"] = "./" |
| | | inference_pipeline = pipeline( |
| | | task=Tasks.auto_speech_recognition, |
| | | model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online', |
| | | model_revision='v1.0.2') |
| | | |
| | | waveform, sample_rate = torchaudio.load("waihu.wav") |
| | | speech_length = waveform.shape[1] |
| | | speech = waveform[0] |
| | | model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online") |
| | | speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav")) |
| | | speech_length = speech.shape[0] |
| | | |
| | | cache_en = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None} |
| | | cache_de = {"decode_fsmn": None} |
| | | cache = {"encoder": cache_en, "decoder": cache_de} |
| | | param_dict = {} |
| | | param_dict["cache"] = cache |
| | | |
| | | first_chunk = True |
| | | speech_buffer = speech |
| | | speech_cache = [] |
| | | sample_offset = 0 |
| | | step = 4800 #300ms |
| | | param_dict = {"cache": dict(), "is_final": False} |
| | | final_result = "" |
| | | |
| | | while len(speech_buffer) >= 960: |
| | | if first_chunk: |
| | | if len(speech_buffer) >= 14400: |
| | | rec_result = inference_pipeline(audio_in=speech_buffer[0:14400], param_dict=param_dict) |
| | | speech_buffer = speech_buffer[4800:] |
| | | else: |
| | | cache_en["stride"] = len(speech_buffer) // 960 |
| | | cache_en["pad_right"] = 0 |
| | | rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict) |
| | | speech_buffer = [] |
| | | cache_en["start_idx"] = -5 |
| | | first_chunk = False |
| | | else: |
| | | cache_en["start_idx"] += 10 |
| | | if len(speech_buffer) >= 4800: |
| | | cache_en["pad_left"] = 5 |
| | | rec_result = inference_pipeline(audio_in=speech_buffer[:19200], param_dict=param_dict) |
| | | speech_buffer = speech_buffer[9600:] |
| | | else: |
| | | cache_en["stride"] = len(speech_buffer) // 960 |
| | | cache_en["pad_right"] = 0 |
| | | rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict) |
| | | speech_buffer = [] |
| | | if len(rec_result) !=0 and rec_result['text'] != "sil": |
| | | for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)): |
| | | if sample_offset + step >= speech_length - 1: |
| | | step = speech_length - sample_offset |
| | | param_dict["is_final"] = True |
| | | rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + step], |
| | | param_dict=param_dict) |
| | | if len(rec_result) != 0 and rec_result['text'] != "sil" and rec_result['text'] != "waiting_for_more_voice": |
| | | final_result += rec_result['text'] |
| | | print(rec_result) |
| | | print(final_result) |
| | |
| | | sample_offset = 0 |
| | | |
| | | step = 160 * 10 |
| | | param_dict = {'in_cache': dict()} |
| | | param_dict = {'in_cache': dict(), 'max_end_sil': 800} |
| | | for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)): |
| | | if sample_offset + step >= speech_length - 1: |
| | | step = speech_length - sample_offset |
| | |
| | | sample_offset = 0 |
| | | |
| | | step = 80 * 10 |
| | | param_dict = {'in_cache': dict()} |
| | | param_dict = {'in_cache': dict(), 'max_end_sil': 800} |
| | | for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)): |
| | | if sample_offset + step >= speech_length - 1: |
| | | step = speech_length - sample_offset |
| | |
| | | ) |
| | | |
| | | export_mode = False |
| | | if param_dict is not None: |
| | | hotword_list_or_file = param_dict.get('hotword') |
| | | export_mode = param_dict.get("export_mode", False) |
| | | else: |
| | | hotword_list_or_file = None |
| | | |
| | | if ngpu >= 1 and torch.cuda.is_available(): |
| | | device = "cuda" |
| | |
| | | ngram_weight=ngram_weight, |
| | | penalty=penalty, |
| | | nbest=nbest, |
| | | hotword_list_or_file=hotword_list_or_file, |
| | | ) |
| | | if export_mode: |
| | | speech2text = Speech2TextExport(**speech2text_kwargs) |
| | |
| | | **kwargs, |
| | | ): |
| | | |
| | | hotword_list_or_file = None |
| | | if param_dict is not None: |
| | | hotword_list_or_file = param_dict.get('hotword') |
| | | if 'hotword' in kwargs: |
| | | hotword_list_or_file = kwargs['hotword'] |
| | | if hotword_list_or_file is not None or 'hotword' in kwargs: |
| | | speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file) |
| | | |
| | | # 3. Build data-iterator |
| | | if data_path_and_name_and_type is None and raw_inputs is not None: |
| | | if isinstance(raw_inputs, torch.Tensor): |
| | | raw_inputs = raw_inputs.numpy() |
| | | data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] |
| | | loader = ASRTask.build_streaming_iterator( |
| | | data_path_and_name_and_type, |
| | | dtype=dtype, |
| | | fs=fs, |
| | | batch_size=batch_size, |
| | | key_file=key_file, |
| | | num_workers=num_workers, |
| | | preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False), |
| | | collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False), |
| | | allow_variable_data_keys=allow_variable_data_keys, |
| | | inference=True, |
| | | ) |
| | | if isinstance(raw_inputs, np.ndarray): |
| | | raw_inputs = torch.tensor(raw_inputs) |
| | | |
| | | if param_dict is not None: |
| | | use_timestamp = param_dict.get('use_timestamp', True) |
| | | else: |
| | | use_timestamp = True |
| | | |
| | | forward_time_total = 0.0 |
| | | length_total = 0.0 |
| | | finish_count = 0 |
| | | file_count = 1 |
| | | cache = None |
| | | is_final = False |
| | | if param_dict is not None and "cache" in param_dict: |
| | | cache = param_dict["cache"] |
| | | if param_dict is not None and "is_final" in param_dict: |
| | | is_final = param_dict["is_final"] |
| | | # 7 .Start for-loop |
| | | # FIXME(kamo): The output format should be discussed about |
| | | asr_result_list = [] |
| | | output_path = output_dir_v2 if output_dir_v2 is not None else output_dir |
| | | if output_path is not None: |
| | | writer = DatadirWriter(output_path) |
| | | results = [] |
| | | asr_result = "" |
| | | wait = True |
| | | if len(cache) == 0: |
| | | cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None} |
| | | cache_de = {"decode_fsmn": None} |
| | | cache["decoder"] = cache_de |
| | | cache["first_chunk"] = True |
| | | cache["speech"] = [] |
| | | cache["chunk_index"] = 0 |
| | | cache["speech_chunk"] = [] |
| | | |
| | | if raw_inputs is not None: |
| | | if len(cache["speech"]) == 0: |
| | | cache["speech"] = raw_inputs |
| | | else: |
| | | cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0) |
| | | if len(cache["speech_chunk"]) == 0: |
| | | cache["speech_chunk"] = raw_inputs |
| | | else: |
| | | cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0) |
| | | while len(cache["speech_chunk"]) >= 960: |
| | | if cache["first_chunk"]: |
| | | if len(cache["speech_chunk"]) >= 14400: |
| | | speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0) |
| | | speech_length = torch.tensor([14400]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"]= cache["speech_chunk"][4800:] |
| | | cache["first_chunk"] = False |
| | | cache["encoder"]["start_idx"] = -5 |
| | | wait = False |
| | | else: |
| | | if is_final: |
| | | cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960 |
| | | cache["encoder"]["pad_right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech_chunk"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech_chunk"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = [] |
| | | wait = False |
| | | else: |
| | | break |
| | | else: |
| | | if len(cache["speech_chunk"]) >= 19200: |
| | | cache["encoder"]["start_idx"] += 10 |
| | | cache["encoder"]["pad_left"] = 5 |
| | | speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0) |
| | | speech_length = torch.tensor([19200]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = cache["speech_chunk"][9600:] |
| | | wait = False |
| | | else: |
| | | if is_final: |
| | | cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960 |
| | | cache["encoder"]["pad_right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech_chunk"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech_chunk"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = [] |
| | | wait = False |
| | | else: |
| | | break |
| | | |
| | | if len(results) >= 1: |
| | | asr_result += results[0][0] |
| | | if asr_result == "": |
| | | asr_result = "sil" |
| | | if wait: |
| | | asr_result = "waiting_for_more_voice" |
| | | item = {'key': "utt", 'value': asr_result} |
| | | asr_result_list.append(item) |
| | | else: |
| | | writer = None |
| | | if param_dict is not None and "cache" in param_dict: |
| | | cache = param_dict["cache"] |
| | | for keys, batch in loader: |
| | | assert isinstance(batch, dict), type(batch) |
| | | assert all(isinstance(s, str) for s in keys), keys |
| | | _bs = len(next(iter(batch.values()))) |
| | | assert len(keys) == _bs, f"{len(keys)} != {_bs}" |
| | | # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")} |
| | | logging.info("decoding, utt_id: {}".format(keys)) |
| | | # N-best list of (text, token, token_int, hyp_object) |
| | | |
| | | time_beg = time.time() |
| | | results = speech2text(cache=cache, **batch) |
| | | if len(results) < 1: |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest |
| | | time_end = time.time() |
| | | forward_time = time_end - time_beg |
| | | lfr_factor = results[0][-1] |
| | | length = results[0][-2] |
| | | forward_time_total += forward_time |
| | | length_total += length |
| | | rtf_cur = "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".format(length, forward_time, |
| | | 100 * forward_time / ( |
| | | length * lfr_factor)) |
| | | logging.info(rtf_cur) |
| | | |
| | | for batch_id in range(_bs): |
| | | result = [results[batch_id][:-2]] |
| | | |
| | | key = keys[batch_id] |
| | | for n, result in zip(range(1, nbest + 1), result): |
| | | text, token, token_int, hyp = result[0], result[1], result[2], result[3] |
| | | time_stamp = None if len(result) < 5 else result[4] |
| | | # Create a directory: outdir/{n}best_recog |
| | | if writer is not None: |
| | | ibest_writer = writer[f"{n}best_recog"] |
| | | |
| | | # Write the result to each file |
| | | ibest_writer["token"][key] = " ".join(token) |
| | | # ibest_writer["token_int"][key] = " ".join(map(str, token_int)) |
| | | ibest_writer["score"][key] = str(hyp.score) |
| | | ibest_writer["rtf"][key] = rtf_cur |
| | | |
| | | if text is not None: |
| | | if use_timestamp and time_stamp is not None: |
| | | postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp) |
| | | else: |
| | | postprocessed_result = postprocess_utils.sentence_postprocess(token) |
| | | time_stamp_postprocessed = "" |
| | | if len(postprocessed_result) == 3: |
| | | text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \ |
| | | postprocessed_result[1], \ |
| | | postprocessed_result[2] |
| | | else: |
| | | text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1] |
| | | item = {'key': key, 'value': text_postprocessed} |
| | | if time_stamp_postprocessed != "": |
| | | item['time_stamp'] = time_stamp_postprocessed |
| | | asr_result_list.append(item) |
| | | finish_count += 1 |
| | | # asr_utils.print_progress(finish_count / file_count) |
| | | if writer is not None: |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | |
| | | logging.info("decoding, utt: {}, predictions: {}".format(key, text)) |
| | | rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, |
| | | forward_time_total, |
| | | 100 * forward_time_total / ( |
| | | length_total * lfr_factor)) |
| | | logging.info(rtf_avg) |
| | | if writer is not None: |
| | | ibest_writer["rtf"]["rtf_avf"] = rtf_avg |
| | | return [] |
| | | return asr_result_list |
| | | |
| | | return _forward |
| | |
| | | # rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav') |
| | | # print(rec_result) |
| | | |
| | | |
| | |
| | | |
| | | # Change integer-ids to tokens |
| | | token = self.converter.ids2tokens(token_int) |
| | | token = list(filter(lambda x: x != "<gbg>", token)) |
| | | |
| | | if self.tokenizer is not None: |
| | | text = self.tokenizer.tokens2text(token) |
| | |
| | | finish_count += 1 |
| | | asr_utils.print_progress(finish_count / file_count) |
| | | if writer is not None: |
| | | ibest_writer["text"][key] = text |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | return asr_result_list |
| | | |
| | | return _forward |
| | |
| | | |
| | | # Change integer-ids to tokens |
| | | token = self.converter.ids2tokens(token_int) |
| | | token = list(filter(lambda x: x != "<gbg>", token)) |
| | | |
| | | if self.tokenizer is not None: |
| | | text = self.tokenizer.tokens2text(token) |
| | |
| | | finish_count += 1 |
| | | asr_utils.print_progress(finish_count / file_count) |
| | | if writer is not None: |
| | | ibest_writer["text"][key] = text |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | return asr_result_list |
| | | |
| | | return _forward |
| | |
| | | from funasr.models.frontend.wav_frontend import WavFrontend |
| | | from funasr.bin.vad_inference import Speech2VadSegment |
| | | |
| | | |
| | | header_colors = '\033[95m' |
| | | end_colors = '\033[0m' |
| | | |
| | | |
| | | class Speech2VadSegmentOnline(Speech2VadSegment): |
| | |
| | | @torch.no_grad() |
| | | def __call__( |
| | | self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, |
| | | in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False |
| | | in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800 |
| | | ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]: |
| | | """Inference |
| | | |
| | |
| | | "feats": feats, |
| | | "waveform": waveforms, |
| | | "in_cache": in_cache, |
| | | "is_final": is_final |
| | | "is_final": is_final, |
| | | "max_end_sil": max_end_sil |
| | | } |
| | | # a. To device |
| | | batch = to_device(batch, device=self.device) |
| | |
| | | vad_results = [] |
| | | batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict() |
| | | is_final = param_dict['is_final'] if param_dict is not None else False |
| | | max_end_sil = param_dict['max_end_sil'] if param_dict is not None else 800 |
| | | for keys, batch in loader: |
| | | assert isinstance(batch, dict), type(batch) |
| | | assert all(isinstance(s, str) for s in keys), keys |
| | |
| | | assert len(keys) == _bs, f"{len(keys)} != {_bs}" |
| | | batch['in_cache'] = batch_in_cache |
| | | batch['is_final'] = is_final |
| | | batch['max_end_sil'] = max_end_sil |
| | | |
| | | # do vad segment |
| | | _, results, param_dict['in_cache'] = speech2vadsegment(**batch) |
old mode 100755
new mode 100644
| | |
| | | return segments, in_cache |
| | | |
| | | def forward_online(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(), |
| | | is_final: bool = False |
| | | is_final: bool = False, max_end_sil: int = 800 |
| | | ) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]: |
| | | self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres |
| | | self.waveform = waveform # compute decibel for each frame |
| | | self.ComputeDecibel() |
| | | self.ComputeScores(feats, in_cache) |
| | |
| | | The audio data is sent in streaming mode, while the ASR inference process runs in offline mode. |
| | | |
| | | |
| | | ## Steps |
| | | |
| | | Step 1-1) Prepare server modelscope pipeline environment (on server). |
| | | ## For the Server |
| | | |
| | | Install modelscope and funasr with pip or with cuda-docker image. |
| | | ### Prepare server environment |
| | | #### Backend is modelscope pipeline (default) |
| | | Install the modelscope and funasr |
| | | |
| | | Option 1: Install modelscope and funasr with [pip](https://github.com/alibaba-damo-academy/FunASR#installation) |
| | | |
| | | Option 2: or install with cuda-docker image as: |
| | | |
| | | ``` |
| | | CID=`docker run --network host -d -it --gpus '"device=0"' registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-1.2.0` |
| | | echo $CID |
| | | docker exec -it $CID /bin/bash |
| | | ``` |
| | | Get funasr source code and get into grpc directory. |
| | | ``` |
| | | git clone https://github.com/alibaba-damo-academy/FunASR |
| | | cd FunASR/funasr/runtime/python/grpc/ |
| | | ``` |
| | | |
| | | Step 1-2) Optional, Prepare server onnxruntime environment (on server). |
| | | |
| | | Install [`onnx_paraformer`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime). |
| | | |
| | | - Build the onnx_paraformer `whl` |
| | | ``` |
| | | ```shell |
| | | pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/onnxruntime/rapid_paraformer |
| | | python setup.py build |
| | | python setup.py install |
| | | pip install --editable ./ |
| | | ``` |
| | | |
| | | [//]: # () |
| | | [//]: # (- Install the build `whl`) |
| | | Install the requirements |
| | | |
| | | [//]: # (```) |
| | | ```shell |
| | | cd funasr/runtime/python/grpc |
| | | pip install -r requirements_server.txt |
| | | ``` |
| | | |
| | | [//]: # (pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl) |
| | | #### Backend is funasr_onnx (optional) |
| | | |
| | | [//]: # (```) |
| | | Install [`funasr_onnx`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime). |
| | | |
| | | ``` |
| | | pip install funasr_onnx -i https://pypi.Python.org/simple |
| | | ``` |
| | | |
| | | Export the model, more details ref to [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime). |
| | | ```shell |
| | | python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True |
| | | ``` |
| | | |
| | | Step 2) Optional, generate protobuf file (run on server, the two generated pb files are both used for server and client). |
| | | ``` |
| | | # Optional, Install dependency. |
| | | python -m pip install grpcio grpcio-tools |
| | | ``` |
| | | ### Generate protobuf file |
| | | Run this on the server; the two generated pb files are used by both the server and the client. |
| | | |
| | | ``` |
| | | ```shell |
| | | # paraformer_pb2.py and paraformer_pb2_grpc.py are already generated, |
| | | # regenerate it only when you make changes to ./proto/paraformer.proto file. |
| | | python -m grpc_tools.protoc --proto_path=./proto -I ./proto --python_out=. --grpc_python_out=./ ./proto/paraformer.proto |
| | | ``` |
| | | |
| | | Step 3) Start grpc server (on server). |
| | | ``` |
| | | # Optional, Install dependency. |
| | | python -m pip install grpcio grpcio-tools |
| | | ``` |
| | | ### Start grpc server |
| | | |
| | | ``` |
| | | # Start server. |
| | | python grpc_main_server.py --port 10095 --backend pipeline |
| | | ``` |
| | | |
| | | If you want to run the server with onnxruntime, please set the `backend` and `onnx_dir` parameters. |
| | | ``` |
| | | # Start server. |
| | | python grpc_main_server.py --port 10095 --backend onnxruntime --onnx_dir /models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch |
| | | ``` |
| | | |
| | | ## For the client |
| | | |
| | | Step 4) Start grpc client (on client with microphone). |
| | | ### Install the requirements |
| | | |
| | | ```shell |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/grpc |
| | | pip install -r requirements_client.txt |
| | | ``` |
| | | # Optional, Install dependency. |
| | | python -m pip install pyaudio webrtcvad grpcio grpcio-tools |
| | | |
| | | ### Generate protobuf file |
| | | Run this on the server; the two generated pb files are used by both the server and the client. |
| | | |
| | | ```shell |
| | | # paraformer_pb2.py and paraformer_pb2_grpc.py are already generated, |
| | | # regenerate it only when you make changes to ./proto/paraformer.proto file. |
| | | python -m grpc_tools.protoc --proto_path=./proto -I ./proto --python_out=. --grpc_python_out=./ ./proto/paraformer.proto |
| | | ``` |
| | | |
| | | ### Start grpc client |
| | | ``` |
| | | # Start client. |
| | | python grpc_main_client_mic.py --host 127.0.0.1 --port 10095 |
| | |
| | | |
| | | |
| | | ## Workflow in design |
| | |  |
| | | |
| | | <div align="left"><img src="proto/workflow.png" width="400"/> |
| | | |
| | | ## Reference |
| | | We borrow from or refer to some code as: |
| | |
| | | self.inference_16k_pipeline = pipeline(task=Tasks.auto_speech_recognition, model=model, vad_model=vad_model, punc_model=punc_model) |
| | | elif self.backend == "onnxruntime": |
| | | try: |
| | | from rapid_paraformer.paraformer_onnx import Paraformer |
| | | from funasr_onnx import Paraformer |
| | | except ImportError: |
| | | raise ImportError(f"Please install onnxruntime environment") |
| | | self.inference_16k_pipeline = Paraformer(model_dir=onnx_dir) |
| New file |
| | |
| | | pyaudio |
| | | webrtcvad |
| | | grpcio |
| | | grpcio-tools |
| New file |
| | |
| | | grpcio |
| | | grpcio-tools |
| | |
| | | ## Using paraformer with libtorch |
| | | ## Using funasr with libtorch |
| | | |
| | | [FunASR](https://github.com/alibaba-damo-academy/FunASR) hopes to build a bridge between academic research and industrial applications on speech recognition. By supporting the training & finetuning of the industrial-grade speech recognition model released on ModelScope, researchers and developers can conduct research and production of speech recognition models more conveniently, and promote the development of speech recognition ecology. ASR for Fun! |
| | | |
| | | ### Introduction |
| | | - Model comes from [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary). |
| | |
| | | ### Steps: |
| | | 1. Export the model. |
| | | - Command: (`Tips`: torch >= 1.11.0 is required.) |
| | | |
| | | ```shell |
| | | python -m funasr.export.export_model [model_name] [export_dir] false |
| | | ``` |
| | | `model_name`: the name of the model to export. |
| | | |
| | | `export_dir`: the directory where the exported model is saved. |
| | | |
| | | More details ref to ([export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)) |
| | | |
| | |
| | | ``` |
| | | |
| | | |
| | | 2. Install the `torch_paraformer`. |
| | | 2. Install the `funasr_torch`. |
| | | |
| | | install from pip |
| | | ```shell |
| | | pip install --upgrade funasr_torch -i https://pypi.Python.org/simple |
| | | ``` |
| | | or install from source code |
| | | |
| | | ```shell |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/libtorch |
| | | cd funasr/runtime/python/funasr_torch |
| | | python setup.py build |
| | | python setup.py install |
| | | ``` |
| | | |
| | | |
| | | 3. Run the demo. |
| | | - Model_dir: the model path, which contains `model.torchscripts`, `config.yaml`, `am.mvn`. |
| | |
| | | - Output: `List[str]`: recognition result. |
| | | - Example: |
| | | ```python |
| | | from torch_paraformer import Paraformer |
| | | from funasr_torch import Paraformer |
| | | |
| | | model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model = Paraformer(model_dir, batch_size=1) |
| | |
| | | | Onnx | 0.038 | |
| | | |
| | | ## Acknowledge |
| | | This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR). |
| | |
| | | |
| | | from torch_paraformer import Paraformer |
| | | from funasr_torch import Paraformer |
| | | |
| | | model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model = Paraformer(model_dir, batch_size=1) |
| | |
| | | |
| | | |
| | | setuptools.setup( |
| | | name='torch_paraformer', |
| | | version='0.0.1', |
| | | name='funasr_torch', |
| | | version='0.0.3', |
| | | platforms="Any", |
| | | url="https://github.com/alibaba-damo-academy/FunASR.git", |
| | | author="Speech Lab, Alibaba Group, China", |
| | |
| | | "PyYAML>=5.1.2", "torch-quant >= 0.4.0"], |
| | | packages=find_packages(include=["torch_paraformer*"]), |
| | | keywords=[ |
| | | 'funasr,paraformer' |
| | | 'funasr,paraformer, funasr_torch' |
| | | ], |
| | | classifiers=[ |
| | | 'Programming Language :: Python :: 3.6', |
| | |
| | | ## Using paraformer with ONNXRuntime |
| | | ## Using funasr with ONNXRuntime |
| | | |
| | | <p align="left"> |
| | | <a href=""><img src="https://img.shields.io/badge/Python->=3.7,<=3.10-aff.svg"></a> |
| | | <a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-pink.svg"></a> |
| | | </p> |
| | | |
| | | ### Introduction |
| | | - Model comes from [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary). |
| | |
| | | ### Steps: |
| | | 1. Export the model. |
| | | - Command: (`Tips`: torch >= 1.11.0 is required.) |
| | | |
| | | ```shell |
| | | python -m funasr.export.export_model [model_name] [export_dir] [true] |
| | | ``` |
| | | `model_name`: the name of the model to export. |
| | | |
| | | `export_dir`: the directory where the exported ONNX model is saved. |
| | | |
| | | More details ref to ([export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)) |
| | | |
| | |
| | | ``` |
| | | |
| | | |
| | | 2. Install the `rapid_paraformer`. |
| | | - Build the rapid_paraformer `whl` |
| | | ```shell |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/onnxruntime |
| | | python setup.py bdist_wheel |
| | | ``` |
| | | - Install the build `whl` |
| | | ```bash |
| | | pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl |
| | | ``` |
| | | 2. Install the `funasr_onnx` |
| | | |
| | | install from pip |
| | | ```shell |
| | | pip install --upgrade funasr_onnx -i https://pypi.Python.org/simple |
| | | ``` |
| | | |
| | | or install from source code |
| | | |
| | | ```shell |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/funasr_onnx |
| | | python setup.py build |
| | | python setup.py install |
| | | ``` |
| | | |
| | | 3. Run the demo. |
| | | - Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`. |
| | |
| | | - Output: `List[str]`: recognition result. |
| | | - Example: |
| | | ```python |
| | | from rapid_paraformer import Paraformer |
| | | from funasr_onnx import Paraformer |
| | | |
| | | model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model = Paraformer(model_dir, batch_size=1) |
| | |
| | | |
| | | |
| | | ## Acknowledge |
| | | 1. We acknowledge [SWHL](https://github.com/RapidAI/RapidASR) for contributing the onnxruntime(python api). |
| | | 1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR). |
| | | 2. We acknowledge [SWHL](https://github.com/RapidAI/RapidASR) for contributing the onnxruntime (for paraformer model). |
| | |
| | | |
| | | from rapid_paraformer import Paraformer |
| | | from funasr_onnx import Paraformer |
| | | |
| | | #model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | #model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
copy from funasr/runtime/python/libtorch/torch_paraformer/__init__.py
copy to funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py
| File was renamed from funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py |
| | |
| | | # -*- encoding: utf-8 -*- |
| | | # @Author: SWHL |
| | | # @Contact: liekkaskono@163.com |
| | | from cgitb import text |
| | | |
| | | import os.path |
| | | from pathlib import Path |
| | | from typing import List, Union, Tuple |
| | |
| | | return readme |
| | | |
| | | |
| | | MODULE_NAME = 'rapid_paraformer' |
| | | VERSION_NUM = '0.0.1' |
| | | MODULE_NAME = 'funasr_onnx' |
| | | VERSION_NUM = '0.0.2' |
| | | |
| | | setuptools.setup( |
| | | name=MODULE_NAME, |
| | | version=VERSION_NUM, |
| | | platforms="Any", |
| | | description="Using paraformer with ONNXRuntime", |
| | | author="FunASR", |
| | | url="https://github.com/alibaba-damo-academy/FunASR.git", |
| | | author="Speech Lab, Alibaba Group, China", |
| | | author_email="funasr@list.alibaba-inc.com", |
| | | url="https://github.com/alibaba-damo-academy/FunASR", |
| | | description="FunASR: A Fundamental End-to-End Speech Recognition Toolkit", |
| | | license='MIT', |
| | | long_description=get_readme(), |
| | | long_description_content_type='text/markdown', |
| | |
| | | args = parser.parse_args() |
| | | |
| | | |
| | | from funasr.runtime.python.libtorch.torch_paraformer import Paraformer |
| | | from funasr.runtime.python.libtorch.funasr_torch import Paraformer |
| | | if args.backend == "onnx": |
| | | from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer |
| | | from funasr.runtime.python.onnxruntime.funasr_onnx import Paraformer |
| | | |
| | | model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads) |
| | | |
| | |
| | | args = parser.parse_args() |
| | | |
| | | |
| | | from funasr.runtime.python.libtorch.torch_paraformer import Paraformer |
| | | from funasr.runtime.python.libtorch.funasr_torch import Paraformer |
| | | if args.backend == "onnx": |
| | | from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer |
| | | from funasr.runtime.python.onnxruntime.funasr_onnx import Paraformer |
| | | |
| | | model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads) |
| | | |
| | |
| | | We can send streaming audio data to the server in real time with the grpc client (e.g., every 300 ms), and get the transcribed text back when we stop speaking. |
| | | The audio data is sent in streaming mode, while the ASR inference process runs in offline mode. |
| | | |
| | | # Steps |
| | | |
| | | ## For the Server |
| | | |