veelion
2023-03-24 7687f64729810ce0ffd0b5a38276ebbf75da43eb
Merge branch 'alibaba-damo-academy:main' into main
20 files changed
2 files added
14 files renamed
2 files deleted
1 file copied
484 lines changed
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py | 58
egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py | 2
egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py | 2
funasr/bin/asr_inference_paraformer_streaming.py | 196
funasr/bin/asr_inference_uniasr.py | 3
funasr/bin/asr_inference_uniasr_vad.py | 3
funasr/bin/vad_inference_online.py | 10
funasr/models/e2e_vad.py | 3
funasr/runtime/python/grpc/Readme.md | 90
funasr/runtime/python/grpc/grpc_server.py | 2
funasr/runtime/python/grpc/requirements_client.txt | 4
funasr/runtime/python/grpc/requirements_server.txt | 2
funasr/runtime/python/libtorch/README.md | 26
funasr/runtime/python/libtorch/demo.py | 2
funasr/runtime/python/libtorch/funasr_torch/__init__.py
funasr/runtime/python/libtorch/funasr_torch/paraformer_bin.py
funasr/runtime/python/libtorch/funasr_torch/utils/__init__.py
funasr/runtime/python/libtorch/funasr_torch/utils/compute_wer.py
funasr/runtime/python/libtorch/funasr_torch/utils/frontend.py
funasr/runtime/python/libtorch/funasr_torch/utils/postprocess_utils.py
funasr/runtime/python/libtorch/funasr_torch/utils/timestamp_utils.py
funasr/runtime/python/libtorch/funasr_torch/utils/utils.py
funasr/runtime/python/libtorch/setup.py | 6
funasr/runtime/python/onnxruntime/README.md | 44
funasr/runtime/python/onnxruntime/debug.png
funasr/runtime/python/onnxruntime/demo.py | 2
funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py
funasr/runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py | 4
funasr/runtime/python/onnxruntime/funasr_onnx/utils/__init__.py
funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py
funasr/runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py
funasr/runtime/python/onnxruntime/funasr_onnx/utils/timestamp_utils.py
funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
funasr/runtime/python/onnxruntime/rapid_paraformer/__init__.py | 4
funasr/runtime/python/onnxruntime/setup.py | 10
funasr/runtime/python/utils/infer.py | 4
funasr/runtime/python/utils/test_rtf.py | 4
funasr/runtime/python/websocket/README.md | 1
funasr/version.txt | 2
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
@@ -1,57 +1,37 @@
import os
import logging
import torch
import torchaudio
import soundfile
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
import logging
logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)
os.environ["MODELSCOPE_CACHE"] = "./"
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
    model_revision='v1.0.2')
waveform, sample_rate = torchaudio.load("waihu.wav")
speech_length = waveform.shape[1]
speech = waveform[0]
model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online")
speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav"))
speech_length = speech.shape[0]
cache_en = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
cache_de = {"decode_fsmn": None}
cache = {"encoder": cache_en, "decoder": cache_de}
param_dict = {}
param_dict["cache"] = cache
first_chunk = True
speech_buffer = speech
speech_cache = []
sample_offset = 0
step = 4800  #300ms
param_dict = {"cache": dict(), "is_final": False}
final_result = ""
while len(speech_buffer) >= 960:
    if first_chunk:
        if len(speech_buffer) >= 14400:
            rec_result = inference_pipeline(audio_in=speech_buffer[0:14400], param_dict=param_dict)
            speech_buffer = speech_buffer[4800:]
        else:
            cache_en["stride"] = len(speech_buffer) // 960
            cache_en["pad_right"] = 0
            rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict)
            speech_buffer = []
        cache_en["start_idx"] = -5
        first_chunk = False
    else:
        cache_en["start_idx"] += 10
        if len(speech_buffer) >= 4800:
            cache_en["pad_left"] = 5
            rec_result = inference_pipeline(audio_in=speech_buffer[:19200], param_dict=param_dict)
            speech_buffer = speech_buffer[9600:]
        else:
            cache_en["stride"] = len(speech_buffer) // 960
            cache_en["pad_right"] = 0
            rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict)
            speech_buffer = []
    if len(rec_result) !=0 and rec_result['text'] != "sil":
for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
    if sample_offset + step >= speech_length - 1:
        step = speech_length - sample_offset
        param_dict["is_final"] = True
    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + step],
                                    param_dict=param_dict)
    if len(rec_result) != 0 and rec_result['text'] != "sil" and rec_result['text'] != "waiting_for_more_voice":
        final_result += rec_result['text']
    print(rec_result)
print(final_result)
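For reference, the chunk arithmetic in the rewritten example above, assuming the 16 kHz input the online model expects (the constant 4800 comes from the diff; the millisecond conversion is our annotation):

```python
fs = 16000                    # sample rate assumed by the 16k online model
step = 4800                   # samples per chunk, from the example above
chunk_ms = 1000 * step // fs  # 4800 / 16000 s = 300 ms per chunk
# on the final iteration the example shrinks `step` to the remaining samples
# and sets param_dict["is_final"] = True so the pipeline flushes its cache
print(chunk_ms)               # 300
```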
egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
@@ -22,7 +22,7 @@
    sample_offset = 0
    
    step = 160 * 10
    param_dict = {'in_cache': dict()}
    param_dict = {'in_cache': dict(), 'max_end_sil': 800}
    for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
        if sample_offset + step >= speech_length - 1:
            step = speech_length - sample_offset
egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
@@ -22,7 +22,7 @@
    sample_offset = 0
    
    step = 80 * 10
    param_dict = {'in_cache': dict()}
    param_dict = {'in_cache': dict(), 'max_end_sil': 800}
    for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
        if sample_offset + step >= speech_length - 1:
            step = speech_length - sample_offset
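Combined with the unchanged context around these two hunks, the full streaming VAD loop looks roughly like the sketch below. This is a minimal sketch, not the verbatim script: the model id and wav path are illustrative assumptions, `is_final` is initialized defensively, and only the `param_dict` keys come from the diff (use `step = 80 * 10` for the 8k model).

```python
import soundfile
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# hypothetical pipeline setup; the model id is an assumption
inference_pipeline = pipeline(
    task=Tasks.voice_activity_detection,
    model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch')

speech, sample_rate = soundfile.read("vad_example.wav")  # illustrative file
speech_length = speech.shape[0]
sample_offset = 0

step = 160 * 10  # 1600 samples = 100 ms at 16 kHz
param_dict = {'in_cache': dict(), 'is_final': False, 'max_end_sil': 800}
for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
    if sample_offset + step >= speech_length - 1:
        step = speech_length - sample_offset
        param_dict['is_final'] = True  # flush the remaining audio on the last chunk
    segments_result = inference_pipeline(
        audio_in=speech[sample_offset: sample_offset + step],
        param_dict=param_dict)
    print(segments_result)
```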
funasr/bin/asr_inference_paraformer_streaming.py
@@ -544,11 +544,6 @@
    )
    export_mode = False
    if param_dict is not None:
        hotword_list_or_file = param_dict.get('hotword')
        export_mode = param_dict.get("export_mode", False)
    else:
        hotword_list_or_file = None
    if ngpu >= 1 and torch.cuda.is_available():
        device = "cuda"
@@ -578,7 +573,6 @@
        ngram_weight=ngram_weight,
        penalty=penalty,
        nbest=nbest,
        hotword_list_or_file=hotword_list_or_file,
    )
    if export_mode:
        speech2text = Speech2TextExport(**speech2text_kwargs)
@@ -594,123 +588,92 @@
            **kwargs,
    ):
        hotword_list_or_file = None
        if param_dict is not None:
            hotword_list_or_file = param_dict.get('hotword')
        if 'hotword' in kwargs:
            hotword_list_or_file = kwargs['hotword']
        if hotword_list_or_file is not None or 'hotword' in kwargs:
            speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)
        # 3. Build data-iterator
        if data_path_and_name_and_type is None and raw_inputs is not None:
            if isinstance(raw_inputs, torch.Tensor):
                raw_inputs = raw_inputs.numpy()
            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
        loader = ASRTask.build_streaming_iterator(
            data_path_and_name_and_type,
            dtype=dtype,
            fs=fs,
            batch_size=batch_size,
            key_file=key_file,
            num_workers=num_workers,
            preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
            collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
            allow_variable_data_keys=allow_variable_data_keys,
            inference=True,
        )
            if isinstance(raw_inputs, np.ndarray):
                raw_inputs = torch.tensor(raw_inputs)
        if param_dict is not None:
            use_timestamp = param_dict.get('use_timestamp', True)
        else:
            use_timestamp = True
        forward_time_total = 0.0
        length_total = 0.0
        finish_count = 0
        file_count = 1
        cache = None
        is_final = False
        if param_dict is not None and "cache" in param_dict:
            cache = param_dict["cache"]
        if param_dict is not None and "is_final" in param_dict:
            is_final = param_dict["is_final"]
        # 7 .Start for-loop
        # FIXME(kamo): The output format should be discussed about
        asr_result_list = []
        output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
        if output_path is not None:
            writer = DatadirWriter(output_path)
        results = []
        asr_result = ""
        wait = True
        if len(cache) == 0:
            cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
            cache_de = {"decode_fsmn": None}
            cache["decoder"] = cache_de
            cache["first_chunk"] = True
            cache["speech"] = []
            cache["chunk_index"] = 0
            cache["speech_chunk"] = []
        if raw_inputs is not None:
            if len(cache["speech"]) == 0:
                cache["speech"] = raw_inputs
            else:
                cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0)
            if len(cache["speech_chunk"]) == 0:
                cache["speech_chunk"] = raw_inputs
            else:
                cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0)
            while len(cache["speech_chunk"]) >= 960:
                if cache["first_chunk"]:
                    if len(cache["speech_chunk"]) >= 14400:
                        speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0)
                        speech_length = torch.tensor([14400])
                        results = speech2text(cache, speech, speech_length)
                        cache["speech_chunk"]= cache["speech_chunk"][4800:]
                        cache["first_chunk"] = False
                        cache["encoder"]["start_idx"] = -5
                        wait = False
                    else:
                        if is_final:
                            cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
                            cache["encoder"]["pad_right"] = 0
                            speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
                            speech_length = torch.tensor([len(cache["speech_chunk"])])
                            results = speech2text(cache, speech, speech_length)
                            cache["speech_chunk"] = []
                            wait = False
                        else:
                            break
                else:
                    if len(cache["speech_chunk"]) >= 19200:
                        cache["encoder"]["start_idx"] += 10
                        cache["encoder"]["pad_left"] = 5
                        speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0)
                        speech_length = torch.tensor([19200])
                        results = speech2text(cache, speech, speech_length)
                        cache["speech_chunk"] = cache["speech_chunk"][9600:]
                        wait = False
                    else:
                        if is_final:
                            cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
                            cache["encoder"]["pad_right"] = 0
                            speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
                            speech_length = torch.tensor([len(cache["speech_chunk"])])
                            results = speech2text(cache, speech, speech_length)
                            cache["speech_chunk"] = []
                            wait = False
                        else:
                            break
                if len(results) >= 1:
                    asr_result += results[0][0]
            if asr_result == "":
                asr_result = "sil"
            if wait:
                asr_result = "waiting_for_more_voice"
            item = {'key': "utt", 'value': asr_result}
            asr_result_list.append(item)
        else:
            writer = None
        if param_dict is not None and "cache" in param_dict:
            cache = param_dict["cache"]
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
            logging.info("decoding, utt_id: {}".format(keys))
            # N-best list of (text, token, token_int, hyp_object)
            time_beg = time.time()
            results = speech2text(cache=cache, **batch)
            if len(results) < 1:
                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
                results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
            time_end = time.time()
            forward_time = time_end - time_beg
            lfr_factor = results[0][-1]
            length = results[0][-2]
            forward_time_total += forward_time
            length_total += length
            rtf_cur = "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".format(length, forward_time,
                                                                                               100 * forward_time / (
                                                                                                           length * lfr_factor))
            logging.info(rtf_cur)
            for batch_id in range(_bs):
                result = [results[batch_id][:-2]]
                key = keys[batch_id]
                for n, result in zip(range(1, nbest + 1), result):
                    text, token, token_int, hyp = result[0], result[1], result[2], result[3]
                    time_stamp = None if len(result) < 5 else result[4]
                    # Create a directory: outdir/{n}best_recog
                    if writer is not None:
                        ibest_writer = writer[f"{n}best_recog"]
                        # Write the result to each file
                        ibest_writer["token"][key] = " ".join(token)
                        # ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                        ibest_writer["score"][key] = str(hyp.score)
                        ibest_writer["rtf"][key] = rtf_cur
                    if text is not None:
                        if use_timestamp and time_stamp is not None:
                            postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
                        else:
                            postprocessed_result = postprocess_utils.sentence_postprocess(token)
                        time_stamp_postprocessed = ""
                        if len(postprocessed_result) == 3:
                            text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
                                                                                       postprocessed_result[1], \
                                                                                       postprocessed_result[2]
                        else:
                            text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
                        item = {'key': key, 'value': text_postprocessed}
                        if time_stamp_postprocessed != "":
                            item['time_stamp'] = time_stamp_postprocessed
                        asr_result_list.append(item)
                        finish_count += 1
                        # asr_utils.print_progress(finish_count / file_count)
                        if writer is not None:
                            ibest_writer["text"][key] = text_postprocessed
                    logging.info("decoding, utt: {}, predictions: {}".format(key, text))
        rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total,
                                                                                                           forward_time_total,
                                                                                                           100 * forward_time_total / (
                                                                                                                       length_total * lfr_factor))
        logging.info(rtf_avg)
        if writer is not None:
            ibest_writer["rtf"]["rtf_avf"] = rtf_avg
            return []
        return asr_result_list
    return _forward
@@ -905,3 +868,4 @@
    # rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
    # print(rec_result)
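For callers, the contract visible in this diff is small: pass an empty `cache` dict plus an `is_final` flag through `param_dict`, and `_forward` populates the cache on the first chunk. A sketch of the populated structure, reconstructed from the initialization code above:

```python
# what the caller passes in:
param_dict = {"cache": dict(), "is_final": False}

# what _forward fills in on the first call (per the diff above):
cache = {
    "encoder": {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5,
                "cif_hidden": None, "cif_alphas": None},
    "decoder": {"decode_fsmn": None},
    "first_chunk": True,
    "speech": [],        # all audio received so far
    "chunk_index": 0,
    "speech_chunk": [],  # audio not yet consumed by the encoder
}
```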
funasr/bin/asr_inference_uniasr.py
@@ -261,6 +261,7 @@
            # Change integer-ids to tokens
            token = self.converter.ids2tokens(token_int)
            token = list(filter(lambda x: x != "<gbg>", token))
            if self.tokenizer is not None:
                text = self.tokenizer.tokens2text(token)
@@ -512,7 +513,7 @@
                    finish_count += 1
                    asr_utils.print_progress(finish_count / file_count)
                    if writer is not None:
                        ibest_writer["text"][key] = text
                        ibest_writer["text"][key] = text_postprocessed
        return asr_result_list
    
    return _forward
funasr/bin/asr_inference_uniasr_vad.py
@@ -261,6 +261,7 @@
            # Change integer-ids to tokens
            token = self.converter.ids2tokens(token_int)
            token = list(filter(lambda x: x != "<gbg>", token))
            if self.tokenizer is not None:
                text = self.tokenizer.tokens2text(token)
@@ -512,7 +513,7 @@
                    finish_count += 1
                    asr_utils.print_progress(finish_count / file_count)
                    if writer is not None:
                        ibest_writer["text"][key] = text
                        ibest_writer["text"][key] = text_postprocessed
        return asr_result_list
    
    return _forward
funasr/bin/vad_inference_online.py
@@ -30,7 +30,8 @@
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.bin.vad_inference import Speech2VadSegment
header_colors = '\033[95m'
end_colors = '\033[0m'
class Speech2VadSegmentOnline(Speech2VadSegment):
@@ -55,7 +56,7 @@
    @torch.no_grad()
    def __call__(
            self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
            in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False
            in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800
    ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]:
        """Inference
@@ -86,7 +87,8 @@
                "feats": feats,
                "waveform": waveforms,
                "in_cache": in_cache,
                "is_final": is_final
                "is_final": is_final,
                "max_end_sil": max_end_sil
            }
            # a. To device
            batch = to_device(batch, device=self.device)
@@ -217,6 +219,7 @@
        vad_results = []
        batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict()
        is_final = param_dict['is_final'] if param_dict is not None else False
        max_end_sil = param_dict['max_end_sil'] if param_dict is not None else 800
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
@@ -224,6 +227,7 @@
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch['in_cache'] = batch_in_cache
            batch['is_final'] = is_final
            batch['max_end_sil'] = max_end_sil
            # do vad segment
            _, results, param_dict['in_cache'] = speech2vadsegment(**batch)
funasr/models/e2e_vad.py
old mode 100755
new mode 100644
@@ -473,8 +473,9 @@
        return segments, in_cache
    def forward_online(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(),
                is_final: bool = False
                is_final: bool = False, max_end_sil: int = 800
                ) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]:
        self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
        self.waveform = waveform  # compute decibel for each frame
        self.ComputeDecibel()
        self.ComputeScores(feats, in_cache)
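The new `max_end_sil` argument is in milliseconds; the internal frame-count threshold is derived by subtracting the configured speech-to-silence transition time. A worked example, with a hypothetical value for `vad_opts.speech_to_sil_time_thres`:

```python
max_end_sil = 800               # from param_dict, in ms (the default above)
speech_to_sil_time_thres = 150  # hypothetical config value, in ms
max_end_sil_frame_cnt_thresh = max_end_sil - speech_to_sil_time_thres
print(max_end_sil_frame_cnt_thresh)  # 650: trailing silence needed to close a segment
```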
funasr/runtime/python/grpc/Readme.md
@@ -3,87 +3,81 @@
The audio data is streamed to the server, while the ASR inference itself runs in offline mode.
## Steps
Step 1-1) Prepare server modelscope pipeline environment (on server).
## For the Server
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Install modelscope and funasr with pip or with cuda-docker image.
### Prepare server environment
#### Backend is modelscope pipeline (default)
Install the modelscope and funasr
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Option 1: Install modelscope and funasr with [pip](https://github.com/alibaba-damo-academy/FunASR#installation)
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Option 2: or install with cuda-docker image as:
```
CID=`docker run --network host -d -it --gpus '"device=0"' registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-1.2.0`
echo $CID
docker exec -it $CID /bin/bash
```
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Get the funasr source code and change into the grpc directory.
```
git clone https://github.com/alibaba-damo-academy/FunASR
cd FunASR/funasr/runtime/python/grpc/
```
Step 1-2) Optional, Prepare server onnxruntime environment (on server).
Install [`onnx_paraformer`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime).
- Build the onnx_paraformer `whl`
```
```shell
pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/onnxruntime/rapid_paraformer
python setup.py build
python setup.py install
pip install --editable ./
```
[//]: # ()
[//]: # (- Install the build `whl`)
Install the requirements
[//]: # (```)
```shell
cd funasr/runtime/python/grpc
pip install -r requirements_server.txt
```
[//]: # (pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl)
#### Backend is funasr_onnx (optional)
[//]: # (```)
Install [`funasr_onnx`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime).
```
pip install funasr_onnx -i https://pypi.Python.org/simple
```
Export the model; more details can be found in the [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime).
```shell
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True
```
Step 2) Optional, generate protobuf file (run on server, the two generated pb files are both used for server and client).
```
# Optional, Install dependency.
python -m pip install grpcio grpcio-tools
```
### Generate protobuf file
Run this on the server; the two generated pb files are used by both the server and the client.
```
```shell
# paraformer_pb2.py and paraformer_pb2_grpc.py are already generated, 
# regenerate it only when you make changes to ./proto/paraformer.proto file.
python -m grpc_tools.protoc  --proto_path=./proto -I ./proto    --python_out=. --grpc_python_out=./ ./proto/paraformer.proto
```
Step 3) Start grpc server (on server).
```
# Optional, Install dependency.
python -m pip install grpcio grpcio-tools
```
### Start grpc server
```
# Start server.
python grpc_main_server.py --port 10095 --backend pipeline
```
If you want to run the server with onnxruntime, please set the `backend` and `onnx_dir` parameters.
If you want to run the server with onnxruntime, please set `backend` and `onnx_dir`.
```
# Start server.
python grpc_main_server.py --port 10095 --backend onnxruntime --onnx_dir /models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
```
## For the client
Step 4) Start grpc client (on client with microphone).
### Install the requirements
```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/grpc
pip install -r requirements_client.txt
```
# Optional, Install dependency.
python -m pip install pyaudio webrtcvad grpcio grpcio-tools
### Generate protobuf file
Run this on the server; the two generated pb files are used by both the server and the client.
```shell
# paraformer_pb2.py and paraformer_pb2_grpc.py are already generated,
# regenerate it only when you make changes to ./proto/paraformer.proto file.
python -m grpc_tools.protoc  --proto_path=./proto -I ./proto    --python_out=. --grpc_python_out=./ ./proto/paraformer.proto
```
### Start grpc client
```
# Start client.
python grpc_main_client_mic.py --host 127.0.0.1 --port 10095
@@ -91,8 +85,8 @@
## Workflow in design
![avatar](proto/workflow.png)
<div align="left"><img src="proto/workflow.png" width="400"/>
## Reference
We borrow from or refer to the following code:
funasr/runtime/python/grpc/grpc_server.py
@@ -24,7 +24,7 @@
            self.inference_16k_pipeline = pipeline(task=Tasks.auto_speech_recognition, model=model, vad_model=vad_model, punc_model=punc_model)
        elif self.backend == "onnxruntime":
            try:
                from rapid_paraformer.paraformer_onnx import Paraformer
                from funasr_onnx import Paraformer
            except ImportError:
                raise ImportError(f"Please install onnxruntime environment")
            self.inference_16k_pipeline = Paraformer(model_dir=onnx_dir)
funasr/runtime/python/grpc/requirements_client.txt
New file
@@ -0,0 +1,4 @@
pyaudio
webrtcvad
grpcio
grpcio-tools
funasr/runtime/python/grpc/requirements_server.txt
New file
@@ -0,0 +1,2 @@
grpcio
grpcio-tools
funasr/runtime/python/libtorch/README.md
@@ -1,5 +1,6 @@
## Using paraformer with libtorch
## Using funasr with libtorch
[FunASR](https://github.com/alibaba-damo-academy/FunASR) hopes to build a bridge between academic research and industrial applications on speech recognition. By supporting the training & finetuning of the industrial-grade speech recognition model released on ModelScope, researchers and developers can conduct research and production of speech recognition models more conveniently, and promote the development of speech recognition ecology. ASR for Fun!
### Introduction
- Model comes from [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
@@ -7,13 +8,6 @@
### Steps:
1. Export the model.
   - Command: (`Tips`: torch >= 1.11.0 is required.)
      ```shell
      python -m funasr.export.export_model [model_name] [export_dir] false
      ```
      `model_name`: the model to export.
      `export_dir`: the dir where the onnx model is exported.
       More details can be found in the ([export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export))
@@ -27,13 +21,20 @@
         ```
2. Install the `torch_paraformer`.
2. Install the `funasr_torch`.
    install from pip
    ```shell
    pip install --upgrade funasr_torch -i https://pypi.Python.org/simple
    ```
    or install from source code
    ```shell
    git clone https://github.com/alibaba/FunASR.git && cd FunASR
    cd funasr/runtime/python/libtorch
    cd funasr/runtime/python/funasr_torch
    python setup.py build
    python setup.py install
    ```
3. Run the demo.
   - Model_dir: the model path, which contains `model.torchscripts`, `config.yaml`, `am.mvn`.
@@ -41,7 +42,7 @@
   - Output: `List[str]`: recognition result.
   - Example:
        ```python
        from torch_paraformer import Paraformer
        from funasr_torch import Paraformer
        model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
        model = Paraformer(model_dir, batch_size=1)
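        # assumed continuation of the demo (it mirrors demo.py; the wav path is
        # hypothetical, and the List[str] output follows the Output note above)
        wav_path = "asr_example.wav"
        result = model(wav_path)
        print(result)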
@@ -65,3 +66,4 @@
|   Onnx   |   0.038    |
## Acknowledge
This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
funasr/runtime/python/libtorch/demo.py
@@ -1,5 +1,5 @@
from torch_paraformer import Paraformer
from funasr_torch import Paraformer
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1)
funasr/runtime/python/libtorch/funasr_torch/__init__.py
funasr/runtime/python/libtorch/funasr_torch/paraformer_bin.py
funasr/runtime/python/libtorch/funasr_torch/utils/__init__.py
funasr/runtime/python/libtorch/funasr_torch/utils/compute_wer.py
funasr/runtime/python/libtorch/funasr_torch/utils/frontend.py
funasr/runtime/python/libtorch/funasr_torch/utils/postprocess_utils.py
funasr/runtime/python/libtorch/funasr_torch/utils/timestamp_utils.py
funasr/runtime/python/libtorch/funasr_torch/utils/utils.py
funasr/runtime/python/libtorch/setup.py
@@ -14,8 +14,8 @@
setuptools.setup(
    name='torch_paraformer',
    version='0.0.1',
    name='funasr_torch',
    version='0.0.3',
    platforms="Any",
    url="https://github.com/alibaba-damo-academy/FunASR.git",
    author="Speech Lab, Alibaba Group, China",
@@ -31,7 +31,7 @@
                      "PyYAML>=5.1.2", "torch-quant >= 0.4.0"],
    packages=find_packages(include=["torch_paraformer*"]),
    keywords=[
        'funasr,paraformer'
        'funasr,paraformer, funasr_torch'
    ],
    classifiers=[
        'Programming Language :: Python :: 3.6',
funasr/runtime/python/onnxruntime/README.md
@@ -1,9 +1,5 @@
## Using paraformer with ONNXRuntime
## Using funasr with ONNXRuntime
<p align="left">
    <a href=""><img src="https://img.shields.io/badge/Python->=3.7,<=3.10-aff.svg"></a>
    <a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-pink.svg"></a>
</p>
### Introduction
- Model comes from [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
@@ -12,13 +8,6 @@
### Steps:
1. Export the model.
   - Command: (`Tips`: torch >= 1.11.0 is required.)
      ```shell
      python -m funasr.export.export_model [model_name] [export_dir] [true]
      ```
      `model_name`: the model to export.
      `export_dir`: the dir where the onnx model is exported.
       More details can be found in the ([export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export))
@@ -32,17 +21,21 @@
         ```
2. Install the `rapid_paraformer`.
   - Build the rapid_paraformer `whl`
     ```shell
     git clone https://github.com/alibaba/FunASR.git && cd FunASR
     cd funasr/runtime/python/onnxruntime
     python setup.py bdist_wheel
     ```
   - Install the build `whl`
     ```bash
     pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl
     ```
2. Install the `funasr_onnx`
install from pip
```shell
pip install --upgrade funasr_onnx -i https://pypi.Python.org/simple
```
or install from source code
```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/funasr_onnx
python setup.py build
python setup.py install
```
3. Run the demo.
   - Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`.
@@ -50,7 +43,7 @@
   - Output: `List[str]`: recognition result.
   - Example:
        ```python
        from rapid_paraformer import Paraformer
        from funasr_onnx import Paraformer
        model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
        model = Paraformer(model_dir, batch_size=1)
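        # assumed continuation of the demo (it mirrors demo.py; the wav path is
        # hypothetical, and the List[str] output follows the Output note above)
        wav_path = "asr_example.wav"
        result = model(wav_path)
        print(result)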
@@ -74,4 +67,5 @@
## Acknowledge
1. We acknowledge [SWHL](https://github.com/RapidAI/RapidASR) for contributing the onnxruntime(python api).
1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
2. We acknowledge [SWHL](https://github.com/RapidAI/RapidASR) for contributing the onnxruntime (for paraformer model).
funasr/runtime/python/onnxruntime/debug.png
Binary files differ
funasr/runtime/python/onnxruntime/demo.py
@@ -1,5 +1,5 @@
from rapid_paraformer import Paraformer
from funasr_onnx import Paraformer
#model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
#model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py
copy from funasr/runtime/python/libtorch/torch_paraformer/__init__.py
copy to funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py
funasr/runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py
File was renamed from funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py
@@ -1,7 +1,5 @@
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
from cgitb import text
import os.path
from pathlib import Path
from typing import List, Union, Tuple
funasr/runtime/python/onnxruntime/funasr_onnx/utils/__init__.py
funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py
funasr/runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py
funasr/runtime/python/onnxruntime/funasr_onnx/utils/timestamp_utils.py
funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
funasr/runtime/python/onnxruntime/rapid_paraformer/__init__.py
File was deleted
funasr/runtime/python/onnxruntime/setup.py
@@ -12,17 +12,17 @@
    return readme
MODULE_NAME = 'rapid_paraformer'
VERSION_NUM = '0.0.1'
MODULE_NAME = 'funasr_onnx'
VERSION_NUM = '0.0.2'
setuptools.setup(
    name=MODULE_NAME,
    version=VERSION_NUM,
    platforms="Any",
    description="Using paraformer with ONNXRuntime",
    author="FunASR",
    url="https://github.com/alibaba-damo-academy/FunASR.git",
    author="Speech Lab, Alibaba Group, China",
    author_email="funasr@list.alibaba-inc.com",
    url="https://github.com/alibaba-damo-academy/FunASR",
    description="FunASR: A Fundamental End-to-End Speech Recognition Toolkit",
    license='MIT',
    long_description=get_readme(),
    long_description_content_type='text/markdown',
funasr/runtime/python/utils/infer.py
@@ -15,9 +15,9 @@
args = parser.parse_args()
from funasr.runtime.python.libtorch.torch_paraformer import Paraformer
from funasr.runtime.python.libtorch.funasr_torch import Paraformer
if args.backend == "onnx":
    from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer
    from funasr.runtime.python.onnxruntime.funasr_onnx import Paraformer
    
model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads)
funasr/runtime/python/utils/test_rtf.py
@@ -14,9 +14,9 @@
args = parser.parse_args()
from funasr.runtime.python.libtorch.torch_paraformer import Paraformer
from funasr.runtime.python.libtorch.funasr_torch import Paraformer
if args.backend == "onnx":
    from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer
    from funasr.runtime.python.onnxruntime.funasr_onnx import Paraformer
    
model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads)
funasr/runtime/python/websocket/README.md
@@ -2,7 +2,6 @@
We can send streaming audio data to the server in real time with the grpc client (e.g., every 300 ms), and get the transcribed text back when the speaker stops.
The audio data is streamed to the server, while the ASR inference itself runs in offline mode.
# Steps
## For the Server
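A minimal sketch of the 300 ms chunking described above, assuming 16 kHz mono input; the wav path is illustrative, and the actual send call is transport-specific, so it is left as a comment:

```python
import soundfile

speech, fs = soundfile.read("asr_example.wav")  # hypothetical 16 kHz mono file
step = fs * 300 // 1000                         # samples per 300 ms chunk
for offset in range(0, len(speech), step):
    chunk = speech[offset: offset + step]
    # send `chunk` to the server here, then read back the transcription
```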
funasr/version.txt
@@ -1 +1 @@
0.3.0
0.3.1