Merge branch 'alibaba-damo-academy:main' into main
20个文件已修改
2个文件已添加
14 文件已重命名
2个文件已删除
1 文件已复制
| | |
| | | import os |
| | | import logging |
| | | import torch |
| | | import torchaudio |
| | | import soundfile |
| | | |
| | | from modelscope.pipelines import pipeline |
| | | from modelscope.utils.constant import Tasks |
| | | |
| | | from modelscope.utils.logger import get_logger |
| | | import logging |
| | | |
| | | logger = get_logger(log_level=logging.CRITICAL) |
| | | logger.setLevel(logging.CRITICAL) |
| | | |
| | | os.environ["MODELSCOPE_CACHE"] = "./" |
| | | inference_pipeline = pipeline( |
| | | task=Tasks.auto_speech_recognition, |
| | | model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online', |
| | | model_revision='v1.0.2') |
| | | |
| | | waveform, sample_rate = torchaudio.load("waihu.wav") |
| | | speech_length = waveform.shape[1] |
| | | speech = waveform[0] |
| | | model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online") |
| | | speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav")) |
| | | speech_length = speech.shape[0] |
| | | |
| | | cache_en = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None} |
| | | cache_de = {"decode_fsmn": None} |
| | | cache = {"encoder": cache_en, "decoder": cache_de} |
| | | param_dict = {} |
| | | param_dict["cache"] = cache |
| | | |
| | | first_chunk = True |
| | | speech_buffer = speech |
| | | speech_cache = [] |
| | | sample_offset = 0 |
| | | step = 4800 #300ms |
| | | param_dict = {"cache": dict(), "is_final": False} |
| | | final_result = "" |
| | | |
| | | while len(speech_buffer) >= 960: |
| | | if first_chunk: |
| | | if len(speech_buffer) >= 14400: |
| | | rec_result = inference_pipeline(audio_in=speech_buffer[0:14400], param_dict=param_dict) |
| | | speech_buffer = speech_buffer[4800:] |
| | | else: |
| | | cache_en["stride"] = len(speech_buffer) // 960 |
| | | cache_en["pad_right"] = 0 |
| | | rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict) |
| | | speech_buffer = [] |
| | | cache_en["start_idx"] = -5 |
| | | first_chunk = False |
| | | else: |
| | | cache_en["start_idx"] += 10 |
| | | if len(speech_buffer) >= 4800: |
| | | cache_en["pad_left"] = 5 |
| | | rec_result = inference_pipeline(audio_in=speech_buffer[:19200], param_dict=param_dict) |
| | | speech_buffer = speech_buffer[9600:] |
| | | else: |
| | | cache_en["stride"] = len(speech_buffer) // 960 |
| | | cache_en["pad_right"] = 0 |
| | | rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict) |
| | | speech_buffer = [] |
| | | if len(rec_result) !=0 and rec_result['text'] != "sil": |
| | | for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)): |
| | | if sample_offset + step >= speech_length - 1: |
| | | step = speech_length - sample_offset |
| | | param_dict["is_final"] = True |
| | | rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + step], |
| | | param_dict=param_dict) |
| | | if len(rec_result) != 0 and rec_result['text'] != "sil" and rec_result['text'] != "waiting_for_more_voice": |
| | | final_result += rec_result['text'] |
| | | print(rec_result) |
| | | print(final_result) |
| | |
| | | sample_offset = 0 |
| | | |
| | | step = 160 * 10 |
| | | param_dict = {'in_cache': dict()} |
| | | param_dict = {'in_cache': dict(), 'max_end_sil': 800} |
| | | for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)): |
| | | if sample_offset + step >= speech_length - 1: |
| | | step = speech_length - sample_offset |
| | |
| | | sample_offset = 0 |
| | | |
| | | step = 80 * 10 |
| | | param_dict = {'in_cache': dict()} |
| | | param_dict = {'in_cache': dict(), 'max_end_sil': 800} |
| | | for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)): |
| | | if sample_offset + step >= speech_length - 1: |
| | | step = speech_length - sample_offset |
| | |
| | | ) |
| | | |
| | | export_mode = False |
| | | if param_dict is not None: |
| | | hotword_list_or_file = param_dict.get('hotword') |
| | | export_mode = param_dict.get("export_mode", False) |
| | | else: |
| | | hotword_list_or_file = None |
| | | |
| | | if ngpu >= 1 and torch.cuda.is_available(): |
| | | device = "cuda" |
| | |
| | | ngram_weight=ngram_weight, |
| | | penalty=penalty, |
| | | nbest=nbest, |
| | | hotword_list_or_file=hotword_list_or_file, |
| | | ) |
| | | if export_mode: |
| | | speech2text = Speech2TextExport(**speech2text_kwargs) |
| | |
| | | **kwargs, |
| | | ): |
| | | |
| | | hotword_list_or_file = None |
| | | if param_dict is not None: |
| | | hotword_list_or_file = param_dict.get('hotword') |
| | | if 'hotword' in kwargs: |
| | | hotword_list_or_file = kwargs['hotword'] |
| | | if hotword_list_or_file is not None or 'hotword' in kwargs: |
| | | speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file) |
| | | |
| | | # 3. Build data-iterator |
| | | if data_path_and_name_and_type is None and raw_inputs is not None: |
| | | if isinstance(raw_inputs, torch.Tensor): |
| | | raw_inputs = raw_inputs.numpy() |
| | | data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] |
| | | loader = ASRTask.build_streaming_iterator( |
| | | data_path_and_name_and_type, |
| | | dtype=dtype, |
| | | fs=fs, |
| | | batch_size=batch_size, |
| | | key_file=key_file, |
| | | num_workers=num_workers, |
| | | preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False), |
| | | collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False), |
| | | allow_variable_data_keys=allow_variable_data_keys, |
| | | inference=True, |
| | | ) |
| | | if isinstance(raw_inputs, np.ndarray): |
| | | raw_inputs = torch.tensor(raw_inputs) |
| | | |
| | | if param_dict is not None: |
| | | use_timestamp = param_dict.get('use_timestamp', True) |
| | | else: |
| | | use_timestamp = True |
| | | |
| | | forward_time_total = 0.0 |
| | | length_total = 0.0 |
| | | finish_count = 0 |
| | | file_count = 1 |
| | | cache = None |
| | | is_final = False |
| | | if param_dict is not None and "cache" in param_dict: |
| | | cache = param_dict["cache"] |
| | | if param_dict is not None and "is_final" in param_dict: |
| | | is_final = param_dict["is_final"] |
| | | # 7 .Start for-loop |
| | | # FIXME(kamo): The output format should be discussed about |
| | | asr_result_list = [] |
| | | output_path = output_dir_v2 if output_dir_v2 is not None else output_dir |
| | | if output_path is not None: |
| | | writer = DatadirWriter(output_path) |
| | | results = [] |
| | | asr_result = "" |
| | | wait = True |
| | | if len(cache) == 0: |
| | | cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None} |
| | | cache_de = {"decode_fsmn": None} |
| | | cache["decoder"] = cache_de |
| | | cache["first_chunk"] = True |
| | | cache["speech"] = [] |
| | | cache["chunk_index"] = 0 |
| | | cache["speech_chunk"] = [] |
| | | |
| | | if raw_inputs is not None: |
| | | if len(cache["speech"]) == 0: |
| | | cache["speech"] = raw_inputs |
| | | else: |
| | | cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0) |
| | | if len(cache["speech_chunk"]) == 0: |
| | | cache["speech_chunk"] = raw_inputs |
| | | else: |
| | | cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0) |
| | | while len(cache["speech_chunk"]) >= 960: |
| | | if cache["first_chunk"]: |
| | | if len(cache["speech_chunk"]) >= 14400: |
| | | speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0) |
| | | speech_length = torch.tensor([14400]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"]= cache["speech_chunk"][4800:] |
| | | cache["first_chunk"] = False |
| | | cache["encoder"]["start_idx"] = -5 |
| | | wait = False |
| | | else: |
| | | if is_final: |
| | | cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960 |
| | | cache["encoder"]["pad_right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech_chunk"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech_chunk"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = [] |
| | | wait = False |
| | | else: |
| | | break |
| | | else: |
| | | if len(cache["speech_chunk"]) >= 19200: |
| | | cache["encoder"]["start_idx"] += 10 |
| | | cache["encoder"]["pad_left"] = 5 |
| | | speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0) |
| | | speech_length = torch.tensor([19200]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = cache["speech_chunk"][9600:] |
| | | wait = False |
| | | else: |
| | | if is_final: |
| | | cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960 |
| | | cache["encoder"]["pad_right"] = 0 |
| | | speech = torch.unsqueeze(cache["speech_chunk"], axis=0) |
| | | speech_length = torch.tensor([len(cache["speech_chunk"])]) |
| | | results = speech2text(cache, speech, speech_length) |
| | | cache["speech_chunk"] = [] |
| | | wait = False |
| | | else: |
| | | break |
| | | |
| | | if len(results) >= 1: |
| | | asr_result += results[0][0] |
| | | if asr_result == "": |
| | | asr_result = "sil" |
| | | if wait: |
| | | asr_result = "waiting_for_more_voice" |
| | | item = {'key': "utt", 'value': asr_result} |
| | | asr_result_list.append(item) |
| | | else: |
| | | writer = None |
| | | if param_dict is not None and "cache" in param_dict: |
| | | cache = param_dict["cache"] |
| | | for keys, batch in loader: |
| | | assert isinstance(batch, dict), type(batch) |
| | | assert all(isinstance(s, str) for s in keys), keys |
| | | _bs = len(next(iter(batch.values()))) |
| | | assert len(keys) == _bs, f"{len(keys)} != {_bs}" |
| | | # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")} |
| | | logging.info("decoding, utt_id: {}".format(keys)) |
| | | # N-best list of (text, token, token_int, hyp_object) |
| | | |
| | | time_beg = time.time() |
| | | results = speech2text(cache=cache, **batch) |
| | | if len(results) < 1: |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest |
| | | time_end = time.time() |
| | | forward_time = time_end - time_beg |
| | | lfr_factor = results[0][-1] |
| | | length = results[0][-2] |
| | | forward_time_total += forward_time |
| | | length_total += length |
| | | rtf_cur = "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".format(length, forward_time, |
| | | 100 * forward_time / ( |
| | | length * lfr_factor)) |
| | | logging.info(rtf_cur) |
| | | |
| | | for batch_id in range(_bs): |
| | | result = [results[batch_id][:-2]] |
| | | |
| | | key = keys[batch_id] |
| | | for n, result in zip(range(1, nbest + 1), result): |
| | | text, token, token_int, hyp = result[0], result[1], result[2], result[3] |
| | | time_stamp = None if len(result) < 5 else result[4] |
| | | # Create a directory: outdir/{n}best_recog |
| | | if writer is not None: |
| | | ibest_writer = writer[f"{n}best_recog"] |
| | | |
| | | # Write the result to each file |
| | | ibest_writer["token"][key] = " ".join(token) |
| | | # ibest_writer["token_int"][key] = " ".join(map(str, token_int)) |
| | | ibest_writer["score"][key] = str(hyp.score) |
| | | ibest_writer["rtf"][key] = rtf_cur |
| | | |
| | | if text is not None: |
| | | if use_timestamp and time_stamp is not None: |
| | | postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp) |
| | | else: |
| | | postprocessed_result = postprocess_utils.sentence_postprocess(token) |
| | | time_stamp_postprocessed = "" |
| | | if len(postprocessed_result) == 3: |
| | | text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \ |
| | | postprocessed_result[1], \ |
| | | postprocessed_result[2] |
| | | else: |
| | | text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1] |
| | | item = {'key': key, 'value': text_postprocessed} |
| | | if time_stamp_postprocessed != "": |
| | | item['time_stamp'] = time_stamp_postprocessed |
| | | asr_result_list.append(item) |
| | | finish_count += 1 |
| | | # asr_utils.print_progress(finish_count / file_count) |
| | | if writer is not None: |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | |
| | | logging.info("decoding, utt: {}, predictions: {}".format(key, text)) |
| | | rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, |
| | | forward_time_total, |
| | | 100 * forward_time_total / ( |
| | | length_total * lfr_factor)) |
| | | logging.info(rtf_avg) |
| | | if writer is not None: |
| | | ibest_writer["rtf"]["rtf_avf"] = rtf_avg |
| | | return [] |
| | | return asr_result_list |
| | | |
| | | return _forward |
| | |
| | | # rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav') |
| | | # print(rec_result) |
| | | |
| | | |
| | |
| | | |
| | | # Change integer-ids to tokens |
| | | token = self.converter.ids2tokens(token_int) |
| | | token = list(filter(lambda x: x != "<gbg>", token)) |
| | | |
| | | if self.tokenizer is not None: |
| | | text = self.tokenizer.tokens2text(token) |
| | |
| | | finish_count += 1 |
| | | asr_utils.print_progress(finish_count / file_count) |
| | | if writer is not None: |
| | | ibest_writer["text"][key] = text |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | return asr_result_list |
| | | |
| | | return _forward |
| | |
| | | |
| | | # Change integer-ids to tokens |
| | | token = self.converter.ids2tokens(token_int) |
| | | token = list(filter(lambda x: x != "<gbg>", token)) |
| | | |
| | | if self.tokenizer is not None: |
| | | text = self.tokenizer.tokens2text(token) |
| | |
| | | finish_count += 1 |
| | | asr_utils.print_progress(finish_count / file_count) |
| | | if writer is not None: |
| | | ibest_writer["text"][key] = text |
| | | ibest_writer["text"][key] = text_postprocessed |
| | | return asr_result_list |
| | | |
| | | return _forward |
| | |
| | | from funasr.models.frontend.wav_frontend import WavFrontend |
| | | from funasr.bin.vad_inference import Speech2VadSegment |
| | | |
| | | |
| | | header_colors = '\033[95m' |
| | | end_colors = '\033[0m' |
| | | |
| | | |
| | | class Speech2VadSegmentOnline(Speech2VadSegment): |
| | |
| | | @torch.no_grad() |
| | | def __call__( |
| | | self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, |
| | | in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False |
| | | in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800 |
| | | ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]: |
| | | """Inference |
| | | |
| | |
| | | "feats": feats, |
| | | "waveform": waveforms, |
| | | "in_cache": in_cache, |
| | | "is_final": is_final |
| | | "is_final": is_final, |
| | | "max_end_sil": max_end_sil |
| | | } |
| | | # a. To device |
| | | batch = to_device(batch, device=self.device) |
| | |
| | | vad_results = [] |
| | | batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict() |
| | | is_final = param_dict['is_final'] if param_dict is not None else False |
| | | max_end_sil = param_dict['max_end_sil'] if param_dict is not None else 800 |
| | | for keys, batch in loader: |
| | | assert isinstance(batch, dict), type(batch) |
| | | assert all(isinstance(s, str) for s in keys), keys |
| | |
| | | assert len(keys) == _bs, f"{len(keys)} != {_bs}" |
| | | batch['in_cache'] = batch_in_cache |
| | | batch['is_final'] = is_final |
| | | batch['max_end_sil'] = max_end_sil |
| | | |
| | | # do vad segment |
| | | _, results, param_dict['in_cache'] = speech2vadsegment(**batch) |
old mode 100755
new mode 100644
| | |
| | | return segments, in_cache |
| | | |
| | | def forward_online(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(), |
| | | is_final: bool = False |
| | | is_final: bool = False, max_end_sil: int = 800 |
| | | ) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]: |
| | | self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres |
| | | self.waveform = waveform # compute decibel for each frame |
| | | self.ComputeDecibel() |
| | | self.ComputeScores(feats, in_cache) |
| | |
| | | The audio data is sent in streaming mode, while the ASR inference process runs in offline mode. |
| | | |
| | | |
| | | ## Steps |
| | | |
| | | Step 1-1) Prepare server modelscope pipeline environment (on server). |
| | | ## For the Server |
| | | |
| | | Install modelscope and funasr with pip or with cuda-docker image. |
| | | ### Prepare server environment |
| | | #### Backend is modelscope pipeline (default) |
| | | Install the modelscope and funasr |
| | | |
| | | Option 1: Install modelscope and funasr with [pip](https://github.com/alibaba-damo-academy/FunASR#installation) |
| | | |
| | | Option 2: or install with cuda-docker image as: |
| | | |
| | | ``` |
| | | CID=`docker run --network host -d -it --gpus '"device=0"' registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-1.2.0` |
| | | echo $CID |
| | | docker exec -it $CID /bin/bash |
| | | ``` |
| | | Get funasr source code and get into grpc directory. |
| | | ``` |
| | | git clone https://github.com/alibaba-damo-academy/FunASR |
| | | cd FunASR/funasr/runtime/python/grpc/ |
| | | ``` |
| | | |
| | | Step 1-2) Optional, Prepare server onnxruntime environment (on server). |
| | | |
| | | Install [`onnx_paraformer`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime). |
| | | |
| | | - Build the onnx_paraformer `whl` |
| | | ``` |
| | | ```shell |
| | | pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/onnxruntime/rapid_paraformer |
| | | python setup.py build |
| | | python setup.py install |
| | | pip install --editable ./ |
| | | ``` |
| | | |
| | | [//]: # () |
| | | [//]: # (- Install the build `whl`) |
| | | Install the requirements |
| | | |
| | | [//]: # (```) |
| | | ```shell |
| | | cd funasr/runtime/python/grpc |
| | | pip install -r requirements_server.txt |
| | | ``` |
| | | |
| | | [//]: # (pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl) |
| | | #### Backend is funasr_onnx (optional) |
| | | |
| | | [//]: # (```) |
| | | Install [`funasr_onnx`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime). |
| | | |
| | | ``` |
| | | pip install funasr_onnx -i https://pypi.Python.org/simple |
| | | ``` |
| | | |
| | | Export the model, more details ref to [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime). |
| | | ```shell |
| | | python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True |
| | | ``` |
| | | |
| | | Step 2) Optional, generate protobuf file (run on server, the two generated pb files are both used for server and client). |
| | | ``` |
| | | # Optional, Install dependency. |
| | | python -m pip install grpcio grpcio-tools |
| | | ``` |
| | | ### Generate protobuf file |
| | | Run this on the server; the two generated pb files are used by both the server and the client. |
| | | |
| | | ``` |
| | | ```shell |
| | | # paraformer_pb2.py and paraformer_pb2_grpc.py are already generated, |
| | | # regenerate it only when you make changes to ./proto/paraformer.proto file. |
| | | python -m grpc_tools.protoc --proto_path=./proto -I ./proto --python_out=. --grpc_python_out=./ ./proto/paraformer.proto |
| | | ``` |
| | | |
| | | Step 3) Start grpc server (on server). |
| | | ``` |
| | | # Optional, Install dependency. |
| | | python -m pip install grpcio grpcio-tools |
| | | ``` |
| | | ### Start grpc server |
| | | |
| | | ``` |
| | | # Start server. |
| | | python grpc_main_server.py --port 10095 --backend pipeline |
| | | ``` |
| | | |
| | | If you want to run the server with onnxruntime, please set the `backend` and `onnx_dir` parameters. |
| | | ``` |
| | | # Start server. |
| | | python grpc_main_server.py --port 10095 --backend onnxruntime --onnx_dir /models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch |
| | | ``` |
| | | |
| | | ## For the client |
| | | |
| | | Step 4) Start grpc client (on client with microphone). |
| | | ### Install the requirements |
| | | |
| | | ```shell |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/grpc |
| | | pip install -r requirements_client.txt |
| | | ``` |
| | | # Optional, Install dependency. |
| | | python -m pip install pyaudio webrtcvad grpcio grpcio-tools |
| | | |
| | | ### Generate protobuf file |
| | | Run this on the server; the two generated pb files are used by both the server and the client. |
| | | |
| | | ```shell |
| | | # paraformer_pb2.py and paraformer_pb2_grpc.py are already generated, |
| | | # regenerate it only when you make changes to ./proto/paraformer.proto file. |
| | | python -m grpc_tools.protoc --proto_path=./proto -I ./proto --python_out=. --grpc_python_out=./ ./proto/paraformer.proto |
| | | ``` |
| | | |
| | | ### Start grpc client |
| | | ``` |
| | | # Start client. |
| | | python grpc_main_client_mic.py --host 127.0.0.1 --port 10095 |
| | |
| | | |
| | | |
| | | ## Workflow in design |
| | |  |
| | | |
| | | <div align="left"><img src="proto/workflow.png" width="400"/> |
| | | |
| | | ## Reference |
| | | We borrow from or refer to some code as: |
| | |
| | | self.inference_16k_pipeline = pipeline(task=Tasks.auto_speech_recognition, model=model, vad_model=vad_model, punc_model=punc_model) |
| | | elif self.backend == "onnxruntime": |
| | | try: |
| | | from rapid_paraformer.paraformer_onnx import Paraformer |
| | | from funasr_onnx import Paraformer |
| | | except ImportError: |
| | | raise ImportError(f"Please install onnxruntime environment") |
| | | self.inference_16k_pipeline = Paraformer(model_dir=onnx_dir) |
| New file |
| | |
| | | pyaudio |
| | | webrtcvad |
| | | grpcio |
| | | grpcio-tools |
| New file |
| | |
| | | grpcio |
| | | grpcio-tools |
| | |
| | | ## Using paraformer with libtorch |
| | | ## Using funasr with libtorch |
| | | |
| | | [FunASR](https://github.com/alibaba-damo-academy/FunASR) hopes to build a bridge between academic research and industrial applications on speech recognition. By supporting the training & finetuning of the industrial-grade speech recognition model released on ModelScope, researchers and developers can conduct research and production of speech recognition models more conveniently, and promote the development of speech recognition ecology. ASR for Fun! |
| | | |
| | | ### Introduction |
| | | - Model comes from [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary). |
| | |
| | | ### Steps: |
| | | 1. Export the model. |
| | | - Command: (`Tips`: torch >= 1.11.0 is required.) |
| | | |
| | | ```shell |
| | | python -m funasr.export.export_model [model_name] [export_dir] false |
| | | ``` |
| | | `model_name`: the name of the model to export. |
| | | |
| | | `export_dir`: the directory where the exported model is saved. |
| | | |
| | | More details ref to ([export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)) |
| | | |
| | |
| | | ``` |
| | | |
| | | |
| | | 2. Install the `torch_paraformer`. |
| | | 2. Install the `funasr_torch`. |
| | | |
| | | install from pip |
| | | ```shell |
| | | pip install --upgrade funasr_torch -i https://pypi.Python.org/simple |
| | | ``` |
| | | or install from source code |
| | | |
| | | ```shell |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/libtorch |
| | | cd funasr/runtime/python/funasr_torch |
| | | python setup.py build |
| | | python setup.py install |
| | | ``` |
| | | |
| | | |
| | | 3. Run the demo. |
| | | - Model_dir: the model path, which contains `model.torchscripts`, `config.yaml`, `am.mvn`. |
| | |
| | | - Output: `List[str]`: recognition result. |
| | | - Example: |
| | | ```python |
| | | from torch_paraformer import Paraformer |
| | | from funasr_torch import Paraformer |
| | | |
| | | model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model = Paraformer(model_dir, batch_size=1) |
| | |
| | | | Onnx | 0.038 | |
| | | |
| | | ## Acknowledge |
| | | This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR). |
| | |
| | | |
| | | from torch_paraformer import Paraformer |
| | | from funasr_torch import Paraformer |
| | | |
| | | model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model = Paraformer(model_dir, batch_size=1) |
| | |
| | | |
| | | |
| | | setuptools.setup( |
| | | name='torch_paraformer', |
| | | version='0.0.1', |
| | | name='funasr_torch', |
| | | version='0.0.3', |
| | | platforms="Any", |
| | | url="https://github.com/alibaba-damo-academy/FunASR.git", |
| | | author="Speech Lab, Alibaba Group, China", |
| | |
| | | "PyYAML>=5.1.2", "torch-quant >= 0.4.0"], |
| | | packages=find_packages(include=["torch_paraformer*"]), |
| | | keywords=[ |
| | | 'funasr,paraformer' |
| | | 'funasr,paraformer, funasr_torch' |
| | | ], |
| | | classifiers=[ |
| | | 'Programming Language :: Python :: 3.6', |
| | |
| | | ## Using paraformer with ONNXRuntime |
| | | ## Using funasr with ONNXRuntime |
| | | |
| | | <p align="left"> |
| | | <a href=""><img src="https://img.shields.io/badge/Python->=3.7,<=3.10-aff.svg"></a> |
| | | <a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-pink.svg"></a> |
| | | </p> |
| | | |
| | | ### Introduction |
| | | - Model comes from [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary). |
| | |
| | | ### Steps: |
| | | 1. Export the model. |
| | | - Command: (`Tips`: torch >= 1.11.0 is required.) |
| | | |
| | | ```shell |
| | | python -m funasr.export.export_model [model_name] [export_dir] [true] |
| | | ``` |
| | | `model_name`: the name of the model to export. |
| | | |
| | | `export_dir`: the directory where the exported ONNX model is saved. |
| | | |
| | | More details ref to ([export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)) |
| | | |
| | |
| | | ``` |
| | | |
| | | |
| | | 2. Install the `rapid_paraformer`. |
| | | - Build the rapid_paraformer `whl` |
| | | ```shell |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/onnxruntime |
| | | python setup.py bdist_wheel |
| | | ``` |
| | | - Install the build `whl` |
| | | ```bash |
| | | pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl |
| | | ``` |
| | | 2. Install the `funasr_onnx` |
| | | |
| | | install from pip |
| | | ```shell |
| | | pip install --upgrade funasr_onnx -i https://pypi.Python.org/simple |
| | | ``` |
| | | |
| | | or install from source code |
| | | |
| | | ```shell |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/funasr_onnx |
| | | python setup.py build |
| | | python setup.py install |
| | | ``` |
| | | |
| | | 3. Run the demo. |
| | | - Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`. |
| | |
| | | - Output: `List[str]`: recognition result. |
| | | - Example: |
| | | ```python |
| | | from rapid_paraformer import Paraformer |
| | | from funasr_onnx import Paraformer |
| | | |
| | | model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model = Paraformer(model_dir, batch_size=1) |
| | |
| | | |
| | | |
| | | ## Acknowledge |
| | | 1. We acknowledge [SWHL](https://github.com/RapidAI/RapidASR) for contributing the onnxruntime(python api). |
| | | 1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR). |
| | | 2. We acknowledge [SWHL](https://github.com/RapidAI/RapidASR) for contributing the onnxruntime (for paraformer model). |
| | |
| | | |
| | | from rapid_paraformer import Paraformer |
| | | from funasr_onnx import Paraformer |
| | | |
| | | #model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | #model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
copy from funasr/runtime/python/libtorch/torch_paraformer/__init__.py
copy to funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py
| File was renamed from funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py |
| | |
| | | # -*- encoding: utf-8 -*- |
| | | # @Author: SWHL |
| | | # @Contact: liekkaskono@163.com |
| | | from cgitb import text |
| | | |
| | | import os.path |
| | | from pathlib import Path |
| | | from typing import List, Union, Tuple |
| | |
| | | return readme |
| | | |
| | | |
| | | MODULE_NAME = 'rapid_paraformer' |
| | | VERSION_NUM = '0.0.1' |
| | | MODULE_NAME = 'funasr_onnx' |
| | | VERSION_NUM = '0.0.2' |
| | | |
| | | setuptools.setup( |
| | | name=MODULE_NAME, |
| | | version=VERSION_NUM, |
| | | platforms="Any", |
| | | description="Using paraformer with ONNXRuntime", |
| | | author="FunASR", |
| | | url="https://github.com/alibaba-damo-academy/FunASR.git", |
| | | author="Speech Lab, Alibaba Group, China", |
| | | author_email="funasr@list.alibaba-inc.com", |
| | | url="https://github.com/alibaba-damo-academy/FunASR", |
| | | description="FunASR: A Fundamental End-to-End Speech Recognition Toolkit", |
| | | license='MIT', |
| | | long_description=get_readme(), |
| | | long_description_content_type='text/markdown', |
| | |
| | | args = parser.parse_args() |
| | | |
| | | |
| | | from funasr.runtime.python.libtorch.torch_paraformer import Paraformer |
| | | from funasr.runtime.python.libtorch.funasr_torch import Paraformer |
| | | if args.backend == "onnx": |
| | | from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer |
| | | from funasr.runtime.python.onnxruntime.funasr_onnx import Paraformer |
| | | |
| | | model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads) |
| | | |
| | |
| | | args = parser.parse_args() |
| | | |
| | | |
| | | from funasr.runtime.python.libtorch.torch_paraformer import Paraformer |
| | | from funasr.runtime.python.libtorch.funasr_torch import Paraformer |
| | | if args.backend == "onnx": |
| | | from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer |
| | | from funasr.runtime.python.onnxruntime.funasr_onnx import Paraformer |
| | | |
| | | model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads) |
| | | |
| | |
| | | We can send streaming audio data to the server in real time with the grpc client (e.g., every 300 ms), and get the transcribed text back when we stop speaking. |
| | | The audio data is sent in streaming mode, while the ASR inference process runs in offline mode. |
| | | |
| | | # Steps |
| | | |
| | | ## For the Server |
| | | |