Shi Xian
2024-06-18 6c467e6f0abfc6d20d0621fbbf67b4dbd81776cc
Merge pull request #1825 from modelscope/dev_libt

Dev libt
32个文件已修改
3个文件已添加
1个文件已删除
587 ■■■■ 已修改文件
examples/industrial_data_pretraining/bicif_paraformer/export.py 18 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
examples/industrial_data_pretraining/paraformer/export.py 19 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/auto/auto_model.py 8 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/datasets/llm_datasets/datasets.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/datasets/llm_datasets_qwenaudio/datasets.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/datasets/llm_datasets_vicuna/datasets.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/datasets/sense_voice_datasets/datasets.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/frontends/default.py 1 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/bicif_paraformer/cif_predictor.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/bicif_paraformer/export_meta.py 3 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/contextual_paraformer/decoder.py 1 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/contextual_paraformer/export_meta.py 19 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/lcbnet/model.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/llm_asr/model.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/llm_asr_nar/model.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/mfcca/mfcca_encoder.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/paraformer/export_meta.py 1 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/paraformer_streaming/model.py 4 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/sanm/attention.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/seaco_paraformer/export_meta.py 7 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/sense_voice/model.py 8 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/transformer/model.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/whisper/model.py 7 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/whisper_lid/model.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/utils/export_utils.py 131 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/libtorch/demo.py 17 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/libtorch/demo_contextual_paraformer.py 13 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/libtorch/demo_paraformer.py 11 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/libtorch/demo_seaco_paraformer.py 13 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/libtorch/funasr_torch/paraformer_bin.py 239 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/libtorch/funasr_torch/utils/timestamp_utils.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/libtorch/funasr_torch/utils/utils.py 24 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py 9 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/onnxruntime/funasr_onnx/paraformer_online_bin.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/onnxruntime/funasr_onnx/punc_bin.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
runtime/python/onnxruntime/funasr_onnx/vad_bin.py 4 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
examples/industrial_data_pretraining/bicif_paraformer/export.py
@@ -12,17 +12,17 @@
    device="cpu",
)
res = model.export(type="onnx", quantize=False)
res = model.export(type="torchscripts", quantize=False)
print(res)
# method2, inference from local path
from funasr import AutoModel
# # method2, inference from local path
# from funasr import AutoModel
model = AutoModel(
    model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    device="cpu",
)
# model = AutoModel(
#     model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
#     device="cpu",
# )
res = model.export(type="onnx", quantize=False)
print(res)
# res = model.export(type="onnx", quantize=False)
# print(res)
examples/industrial_data_pretraining/paraformer/export.py
@@ -10,19 +10,20 @@
from funasr import AutoModel
model = AutoModel(
    model="iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    model="iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
)
res = model.export(type="onnx", quantize=False)
res = model.export(type="torchscripts", quantize=False)
# res = model.export(type="bladedisc", input=f"{model.model_path}/example/asr_example.wav")
print(res)
# method2, inference from local path
from funasr import AutoModel
# # method2, inference from local path
# from funasr import AutoModel
model = AutoModel(
    model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
)
# model = AutoModel(
#     model="/Users/zhifu/.cache/modelscope/hub/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
# )
res = model.export(type="onnx", quantize=False)
print(res)
# res = model.export(type="onnx", quantize=False)
# print(res)
funasr/auto/auto_model.py
@@ -602,12 +602,6 @@
        )
        with torch.no_grad():
            if type == "onnx":
                export_dir = export_utils.export_onnx(model=model, data_in=data_list, **kwargs)
            else:
                export_dir = export_utils.export_torchscripts(
                    model=model, data_in=data_list, **kwargs
                )
            export_dir = export_utils.export(model=model, data_in=data_list,  **kwargs)
        return export_dir
funasr/datasets/llm_datasets/datasets.py
@@ -64,8 +64,6 @@
    def __getitem__(self, index):
        item = self.index_ds[index]
        # import pdb;
        # pdb.set_trace()
        source = item["source"]
        data_src = load_audio_text_image_video(source, fs=self.fs)
        if self.preprocessor_speech:
funasr/datasets/llm_datasets_qwenaudio/datasets.py
@@ -66,8 +66,6 @@
    def __getitem__(self, index):
        item = self.index_ds[index]
        # import pdb;
        # pdb.set_trace()
        source = item["source"]
        data_src = load_audio_text_image_video(source, fs=self.fs)
        if self.preprocessor_speech:
funasr/datasets/llm_datasets_vicuna/datasets.py
@@ -66,8 +66,6 @@
    def __getitem__(self, index):
        item = self.index_ds[index]
        # import pdb;
        # pdb.set_trace()
        source = item["source"]
        data_src = load_audio_text_image_video(source, fs=self.fs)
        if self.preprocessor_speech:
funasr/datasets/sense_voice_datasets/datasets.py
@@ -72,8 +72,6 @@
        return len(self.index_ds)
    def __getitem__(self, index):
        # import pdb;
        # pdb.set_trace()
        output = None
        for idx in range(self.retry):
funasr/frontends/default.py
@@ -235,7 +235,6 @@
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # 1. Domain-conversion: e.g. Stft: time -> time-freq
        # import pdb;pdb.set_trace()
        if self.stft is not None:
            input_stft, feats_lens = self._compute_stft(input, input_lengths)
        else:
funasr/models/bicif_paraformer/cif_predictor.py
@@ -198,7 +198,7 @@
            output2 = self.upsample_cnn(_output)
            output2 = output2.transpose(1, 2)
            output2, _ = self.self_attn(output2, mask)
        # import pdb; pdb.set_trace()
        alphas2 = torch.sigmoid(self.cif_output2(output2))
        alphas2 = torch.nn.functional.relu(alphas2 * self.smooth_factor2 - self.noise_threshold2)
        # repeat the mask in T demension to match the upsampled length
funasr/models/bicif_paraformer/export_meta.py
@@ -29,7 +29,8 @@
    model.export_input_names = types.MethodType(export_input_names, model)
    model.export_output_names = types.MethodType(export_output_names, model)
    model.export_dynamic_axes = types.MethodType(export_dynamic_axes, model)
    model.export_name = types.MethodType(export_name, model)
    model.export_name = "model"
    return model
funasr/models/contextual_paraformer/decoder.py
@@ -424,7 +424,6 @@
        # contextual_mask = myutils.sequence_mask(contextual_length, device=memory.device)[:, None, :]
        contextual_mask = self.make_pad_mask(contextual_length)
        contextual_mask, _ = self.prepare_mask(contextual_mask)
        # import pdb; pdb.set_trace()
        contextual_mask = contextual_mask.transpose(2, 1).unsqueeze(1)
        cx, tgt_mask, _, _, _ = self.bias_decoder(
            x_self_attn, tgt_mask, bias_embed, memory_mask=contextual_mask
funasr/models/contextual_paraformer/export_meta.py
@@ -17,6 +17,21 @@
        model.bias_encoder.batch_first = False
        self.bias_encoder = model.bias_encoder
    def export_dummy_inputs(self):
        hotword = torch.tensor(
            [
                [10, 11, 12, 13, 14, 10, 11, 12, 13, 14],
                [100, 101, 0, 0, 0, 0, 0, 0, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [10, 11, 12, 13, 14, 10, 11, 12, 13, 14],
                [100, 101, 0, 0, 0, 0, 0, 0, 0, 0],
                [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            ],
            dtype=torch.int32,
        )
        # hotword_length = torch.tensor([10, 2, 1], dtype=torch.int32)
        return (hotword)
def export_rebuild_model(model, **kwargs):
    is_onnx = kwargs.get("type", "onnx") == "onnx"
@@ -59,7 +74,9 @@
    backbone_model.export_dynamic_axes = types.MethodType(
        export_backbone_dynamic_axes, backbone_model
    )
    backbone_model.export_name = types.MethodType(export_backbone_name, backbone_model)
    embedder_model.export_name = "model_eb"
    backbone_model.export_name = "model"
    return backbone_model, embedder_model
funasr/models/lcbnet/model.py
@@ -23,8 +23,6 @@
from funasr.utils.datadir_writer import DatadirWriter
from funasr.register import tables
import pdb
@tables.register("model_classes", "LCBNet")
class LCBNet(nn.Module):
funasr/models/llm_asr/model.py
@@ -166,8 +166,6 @@
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
funasr/models/llm_asr_nar/model.py
@@ -166,8 +166,6 @@
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
funasr/models/mfcca/mfcca_encoder.py
@@ -34,7 +34,6 @@
from funasr.models.transformer.utils.subsampling import TooShortUttError
from funasr.models.transformer.utils.subsampling import check_short_utt
from funasr.models.encoder.abs_encoder import AbsEncoder
import pdb
import math
@@ -363,7 +362,6 @@
        t_leng = xs_pad.size(1)
        d_dim = xs_pad.size(2)
        xs_pad = xs_pad.reshape(-1, channel_size, t_leng, d_dim)
        # pdb.set_trace()
        if channel_size < 8:
            repeat_num = math.ceil(8 / channel_size)
            xs_pad = xs_pad.repeat(1, repeat_num, 1, 1)[:, 0:8, :, :]
funasr/models/paraformer/export_meta.py
@@ -31,6 +31,7 @@
    model.export_dynamic_axes = types.MethodType(export_dynamic_axes, model)
    model.export_name = types.MethodType(export_name, model)
    model.export_name = 'model'
    return model
funasr/models/paraformer_streaming/model.py
@@ -50,8 +50,6 @@
        super().__init__(*args, **kwargs)
        # import pdb;
        # pdb.set_trace()
        self.sampling_ratio = kwargs.get("sampling_ratio", 0.2)
        self.scama_mask = None
@@ -83,8 +81,6 @@
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        decoding_ind = kwargs.get("decoding_ind")
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
funasr/models/sanm/attention.py
@@ -780,7 +780,7 @@
        return q, k, v
    def forward_attention(self, value, scores, mask, ret_attn):
        scores = scores + mask
        scores = scores + mask.to(scores.device)
        self.attn = torch.softmax(scores, dim=-1)
        context_layer = torch.matmul(self.attn, value)  # (batch, head, time1, d_k)
funasr/models/seaco_paraformer/export_meta.py
@@ -109,7 +109,9 @@
    backbone_model.export_dynamic_axes = types.MethodType(
        export_backbone_dynamic_axes, backbone_model
    )
    backbone_model.export_name = types.MethodType(export_backbone_name, backbone_model)
    embedder_model.export_name = "model_eb"
    backbone_model.export_name = "model"
    return backbone_model, embedder_model
@@ -198,6 +200,3 @@
        "us_cif_peak": {0: "batch_size", 1: "alphas_length"},
    }
def export_backbone_name(self):
    return "model.onnx"
funasr/models/sense_voice/model.py
@@ -73,8 +73,6 @@
    ):
        target_mask = kwargs.get("target_mask", None)
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
@@ -303,8 +301,6 @@
    ):
        target_mask = kwargs.get("target_mask", None)
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
@@ -648,8 +644,6 @@
    ):
        target_mask = kwargs.get("target_mask", None)
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
@@ -1052,8 +1046,6 @@
    ):
        target_mask = kwargs.get("target_mask", None)
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
funasr/models/transformer/model.py
@@ -145,8 +145,6 @@
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
funasr/models/whisper/model.py
@@ -7,7 +7,10 @@
import torch.nn.functional as F
from torch import Tensor
from torch import nn
import whisper
# import whisper_timestamped as whisper
from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.register import tables
@@ -108,7 +111,9 @@
        # decode the audio
        options = whisper.DecodingOptions(**kwargs.get("DecodingOptions", {}))
        result = whisper.decode(self.model, speech, options)
        result = whisper.decode(self.model, speech, language='english')
        # result = whisper.transcribe(self.model, speech)
        results = []
        result_i = {"key": key[0], "text": result.text}
funasr/models/whisper_lid/model.py
@@ -140,8 +140,6 @@
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
funasr/utils/export_utils.py
@@ -1,8 +1,14 @@
import os
import torch
import functools
try:
    import torch_blade
except Exception as e:
    print(f"failed to load torch_blade: {e}")
def export_onnx(model, data_in=None, quantize: bool = False, opset_version: int = 14, **kwargs):
def export(model, data_in=None, quantize: bool = False, opset_version: int = 14, type='onnx', **kwargs):
    model_scripts = model.export(**kwargs)
    export_dir = kwargs.get("output_dir", os.path.dirname(kwargs.get("init_param")))
    os.makedirs(export_dir, exist_ok=True)
@@ -11,6 +17,7 @@
        model_scripts = (model_scripts,)
    for m in model_scripts:
        m.eval()
        if type == 'onnx':
        _onnx(
            m,
            data_in=data_in,
@@ -19,6 +26,23 @@
            export_dir=export_dir,
            **kwargs
        )
        elif type == 'torchscripts':
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            print("Exporting torchscripts on device {}".format(device))
            _torchscripts(
                m,
                path=export_dir,
                device=device
            )
        elif type == "bladedisc":
            assert (
                torch.cuda.is_available()
            ), "Currently bladedisc optimization for FunASR only supports GPU"
            # bladedisc only optimizes encoder/decoder modules
            if hasattr(m, "encoder") and hasattr(m, "decoder"):
                _bladedisc_opt_for_encdec(m, path=export_dir, enable_fp16=True)
            else:
                _torchscripts(m, path=export_dir, device="cuda")
        print("output dir: {}".format(export_dir))
    return export_dir
@@ -37,7 +61,7 @@
    verbose = kwargs.get("verbose", False)
    export_name = model.export_name() if hasattr(model, "export_name") else "model.onnx"
    export_name = model.export_name + '.onnx'
    model_path = os.path.join(export_dir, export_name)
    torch.onnx.export(
        model,
@@ -70,3 +94,106 @@
                weight_type=QuantType.QUInt8,
                nodes_to_exclude=nodes_to_exclude,
            )
def _torchscripts(model, path, device='cuda'):
    """Trace ``model`` with its own dummy inputs and save the TorchScript file.

    Args:
        model: module exposing ``export_dummy_inputs()`` and an ``export_name``
            attribute (used as the output file stem).
        path: directory the ``<export_name>.torchscripts`` file is written to.
        device: when equal to ``'cuda'`` the model and the dummy inputs are
            moved to the GPU before tracing; any other value leaves them as is.
    """
    example_inputs = model.export_dummy_inputs()

    if device == 'cuda':
        model = model.cuda()
        # Dummy inputs are either a single tensor or an iterable of tensors.
        if isinstance(example_inputs, torch.Tensor):
            example_inputs = example_inputs.cuda()
        else:
            example_inputs = tuple(t.cuda() for t in example_inputs)

    traced = torch.jit.trace(model, example_inputs)
    target_file = os.path.join(path, f'{model.export_name}.torchscripts')
    traced.save(target_file)
def _bladedisc_opt(model, model_inputs, enable_fp16=True):
    """Optimize ``model`` with torch_blade using ``model_inputs`` as example inputs.

    Args:
        model: module to optimize; it is switched to eval mode first.
        model_inputs: example inputs forwarded to ``torch_blade.optimize``.
        enable_fp16: value assigned to the torch_blade config's ``enable_fp16``.

    Returns:
        The object returned by ``torch_blade.optimize``.
    """
    model = model.eval()
    blade_config = torch_blade.config.Config()
    blade_config.enable_fp16 = enable_fp16
    # The config object is used as a context manager around the optimize call.
    with torch.no_grad(), blade_config:
        return torch_blade.optimize(
            model,
            allow_tracing=True,
            model_inputs=model_inputs,
        )
def _rescale_input_hook(m, x, scale):
    """Forward pre-hook: divide the first positional input by ``scale``.

    Args:
        m: the hooked module (unused).
        x: sequence of positional inputs to the module.
        scale: divisor applied to ``x[0]``.

    Returns:
        A tuple with ``x[0] / scale`` followed by the remaining inputs unchanged.
    """
    # Unpacking an empty tail yields a 1-tuple, so one expression covers both cases.
    return (x[0] / scale, *x[1:])
def _rescale_output_hook(m, x, y, scale):
    """Forward hook: divide the module output (or its first element) by ``scale``.

    Args:
        m: the hooked module (unused).
        x: the module inputs (unused).
        y: the module output; either a single value or a tuple.
        scale: divisor applied to the output.

    Returns:
        ``y / scale`` for a non-tuple output, otherwise a tuple whose first
        element is divided by ``scale`` and whose other elements are unchanged.
    """
    if not isinstance(y, tuple):
        return y / scale
    scaled_head = y[0] / scale
    return (scaled_head, *y[1:])
def _rescale_encoder_model(model, input_data):
    """Rescale encoder activations in place so they are less likely to overflow in FP16.

    One forward pass on ``input_data`` records the largest absolute input seen
    by the layers in ``model.encoder.model.encoders``. A scale factor is then
    derived from it and installed as hooks:

    * a pre-hook on ``encoders0`` divides its first input by the factor,
    * a forward hook on every ``*self_attn`` module divides its output,
    * the parameters of every ``*feed_forward.w_2`` module are divided.

    When the measured maximum is small the derived factor is 0 or 1. Dividing
    by 0 would turn every activation and weight into inf/nan, and dividing by 1
    is a no-op, so in both cases the model is left untouched.

    Args:
        model: model whose ``encoder.model`` holds ``encoders`` and ``encoders0``;
            it is moved to CUDA.
        input_data: positional inputs for one calibration forward pass.
    """
    # Calculate absmax
    absmax = torch.tensor(0).cuda()

    def stat_input_hook(m, x, y):
        val = x[0] if isinstance(x, tuple) else x
        absmax.copy_(torch.max(absmax, val.detach().abs().max()))

    encoders = model.encoder.model.encoders
    hooks = [m.register_forward_hook(stat_input_hook) for m in encoders]
    model = model.cuda()
    try:
        model(*input_data)
    finally:
        # Always detach the statistics hooks, even if the calibration pass fails.
        for h in hooks:
            h.remove()

    # Rescale encoder modules
    fp16_scale = int(2 * absmax // 65536)
    print(f"rescale encoder modules with factor={fp16_scale}")
    if fp16_scale <= 1:
        # 0 would divide by zero; 1 changes nothing. No rescaling is needed.
        return

    model.encoder.model.encoders0.register_forward_pre_hook(
        functools.partial(_rescale_input_hook, scale=fp16_scale),
    )
    for name, m in model.encoder.model.named_modules():
        if name.endswith("self_attn"):
            m.register_forward_hook(
                functools.partial(_rescale_output_hook, scale=fp16_scale)
            )
        if name.endswith("feed_forward.w_2"):
            state_dict = {k: v / fp16_scale for k, v in m.state_dict().items()}
            m.load_state_dict(state_dict)
def _bladedisc_opt_for_encdec(model, path, enable_fp16):
    """Optimize the encoder and decoder with torch_blade, then trace and save the model.

    Steps: move the dummy inputs and the model to CUDA, run one forward pass to
    capture the positional inputs the decoder receives, optionally rescale the
    encoder for FP16, optimize ``model.encoder`` and ``model.decoder``
    separately, and finally trace the whole model and write
    ``<export_name>_blade.torchscripts`` into ``path``.

    Args:
        model: model exposing ``export_dummy_inputs()``, ``export_name``,
            ``encoder`` and ``decoder``.
        path: output directory for the saved TorchScript file.
        enable_fp16: whether to rescale the encoder against FP16 overflow and
            to enable FP16 in the torch_blade optimization.
    """
    # Get input data
    # TODO: better to use real data
    input_data = model.export_dummy_inputs()
    if isinstance(input_data, torch.Tensor):
        input_data = input_data.cuda()
    else:
        input_data = tuple(i.cuda() for i in input_data)

    # Get input data for decoder module
    decoder_inputs = []

    def get_input_hook(m, x):
        decoder_inputs.extend(list(x))

    hook = model.decoder.register_forward_pre_hook(get_input_hook)
    model = model.cuda()
    try:
        model(*input_data)
    finally:
        # Do not leave the capture hook attached if the forward pass fails.
        hook.remove()

    # Prevent FP16 overflow
    if enable_fp16:
        _rescale_encoder_model(model, input_data)

    # Export and optimize encoder/decoder modules.
    # The caller's FP16 choice is forwarded: previously the optimizer always ran
    # with its default (FP16 on), even when rescaling had been skipped.
    # The encoder is optimized with the first two dummy inputs only.
    model.encoder = _bladedisc_opt(model.encoder, input_data[:2], enable_fp16=enable_fp16)
    model.decoder = _bladedisc_opt(model.decoder, tuple(decoder_inputs), enable_fp16=enable_fp16)
    model_script = torch.jit.trace(model, input_data)
    model_script.save(os.path.join(path, f"{model.export_name}_blade.torchscripts"))
runtime/python/libtorch/demo.py
File was deleted
runtime/python/libtorch/demo_contextual_paraformer.py
New file
@@ -0,0 +1,13 @@
# Demo: hotword-biased recognition with the TorchScript ContextualParaformer runtime.
import torch
from pathlib import Path
from funasr_torch.paraformer_bin import ContextualParaformer
# Model identifier passed to the runtime; also used below to build the sample wav path.
model_dir = "iic/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
# Use GPU 0 when CUDA is available, otherwise -1 (CPU).
device_id = 0 if torch.cuda.is_available() else -1
model = ContextualParaformer(model_dir, batch_size=1, device_id=device_id)  # device chosen above
# Sample audio under ~/.cache/modelscope/hub/<model_dir>/example/
# NOTE(review): presumably the default modelscope cache location — confirm if cache_dir is customized.
wav_path = "{}/.cache/modelscope/hub/{}/example/asr_example.wav".format(Path.home(), model_dir)
# Hotwords are given as one space-separated string.
hotwords = "你的热词 魔搭"
result = model(wav_path, hotwords)
print(result)
runtime/python/libtorch/demo_paraformer.py
New file
@@ -0,0 +1,11 @@
# Demo: recognition with the TorchScript Paraformer runtime.
from pathlib import Path
from funasr_torch.paraformer_bin import Paraformer
# Model identifier passed to the runtime; also used below to build the sample wav path.
model_dir = "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1)  # cpu
# model = Paraformer(model_dir, batch_size=1, device_id=0)  # gpu
# Sample audio under ~/.cache/modelscope/hub/<model_dir>/example/
# NOTE(review): presumably the default modelscope cache location — confirm if cache_dir is customized.
wav_path = "{}/.cache/modelscope/hub/{}/example/asr_example.wav".format(Path.home(), model_dir)
result = model(wav_path)
print(result)
runtime/python/libtorch/demo_seaco_paraformer.py
New file
@@ -0,0 +1,13 @@
# Demo: hotword-biased recognition with the TorchScript SeacoParaformer runtime.
import torch
from pathlib import Path
from funasr_torch.paraformer_bin import SeacoParaformer
# Model identifier passed to the runtime; also used below to build the sample wav path.
model_dir = "iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
# Use GPU 0 when CUDA is available, otherwise -1 (CPU).
device_id = 0 if torch.cuda.is_available() else -1
model = SeacoParaformer(model_dir, batch_size=1, device_id=device_id)  # device chosen above
# Sample audio under ~/.cache/modelscope/hub/<model_dir>/example/
# NOTE(review): presumably the default modelscope cache location — confirm if cache_dir is customized.
wav_path = "{}/.cache/modelscope/hub/{}/example/asr_example.wav".format(Path.home(), model_dir)
# Hotwords are given as one space-separated string.
hotwords = "你的热词 魔搭"
result = model(wav_path, hotwords)
print(result)
runtime/python/libtorch/funasr_torch/paraformer_bin.py
@@ -1,23 +1,29 @@
# -*- encoding: utf-8 -*-
import json
import copy
import torch
import os.path
import librosa
import numpy as np
from pathlib import Path
from typing import List, Union, Tuple
import copy
import librosa
import numpy as np
from .utils.utils import CharTokenizer, Hypothesis, TokenIDConverter, get_logger, read_yaml
from .utils.postprocess_utils import sentence_postprocess
from .utils.utils import pad_list
from .utils.frontend import WavFrontend
from .utils.timestamp_utils import time_stamp_lfr6_onnx
from .utils.postprocess_utils import sentence_postprocess
from .utils.utils import CharTokenizer, Hypothesis, TokenIDConverter, get_logger, read_yaml
logging = get_logger()
import torch
class Paraformer:
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    """
    def __init__(
        self,
        model_dir: Union[str, Path] = None,
@@ -25,20 +31,42 @@
        device_id: Union[str, int] = "-1",
        plot_timestamp_to: str = "",
        quantize: bool = False,
        intra_op_num_threads: int = 1,
        cache_dir: str = None,
        **kwargs,
    ):
        if not Path(model_dir).exists():
            raise FileNotFoundError(f"{model_dir} does not exist.")
            try:
                from modelscope.hub.snapshot_download import snapshot_download
            except:
                raise "You are exporting model from modelscope, please install modelscope and try it again. To install modelscope, you could:\n" "\npip3 install -U modelscope\n" "For the users in China, you could install with the command:\n" "\npip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple"
            try:
                model_dir = snapshot_download(model_dir, cache_dir=cache_dir)
            except:
                raise "model_dir must be model_name in modelscope or local path downloaded from modelscope, but is {}".format(
                    model_dir
                )
        model_file = os.path.join(model_dir, "model.torchscripts")
        if quantize:
            model_file = os.path.join(model_dir, "model_quant.torchscripts")
        if not os.path.exists(model_file):
            print(".torchscripts does not exist, begin to export torchscripts")
            try:
                from funasr import AutoModel
            except:
                raise "You are exporting onnx, please install funasr and try it again. To install funasr, you could:\n" "\npip3 install -U funasr\n" "For the users in China, you could install with the command:\n" "\npip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple"
            model = AutoModel(model=model_dir)
            model_dir = model.export(type="torchscript", quantize=quantize, **kwargs)
        config_file = os.path.join(model_dir, "config.yaml")
        cmvn_file = os.path.join(model_dir, "am.mvn")
        config = read_yaml(config_file)
        token_list = os.path.join(model_dir, "tokens.json")
        with open(token_list, "r", encoding="utf-8") as f:
            token_list = json.load(f)
        self.converter = TokenIDConverter(config["token_list"])
        self.converter = TokenIDConverter(token_list)
        self.tokenizer = CharTokenizer()
        self.frontend = WavFrontend(cmvn_file=cmvn_file, **config["frontend_conf"])
        self.ort_infer = torch.jit.load(model_file)
@@ -49,6 +77,10 @@
            self.pred_bias = config["model_conf"]["predictor_bias"]
        else:
            self.pred_bias = 0
        if "lang" in config:
            self.language = config["lang"]
        else:
            self.language = None
    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
@@ -203,3 +235,186 @@
        token = token[: valid_token_num - self.pred_bias]
        # texts = sentence_postprocess(token)
        return token
class ContextualParaformer(Paraformer):
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317

    TorchScript runtime for the hotword-biased Paraformer. Two scripted modules
    are loaded: an embedder (``model_eb``) that turns hotword ids into bias
    embeddings, and a backbone (``model_bb``) that consumes features plus those
    embeddings.
    """

    def __init__(
        self,
        model_dir: Union[str, Path] = None,
        batch_size: int = 1,
        device_id: Union[str, int] = "-1",
        plot_timestamp_to: str = "",
        quantize: bool = False,
        cache_dir: str = None,
        **kwargs,
    ):
        """Resolve the model directory, export TorchScript files if missing, and load them.

        Args:
            model_dir: local directory, or a model name downloaded through modelscope
                when no such local path exists.
            batch_size: number of waveforms processed per backbone call.
            device_id: ``-1`` (int or str) runs on CPU; any other value moves
                inputs to CUDA at call time.
            plot_timestamp_to: stored on the instance as ``plot_timestamp_to``.
            quantize: select the ``*_quant.torchscripts`` file names.
            cache_dir: forwarded to ``snapshot_download``.
            **kwargs: forwarded to ``AutoModel.export`` when an export is needed.

        Raises:
            ImportError: modelscope or funasr is required but not installed.
            ValueError: ``model_dir`` is neither a local path nor downloadable.
        """
        if not Path(model_dir).exists():
            try:
                from modelscope.hub.snapshot_download import snapshot_download
            except ImportError as err:
                # Raising a plain string is itself a TypeError in Python 3 and
                # hides this message, so a real exception type is used.
                raise ImportError(
                    "You are exporting model from modelscope, please install modelscope and try it again. To install modelscope, you could:\n"
                    "\npip3 install -U modelscope\n"
                    "For the users in China, you could install with the command:\n"
                    "\npip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                ) from err
            try:
                model_dir = snapshot_download(model_dir, cache_dir=cache_dir)
            except Exception as err:
                raise ValueError(
                    "model_dir must be model_name in modelscope or local path downloaded from modelscope, but is {}".format(
                        model_dir
                    )
                ) from err

        # NOTE(review): the export step may write the backbone under a different
        # file stem than "model_bb" — confirm these names match what export produces.
        if quantize:
            model_bb_file = os.path.join(model_dir, "model_bb_quant.torchscripts")
            model_eb_file = os.path.join(model_dir, "model_eb_quant.torchscripts")
        else:
            model_bb_file = os.path.join(model_dir, "model_bb.torchscripts")
            model_eb_file = os.path.join(model_dir, "model_eb.torchscripts")

        if not (os.path.exists(model_eb_file) and os.path.exists(model_bb_file)):
            print(".torchscripts does not exist, begin to export torchscripts")
            try:
                from funasr import AutoModel
            except ImportError as err:
                raise ImportError(
                    "You are exporting torchscripts, please install funasr and try it again. To install funasr, you could:\n"
                    "\npip3 install -U funasr\n"
                    "For the users in China, you could install with the command:\n"
                    "\npip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple"
                ) from err

            model = AutoModel(model=model_dir)
            model_dir = model.export(type="torchscripts", quantize=quantize, **kwargs)

        config_file = os.path.join(model_dir, "config.yaml")
        cmvn_file = os.path.join(model_dir, "am.mvn")
        config = read_yaml(config_file)
        token_list = os.path.join(model_dir, "tokens.json")
        with open(token_list, "r", encoding="utf-8") as f:
            token_list = json.load(f)

        # revert token_list into vocab dict (token -> id)
        self.vocab = {token: i for i, token in enumerate(token_list)}

        self.converter = TokenIDConverter(token_list)
        self.tokenizer = CharTokenizer()
        self.frontend = WavFrontend(cmvn_file=cmvn_file, **config["frontend_conf"])
        self.ort_infer_bb = torch.jit.load(model_bb_file)
        self.ort_infer_eb = torch.jit.load(model_eb_file)
        self.device_id = device_id
        self.batch_size = batch_size
        self.plot_timestamp_to = plot_timestamp_to
        if "predictor_bias" in config["model_conf"].keys():
            self.pred_bias = config["model_conf"]["predictor_bias"]
        else:
            self.pred_bias = 0

    def __call__(
        self, wav_content: Union[str, np.ndarray, List[str]], hotwords: str, **kwargs
    ) -> List:
        """Recognize ``wav_content`` with hotword biasing.

        Args:
            wav_content: input accepted by ``self.load_data``.
            hotwords: space-separated hotword string.

        Returns:
            A list of ``{"preds": ...}`` dicts, one per successfully decoded
            utterance. Batches whose inference raises are logged and skipped.
        """
        # make hotword list
        hotwords, hotwords_length = self.proc_hotword(hotwords)
        if int(self.device_id) != -1:
            bias_embed = self.eb_infer(hotwords.cuda())
        else:
            bias_embed = self.eb_infer(hotwords)
        # index from bias_embed: for each hotword pick the embedding at its last character
        bias_embed = torch.transpose(bias_embed, 0, 1)
        _ind = np.arange(0, len(hotwords)).tolist()
        bias_embed = bias_embed[_ind, hotwords_length.tolist()]
        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
        waveform_nums = len(waveform_list)
        asr_res = []
        for beg_idx in range(0, waveform_nums, self.batch_size):
            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
            # Expand into a per-batch variable. Overwriting ``bias_embed`` here
            # would add another leading dimension on every later batch and
            # make ``repeat`` fail from the second batch on.
            batch_bias_embed = torch.unsqueeze(bias_embed, 0).repeat(feats.shape[0], 1, 1)
            try:
                with torch.no_grad():
                    if int(self.device_id) == -1:
                        outputs = self.bb_infer(feats, feats_len, batch_bias_embed)
                        am_scores, valid_token_lens = outputs[0], outputs[1]
                    else:
                        outputs = self.bb_infer(
                            feats.cuda(), feats_len.cuda(), batch_bias_embed.cuda()
                        )
                        am_scores, valid_token_lens = outputs[0].cpu(), outputs[1].cpu()
            except Exception:
                # Best effort: a failing batch is reported and produces no result entry.
                logging.warning("input wav is silence or noise")
            else:
                preds = self.decode(am_scores, valid_token_lens)
                for pred in preds:
                    pred = sentence_postprocess(pred)
                    asr_res.append({"preds": pred})
        return asr_res

    def proc_hotword(self, hotwords):
        """Convert a space-separated hotword string into padded id rows.

        Returns:
            A tuple ``(ids, last_index)`` where ``ids`` is a tensor built from
            ``pad_list(..., pad_value=0, max_len=10)`` with one extra row
            ``[1]`` appended after the hotwords, and ``last_index`` is a numpy
            array holding ``len(word) - 1`` per hotword plus ``0`` for that
            extra row.
        """
        hotwords = hotwords.split(" ")
        hotwords_length = [len(i) - 1 for i in hotwords]
        hotwords_length.append(0)
        hotwords_length = np.array(hotwords_length)

        # hotwords.append('<s>')
        def word_map(word):
            ids = []
            for c in word:
                if c not in self.vocab:
                    # 8403 is the id substituted for out-of-vocabulary characters
                    # (reported as <unk> in the warning below).
                    ids.append(8403)
                    logging.warning(
                        "oov character {} found in hotword {}, replaced by <unk>".format(c, word)
                    )
                else:
                    ids.append(self.vocab[c])
            return np.array(ids)

        hotword_int = [word_map(i) for i in hotwords]
        hotword_int.append(np.array([1]))
        hotwords = pad_list(hotword_int, pad_value=0, max_len=10)
        return torch.tensor(hotwords), hotwords_length

    def bb_infer(
        self, feats, feats_len, bias_embed
    ):
        """Run the scripted backbone on features, their lengths and the bias embeddings."""
        outputs = self.ort_infer_bb(feats, feats_len, bias_embed)
        return outputs

    def eb_infer(self, hotwords):
        """Run the scripted embedder on the hotword ids (cast to int64)."""
        outputs = self.ort_infer_eb(hotwords.long())
        return outputs

    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
        """Decode every utterance in the batch with ``decode_one``."""
        return [
            self.decode_one(am_score, token_num)
            for am_score, token_num in zip(am_scores, token_nums)
        ]

    def decode_one(self, am_score: np.ndarray, valid_token_num: int) -> List[str]:
        """Greedy-decode one utterance: per-frame argmax, drop ids 0 and 2, map to tokens.

        The token list is cut to ``valid_token_num - self.pred_bias`` entries.
        """
        yseq = am_score.argmax(axis=-1)
        score = am_score.max(axis=-1)
        score = np.sum(score, axis=-1)
        # pad with mask tokens to ensure compatibility with sos/eos tokens
        # asr_model.sos:1  asr_model.eos:2
        yseq = np.array([1] + yseq.tolist() + [2])
        hyp = Hypothesis(yseq=yseq, score=score)
        # remove sos/eos and get results
        last_pos = -1
        token_int = hyp.yseq[1:last_pos].tolist()
        # remove blank symbol id, which is assumed to be 0
        token_int = list(filter(lambda x: x not in (0, 2), token_int))
        # Change integer-ids to tokens
        token = self.converter.ids2tokens(token_int)
        token = token[: valid_token_num - self.pred_bias]
        # texts = sentence_postprocess(token)
        return token
class SeacoParaformer(ContextualParaformer):
    """Seaco Paraformer runtime; construction and inference are inherited unchanged from ContextualParaformer."""
    def __init__(self, *args, **kwargs):
        # All arguments are forwarded as-is to ContextualParaformer.__init__.
        super().__init__(*args, **kwargs)
        # no difference with contextual_paraformer in how the exported models are called
runtime/python/libtorch/funasr_torch/utils/timestamp_utils.py
@@ -7,7 +7,7 @@
    START_END_THRESHOLD = 5
    MAX_TOKEN_DURATION = 30
    TIME_RATE = 10.0 * 6 / 1000 / 3  #  3 times upsampled
    cif_peak = us_cif_peak.reshape(-1)
    cif_peak = us_cif_peak.reshape(-1).cpu()
    num_frames = cif_peak.shape[-1]
    if char_list[-1] == "</s>":
        char_list = char_list[:-1]
runtime/python/libtorch/funasr_torch/utils/utils.py
@@ -1,21 +1,25 @@
# -*- encoding: utf-8 -*-
import functools
import yaml
import logging
import pickle
import functools
import numpy as np
from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
import numpy as np
import yaml
import warnings
root_dir = Path(__file__).resolve().parent
logger_initialized = {}
def pad_list(xs, pad_value, max_len=None):
    """Stack variable-length 1-D sequences into one padded int32 array.

    Args:
        xs: Sequence of 1-D array-likes exposing ``.shape`` (e.g. NumPy
            arrays).
        pad_value: Value written to every position not covered by a
            sequence.
        max_len: Width of the output. When ``None`` it is the length of
            the longest sequence in ``xs`` (0 for an empty batch).

    Returns:
        ``np.ndarray`` of shape ``(len(xs), max_len)`` and dtype
        ``int32``; row ``i`` holds ``xs[i]`` followed by ``pad_value``.

    Raises:
        ValueError: If a sequence is longer than ``max_len`` (NumPy cannot
            fit it into the row slice).
    """
    n_batch = len(xs)
    if max_len is None:
        # Use ``.shape[0]`` rather than ``.size(0)``: ``ndarray.size`` is an
        # int attribute, so the call form fails on NumPy inputs.
        # ``default=0`` keeps an empty batch from crashing ``max``.
        max_len = max((x.shape[0] for x in xs), default=0)

    pad = np.full((n_batch, max_len), pad_value, dtype=np.int32)
    for i, x in enumerate(xs):
        pad[i, : x.shape[0]] = x
    return pad
class TokenIDConverter:
    def __init__(
runtime/python/onnxruntime/funasr_onnx/paraformer_bin.py
@@ -62,7 +62,7 @@
        if quantize:
            model_file = os.path.join(model_dir, "model_quant.onnx")
        if not os.path.exists(model_file):
            print(".onnx is not exist, begin to export onnx")
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except:
@@ -285,7 +285,7 @@
            model_eb_file = os.path.join(model_dir, "model_eb.onnx")
        if not (os.path.exists(model_eb_file) and os.path.exists(model_bb_file)):
            print(".onnx is not exist, begin to export onnx")
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except:
@@ -331,7 +331,6 @@
    # ) -> List:
        # make hotword list
        hotwords, hotwords_length = self.proc_hotword(hotwords)
        # import pdb; pdb.set_trace()
        [bias_embed] = self.eb_infer(hotwords, hotwords_length)
        # index from bias_embed
        bias_embed = bias_embed.transpose(1, 0, 2)
@@ -411,10 +410,10 @@
            return np.array(hotwords)
        hotword_int = [word_map(i) for i in hotwords]
        # import pdb; pdb.set_trace()
        hotword_int.append(np.array([1]))
        hotwords = pad_list(hotword_int, pad_value=0, max_len=10)
        # import pdb; pdb.set_trace()
        return hotwords, hotwords_length
    def bb_infer(
runtime/python/onnxruntime/funasr_onnx/paraformer_online_bin.py
@@ -54,7 +54,7 @@
            encoder_model_file = os.path.join(model_dir, "model_quant.onnx")
            decoder_model_file = os.path.join(model_dir, "decoder_quant.onnx")
        if not os.path.exists(encoder_model_file) or not os.path.exists(decoder_model_file):
            print(".onnx is not exist, begin to export onnx")
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except:
runtime/python/onnxruntime/funasr_onnx/punc_bin.py
@@ -52,7 +52,7 @@
        if quantize:
            model_file = os.path.join(model_dir, "model_quant.onnx")
        if not os.path.exists(model_file):
            print(".onnx is not exist, begin to export onnx")
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except:
runtime/python/onnxruntime/funasr_onnx/vad_bin.py
@@ -52,7 +52,7 @@
        if quantize:
            model_file = os.path.join(model_dir, "model_quant.onnx")
        if not os.path.exists(model_file):
            print(".onnx is not exist, begin to export onnx")
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except:
@@ -221,7 +221,7 @@
        if quantize:
            model_file = os.path.join(model_dir, "model_quant.onnx")
        if not os.path.exists(model_file):
            print(".onnx is not exist, begin to export onnx")
            print(".onnx does not exist, begin to export onnx")
            try:
                from funasr import AutoModel
            except: