python/FunASR-XL.git

parent: d6d1ca1f | 补丁 | 提交 | ignore whitespace

游雁

2023-02-13 1aba91a4b8c60926d97de18ec2e9470c11d416b7

export model

5个文件已修改

4个文件已删除

1个文件已添加

	.gitignore	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py	3 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/LICENSE	201 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/__init__.py	3 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/feature.py	459 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/ivector.py	43 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py	5 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt	3 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py	137 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py	123 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 .gitignore

@@ -6,3 +6,4 @@
.DS_Store
init_model/
*.tar.gz
test_local/

 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py

@@ -1,4 +1,3 @@
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com
from .paraformer_onnx import Paraformer
# @Contact: liekkaskono@163.com

 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/LICENSE

File was deleted

 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/__init__.py

File was deleted

 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/feature.py

File was deleted

 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/ivector.py

File was deleted

 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py

@@ -10,9 +10,10 @@
import numpy as np

from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
                    OrtInferSession, TokenIDConverter, WavFrontend, get_logger,
                    OrtInferSession, TokenIDConverter, get_logger,
                    read_yaml)
from .utils.postprocess_utils import sentence_postprocess
from .utils.frontend import WavFrontend

logging = get_logger()

@@ -65,7 +66,7 @@
                  wav_content: Union[str, np.ndarray, List[str]]) -> List:
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=None)
            return waveform[None, ...]
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt

@@ -2,4 +2,5 @@
numpy
onnxruntime
scipy
typeguard
typeguard
kaldi-native-fbank

 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py

New file
@@ -0,0 +1,137 @@
# -*- encoding: utf-8 -*-
from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

import numpy as np
from typeguard import check_argument_types
import kaldi_native_fbank as knf

root_dir = Path(__file__).resolve().parent

logger_initialized = {}


class WavFrontend():
    """Feature frontend for ASR: fbank extraction, optional LFR stacking, CMVN.

    Fbank features are computed with ``kaldi_native_fbank``. ``lfr_cmvn`` then
    optionally stacks consecutive frames (``lfr_m`` frames per output row,
    advancing ``lfr_n`` frames per row) and applies the shift/scale statistics
    read from ``cmvn_file``.
    """

    def __init__(
            self,
            cmvn_file: Union[str, None] = None,
            fs: int = 16000,
            window: str = 'hamming',
            n_mels: int = 80,
            frame_length: int = 25,
            frame_shift: int = 10,
            filter_length_min: int = -1,
            filter_length_max: float = -1,
            lfr_m: int = 1,
            lfr_n: int = 1,
            dither: float = 1.0
    ) -> None:
        """Configure the fbank options and load CMVN statistics if requested.

        Args:
            cmvn_file: Path to a text CMVN file understood by ``load_cmvn``.
                When falsy, no CMVN is applied.
            fs: Sampling frequency handed to the fbank extractor.
            window: Window type name handed to the fbank extractor.
            n_mels: Number of mel bins.
            frame_length: Frame length in milliseconds. The default is an
                ``int`` so that it satisfies its own annotation when the
                arguments are type-checked.
            frame_shift: Frame shift in milliseconds.
            filter_length_min: Stored on the instance; not used by this class.
            filter_length_max: Stored on the instance; not used by this class.
            lfr_m: Number of frames stacked into one LFR output row.
            lfr_n: Hop, in frames, between consecutive LFR output rows.
            dither: Dither value handed to the fbank extractor.
        """
        check_argument_types()

        opts = knf.FbankOptions()
        opts.frame_opts.samp_freq = fs
        opts.frame_opts.dither = dither
        opts.frame_opts.window_type = window
        opts.frame_opts.frame_shift_ms = float(frame_shift)
        opts.frame_opts.frame_length_ms = float(frame_length)
        opts.mel_opts.num_bins = n_mels
        opts.energy_floor = 0
        opts.frame_opts.snip_edges = True
        opts.mel_opts.debug_mel = False
        self.opts = opts

        # Kept only so existing code that reads this attribute keeps working;
        # ``fbank`` builds its own extractor for every call (see there).
        self.compute_fbank_feats = knf.OnlineFbank(self.opts)

        self.filter_length_min = filter_length_min
        self.filter_length_max = filter_length_max
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.cmvn_file = cmvn_file

        if self.cmvn_file:
            self.cmvn = self.load_cmvn()

    def fbank(self,
              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Compute fbank features for one waveform.

        Args:
            waveform: Audio samples; they are multiplied by ``1 << 15`` before
                extraction (presumably floats in [-1, 1] -- confirm with the
                caller). Assumed 1-D, since ``tolist()`` is passed directly to
                the extractor.

        Returns:
            ``(feat, feat_len)``: ``feat`` is a float32 array with one row per
            ready frame and ``num_bins`` columns; ``feat_len`` is the number
            of rows as a 0-d int32 array.
        """
        waveform = waveform * (1 << 15)
        # The online extractor accumulates every waveform it is given, so a
        # shared instance would mix frames from previous calls into this
        # result. A fresh extractor keeps each call independent.
        fbank_fn = knf.OnlineFbank(self.opts)
        fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
        frames = fbank_fn.num_frames_ready
        mat = np.empty([frames, self.opts.mel_opts.num_bins])
        for i in range(frames):
            mat[i, :] = fbank_fn.get_frame(i)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Apply LFR stacking (unless ``lfr_m == lfr_n == 1``) and then CMVN.

        Args:
            feat: 2-D feature matrix, one frame per row.

        Returns:
            ``(feat, feat_len)`` where ``feat_len`` is the resulting number of
            rows as a 0-d int32 array.
        """
        if self.lfr_m != 1 or self.lfr_n != 1:
            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)

        if self.cmvn_file:
            feat = self.apply_cmvn(feat)

        feat_len = np.array(feat.shape[0]).astype(np.int32)
        return feat, feat_len

    @staticmethod
    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
        """Stack ``lfr_m`` consecutive frames, advancing ``lfr_n`` frames per row.

        The input is left-padded with ``(lfr_m - 1) // 2`` copies of its first
        frame; a final window that runs past the end is completed with copies
        of the last frame.

        Args:
            inputs: 2-D array, one frame per row; must contain at least one row.
            lfr_m: Number of frames concatenated into each output row.
            lfr_n: Number of input frames skipped between output rows.

        Returns:
            float32 array with ``ceil(T / lfr_n)`` rows (``T`` being the number
            of input rows) and ``lfr_m * dim`` columns.
        """
        LFR_inputs = []

        T = inputs.shape[0]
        # The number of output rows is fixed from the unpadded length.
        T_lfr = int(np.ceil(T / lfr_n))
        left_pad = (lfr_m - 1) // 2
        left_padding = np.tile(inputs[0], (left_pad, 1))
        inputs = np.vstack((left_padding, inputs))
        T = T + left_pad
        for i in range(T_lfr):
            start = i * lfr_n
            if lfr_m <= T - start:
                LFR_inputs.append(
                    inputs[start:start + lfr_m].reshape(1, -1))
            else:
                # Last window is short: fill it up by repeating the last frame.
                num_padding = lfr_m - (T - start)
                frame = np.hstack((inputs[start:].reshape(-1),
                                   np.tile(inputs[-1], num_padding)))
                LFR_inputs.append(frame)
        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
        return LFR_outputs

    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
        """Apply the loaded CMVN statistics as ``(inputs + shift) * scale``.

        Row 0 of ``self.cmvn`` is added to every frame and row 1 multiplies the
        result; only the first ``dim`` columns of the statistics are used.

        Args:
            inputs: 2-D feature matrix of shape ``(frame, dim)``.

        Returns:
            Array of the same shape as ``inputs``.
        """
        _, dim = inputs.shape
        # (1, dim) rows broadcast over all frames; no explicit tiling needed.
        shift = self.cmvn[0:1, :dim]
        scale = self.cmvn[1:2, :dim]
        return (inputs + shift) * scale

    def load_cmvn(self,) -> np.ndarray:
        """Read the shift and scale vectors from ``self.cmvn_file``.

        The file is scanned for an ``<AddShift>`` line and a ``<Rescale>`` line,
        each immediately followed by a ``<LearnRateCoef>`` line. On that line
        the first three tokens and the final token are dropped and the rest are
        parsed as floats. Blank lines and a header on the very last line are
        ignored instead of raising ``IndexError``.

        Returns:
            float64 array whose row 0 holds the ``<AddShift>`` values and whose
            row 1 holds the ``<Rescale>`` values.
        """
        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        means_list = []
        scales_list = []
        # Walk over (header, following line) pairs so the look-ahead can never
        # run past the end of the file.
        for header, payload in zip(lines, lines[1:]):
            header_items = header.split()
            payload_items = payload.split()
            if not header_items or not payload_items:
                continue
            if payload_items[0] != '<LearnRateCoef>':
                continue
            values = payload_items[3:-1]
            if header_items[0] == '<AddShift>':
                means_list = values
            elif header_items[0] == '<Rescale>':
                scales_list = values

        means = np.array(means_list).astype(np.float64)
        scales = np.array(scales_list).astype(np.float64)
        cmvn = np.array([means, scales])
        return cmvn

 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py

@@ -13,7 +13,6 @@
                         SessionOptions, get_available_providers, get_device)
from typeguard import check_argument_types

from funasr.runtime.python.onnxruntime.paraformer.rapid_paraformer.kaldifeat import compute_fbank_feats
import warnings

root_dir = Path(__file__).resolve().parent
@@ -120,128 +119,6 @@
            f")"
        )


class WavFrontend():
    """Feature frontend for ASR: fbank extraction, optional LFR stacking, CMVN.

    Fbank features come from ``compute_fbank_feats``. ``lfr_cmvn`` then
    optionally stacks consecutive frames (``lfr_m`` frames per output row,
    advancing ``lfr_n`` frames per row) and applies the shift/scale statistics
    read from ``cmvn_file``.
    """

    def __init__(
            self,
            cmvn_file: str = None,
            fs: int = 16000,
            window: str = 'hamming',
            n_mels: int = 80,
            frame_length: int = 25,
            frame_shift: int = 10,
            filter_length_min: int = -1,
            filter_length_max: float = -1,
            lfr_m: int = 1,
            lfr_n: int = 1,
            dither: float = 1.0
    ) -> None:
        """Store the extraction settings and load CMVN statistics if requested.

        Args:
            cmvn_file: Path to a text CMVN file understood by ``load_cmvn``.
                When falsy, no CMVN is applied.
            fs: Sample frequency passed to ``compute_fbank_feats``.
            window: Window type passed to ``compute_fbank_feats``.
            n_mels: Number of mel bins passed to ``compute_fbank_feats``.
            frame_length: Frame length passed to ``compute_fbank_feats``.
            frame_shift: Frame shift passed to ``compute_fbank_feats``.
            filter_length_min: Stored on the instance; not used by this class.
            filter_length_max: Stored on the instance; not used by this class.
            lfr_m: Number of frames stacked into one LFR output row.
            lfr_n: Hop, in frames, between consecutive LFR output rows.
            dither: Dither value passed to ``compute_fbank_feats``.
        """
        check_argument_types()

        self.fs = fs
        self.window = window
        self.n_mels = n_mels
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.filter_length_min = filter_length_min
        self.filter_length_max = filter_length_max
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.cmvn_file = cmvn_file
        self.dither = dither

        if self.cmvn_file:
            self.cmvn = self.load_cmvn()

    def fbank(self,
              input_content: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Compute fbank features for the first row of ``input_content``.

        Args:
            input_content: 2-D array; only row 0 is used. The samples are
                multiplied by ``1 << 15`` before extraction (presumably floats
                in [-1, 1] -- confirm with the caller).

        Returns:
            ``(feat, feat_len)``: ``feat`` is the float32 feature matrix and
            ``feat_len`` its number of rows as a 0-d int32 array.
        """
        waveform_len = input_content.shape[1]
        waveform = input_content[0][:waveform_len]
        waveform = waveform * (1 << 15)
        mat = compute_fbank_feats(waveform,
                                  num_mel_bins=self.n_mels,
                                  frame_length=self.frame_length,
                                  frame_shift=self.frame_shift,
                                  dither=self.dither,
                                  energy_floor=0.0,
                                  sample_frequency=self.fs,
                                  window_type=self.window)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Apply LFR stacking (unless ``lfr_m == lfr_n == 1``) and then CMVN.

        Args:
            feat: 2-D feature matrix, one frame per row.

        Returns:
            ``(feat, feat_len)`` where ``feat_len`` is the resulting number of
            rows as a 0-d int32 array.
        """
        if self.lfr_m != 1 or self.lfr_n != 1:
            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)

        if self.cmvn_file:
            feat = self.apply_cmvn(feat)

        feat_len = np.array(feat.shape[0]).astype(np.int32)
        return feat, feat_len

    @staticmethod
    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
        """Stack ``lfr_m`` consecutive frames, advancing ``lfr_n`` frames per row.

        The input is left-padded with ``(lfr_m - 1) // 2`` copies of its first
        frame; a final window that runs past the end is completed with copies
        of the last frame.

        Args:
            inputs: 2-D array, one frame per row; must contain at least one row.
            lfr_m: Number of frames concatenated into each output row.
            lfr_n: Number of input frames skipped between output rows.

        Returns:
            float32 array with ``ceil(T / lfr_n)`` rows (``T`` being the number
            of input rows) and ``lfr_m * dim`` columns.
        """
        LFR_inputs = []

        T = inputs.shape[0]
        # The number of output rows is fixed from the unpadded length.
        T_lfr = int(np.ceil(T / lfr_n))
        left_pad = (lfr_m - 1) // 2
        left_padding = np.tile(inputs[0], (left_pad, 1))
        inputs = np.vstack((left_padding, inputs))
        T = T + left_pad
        for i in range(T_lfr):
            start = i * lfr_n
            if lfr_m <= T - start:
                LFR_inputs.append(
                    inputs[start:start + lfr_m].reshape(1, -1))
            else:
                # Last window is short: fill it up by repeating the last frame.
                num_padding = lfr_m - (T - start)
                frame = np.hstack((inputs[start:].reshape(-1),
                                   np.tile(inputs[-1], num_padding)))
                LFR_inputs.append(frame)
        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
        return LFR_outputs

    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
        """Apply the loaded CMVN statistics as ``(inputs + shift) * scale``.

        Row 0 of ``self.cmvn`` is added to every frame and row 1 multiplies the
        result; only the first ``dim`` columns of the statistics are used.

        Args:
            inputs: 2-D feature matrix of shape ``(frame, dim)``.

        Returns:
            Array of the same shape as ``inputs``.
        """
        _, dim = inputs.shape
        # (1, dim) rows broadcast over all frames; no explicit tiling needed.
        shift = self.cmvn[0:1, :dim]
        scale = self.cmvn[1:2, :dim]
        return (inputs + shift) * scale

    def load_cmvn(self,) -> np.ndarray:
        """Read the shift and scale vectors from ``self.cmvn_file``.

        The file is scanned for an ``<AddShift>`` line and a ``<Rescale>`` line,
        each immediately followed by a ``<LearnRateCoef>`` line. On that line
        the first three tokens and the final token are dropped and the rest are
        parsed as floats. Blank lines and a header on the very last line are
        ignored instead of raising ``IndexError``.

        Returns:
            float64 array whose row 0 holds the ``<AddShift>`` values and whose
            row 1 holds the ``<Rescale>`` values.
        """
        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        means_list = []
        scales_list = []
        # Walk over (header, following line) pairs so the look-ahead can never
        # run past the end of the file.
        for header, payload in zip(lines, lines[1:]):
            header_items = header.split()
            payload_items = payload.split()
            if not header_items or not payload_items:
                continue
            if payload_items[0] != '<LearnRateCoef>':
                continue
            values = payload_items[3:-1]
            if header_items[0] == '<AddShift>':
                means_list = values
            elif header_items[0] == '<Rescale>':
                scales_list = values

        means = np.array(means_list).astype(np.float64)
        scales = np.array(scales_list).astype(np.float64)
        cmvn = np.array([means, scales])
        return cmvn


class Hypothesis(NamedTuple):

			@@ -1,4 +1,3 @@
			# -*- encoding: utf-8 -*-
			# @Author: SWHL
			# @Contact: liekkaskono@163.com
			from .paraformer_onnx import Paraformer
			# @Contact: liekkaskono@163.com

			@@ -10,9 +10,10 @@
			import numpy as np

			from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
			OrtInferSession, TokenIDConverter, WavFrontend, get_logger,
			OrtInferSession, TokenIDConverter, get_logger,
			read_yaml)
			from .utils.postprocess_utils import sentence_postprocess
			from .utils.frontend import WavFrontend

			logging = get_logger()

			@@ -65,7 +66,7 @@
			wav_content: Union[str, np.ndarray, List[str]]) -> List:
			def load_wav(path: str) -> np.ndarray:
			waveform, _ = librosa.load(path, sr=None)
			return waveform[None, ...]
			return waveform

			if isinstance(wav_content, np.ndarray):
			return [wav_content]

			@@ -2,4 +2,5 @@
			numpy
			onnxruntime
			scipy
			typeguard
			typeguard
			kaldi-native-fbank

New file
			@@ -0,0 +1,137 @@
			# -*- encoding: utf-8 -*-
			from pathlib import Path
			from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

			import numpy as np
			from typeguard import check_argument_types
			import kaldi_native_fbank as knf

			root_dir = Path(__file__).resolve().parent

			logger_initialized = {}


			class WavFrontend():
			    """Feature frontend for ASR: fbank extraction, optional LFR stacking, CMVN.

			    Fbank features are computed with ``kaldi_native_fbank``. ``lfr_cmvn`` then
			    optionally stacks consecutive frames (``lfr_m`` frames per output row,
			    advancing ``lfr_n`` frames per row) and applies the shift/scale statistics
			    read from ``cmvn_file``.
			    """

			    def __init__(
			            self,
			            cmvn_file: Union[str, None] = None,
			            fs: int = 16000,
			            window: str = 'hamming',
			            n_mels: int = 80,
			            frame_length: int = 25,
			            frame_shift: int = 10,
			            filter_length_min: int = -1,
			            filter_length_max: float = -1,
			            lfr_m: int = 1,
			            lfr_n: int = 1,
			            dither: float = 1.0
			    ) -> None:
			        """Configure the fbank options and load CMVN statistics if requested.

			        Args:
			            cmvn_file: Path to a text CMVN file understood by ``load_cmvn``.
			                When falsy, no CMVN is applied.
			            fs: Sampling frequency handed to the fbank extractor.
			            window: Window type name handed to the fbank extractor.
			            n_mels: Number of mel bins.
			            frame_length: Frame length in milliseconds. The default is an
			                ``int`` so that it satisfies its own annotation when the
			                arguments are type-checked.
			            frame_shift: Frame shift in milliseconds.
			            filter_length_min: Stored on the instance; not used by this class.
			            filter_length_max: Stored on the instance; not used by this class.
			            lfr_m: Number of frames stacked into one LFR output row.
			            lfr_n: Hop, in frames, between consecutive LFR output rows.
			            dither: Dither value handed to the fbank extractor.
			        """
			        check_argument_types()

			        opts = knf.FbankOptions()
			        opts.frame_opts.samp_freq = fs
			        opts.frame_opts.dither = dither
			        opts.frame_opts.window_type = window
			        opts.frame_opts.frame_shift_ms = float(frame_shift)
			        opts.frame_opts.frame_length_ms = float(frame_length)
			        opts.mel_opts.num_bins = n_mels
			        opts.energy_floor = 0
			        opts.frame_opts.snip_edges = True
			        opts.mel_opts.debug_mel = False
			        self.opts = opts

			        # Kept only so existing code that reads this attribute keeps working;
			        # ``fbank`` builds its own extractor for every call (see there).
			        self.compute_fbank_feats = knf.OnlineFbank(self.opts)

			        self.filter_length_min = filter_length_min
			        self.filter_length_max = filter_length_max
			        self.lfr_m = lfr_m
			        self.lfr_n = lfr_n
			        self.cmvn_file = cmvn_file

			        if self.cmvn_file:
			            self.cmvn = self.load_cmvn()

			    def fbank(self,
			              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			        """Compute fbank features for one waveform.

			        Args:
			            waveform: Audio samples; they are multiplied by ``1 << 15`` before
			                extraction (presumably floats in [-1, 1] -- confirm with the
			                caller). Assumed 1-D, since ``tolist()`` is passed directly to
			                the extractor.

			        Returns:
			            ``(feat, feat_len)``: ``feat`` is a float32 array with one row per
			            ready frame and ``num_bins`` columns; ``feat_len`` is the number
			            of rows as a 0-d int32 array.
			        """
			        waveform = waveform * (1 << 15)
			        # The online extractor accumulates every waveform it is given, so a
			        # shared instance would mix frames from previous calls into this
			        # result. A fresh extractor keeps each call independent.
			        fbank_fn = knf.OnlineFbank(self.opts)
			        fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
			        frames = fbank_fn.num_frames_ready
			        mat = np.empty([frames, self.opts.mel_opts.num_bins])
			        for i in range(frames):
			            mat[i, :] = fbank_fn.get_frame(i)
			        feat = mat.astype(np.float32)
			        feat_len = np.array(mat.shape[0]).astype(np.int32)
			        return feat, feat_len

			    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			        """Apply LFR stacking (unless ``lfr_m == lfr_n == 1``) and then CMVN.

			        Args:
			            feat: 2-D feature matrix, one frame per row.

			        Returns:
			            ``(feat, feat_len)`` where ``feat_len`` is the resulting number of
			            rows as a 0-d int32 array.
			        """
			        if self.lfr_m != 1 or self.lfr_n != 1:
			            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)

			        if self.cmvn_file:
			            feat = self.apply_cmvn(feat)

			        feat_len = np.array(feat.shape[0]).astype(np.int32)
			        return feat, feat_len

			    @staticmethod
			    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
			        """Stack ``lfr_m`` consecutive frames, advancing ``lfr_n`` frames per row.

			        The input is left-padded with ``(lfr_m - 1) // 2`` copies of its first
			        frame; a final window that runs past the end is completed with copies
			        of the last frame.

			        Args:
			            inputs: 2-D array, one frame per row; must contain at least one row.
			            lfr_m: Number of frames concatenated into each output row.
			            lfr_n: Number of input frames skipped between output rows.

			        Returns:
			            float32 array with ``ceil(T / lfr_n)`` rows (``T`` being the number
			            of input rows) and ``lfr_m * dim`` columns.
			        """
			        LFR_inputs = []

			        T = inputs.shape[0]
			        # The number of output rows is fixed from the unpadded length.
			        T_lfr = int(np.ceil(T / lfr_n))
			        left_pad = (lfr_m - 1) // 2
			        left_padding = np.tile(inputs[0], (left_pad, 1))
			        inputs = np.vstack((left_padding, inputs))
			        T = T + left_pad
			        for i in range(T_lfr):
			            start = i * lfr_n
			            if lfr_m <= T - start:
			                LFR_inputs.append(
			                    inputs[start:start + lfr_m].reshape(1, -1))
			            else:
			                # Last window is short: fill it up by repeating the last frame.
			                num_padding = lfr_m - (T - start)
			                frame = np.hstack((inputs[start:].reshape(-1),
			                                   np.tile(inputs[-1], num_padding)))
			                LFR_inputs.append(frame)
			        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
			        return LFR_outputs

			    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
			        """Apply the loaded CMVN statistics as ``(inputs + shift) * scale``.

			        Row 0 of ``self.cmvn`` is added to every frame and row 1 multiplies the
			        result; only the first ``dim`` columns of the statistics are used.

			        Args:
			            inputs: 2-D feature matrix of shape ``(frame, dim)``.

			        Returns:
			            Array of the same shape as ``inputs``.
			        """
			        _, dim = inputs.shape
			        # (1, dim) rows broadcast over all frames; no explicit tiling needed.
			        shift = self.cmvn[0:1, :dim]
			        scale = self.cmvn[1:2, :dim]
			        return (inputs + shift) * scale

			    def load_cmvn(self,) -> np.ndarray:
			        """Read the shift and scale vectors from ``self.cmvn_file``.

			        The file is scanned for an ``<AddShift>`` line and a ``<Rescale>`` line,
			        each immediately followed by a ``<LearnRateCoef>`` line. On that line
			        the first three tokens and the final token are dropped and the rest are
			        parsed as floats. Blank lines and a header on the very last line are
			        ignored instead of raising ``IndexError``.

			        Returns:
			            float64 array whose row 0 holds the ``<AddShift>`` values and whose
			            row 1 holds the ``<Rescale>`` values.
			        """
			        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
			            lines = f.readlines()

			        means_list = []
			        scales_list = []
			        # Walk over (header, following line) pairs so the look-ahead can never
			        # run past the end of the file.
			        for header, payload in zip(lines, lines[1:]):
			            header_items = header.split()
			            payload_items = payload.split()
			            if not header_items or not payload_items:
			                continue
			            if payload_items[0] != '<LearnRateCoef>':
			                continue
			            values = payload_items[3:-1]
			            if header_items[0] == '<AddShift>':
			                means_list = values
			            elif header_items[0] == '<Rescale>':
			                scales_list = values

			        means = np.array(means_list).astype(np.float64)
			        scales = np.array(scales_list).astype(np.float64)
			        cmvn = np.array([means, scales])
			        return cmvn

			@@ -13,7 +13,6 @@
			SessionOptions, get_available_providers, get_device)
			from typeguard import check_argument_types

			from funasr.runtime.python.onnxruntime.paraformer.rapid_paraformer.kaldifeat import compute_fbank_feats
			import warnings

			root_dir = Path(__file__).resolve().parent
			@@ -120,128 +119,6 @@
			f")"
			)


			class WavFrontend():
			    """Feature frontend for ASR: fbank extraction, optional LFR stacking, CMVN.

			    Fbank features come from ``compute_fbank_feats``. ``lfr_cmvn`` then
			    optionally stacks consecutive frames (``lfr_m`` frames per output row,
			    advancing ``lfr_n`` frames per row) and applies the shift/scale statistics
			    read from ``cmvn_file``.
			    """

			    def __init__(
			            self,
			            cmvn_file: str = None,
			            fs: int = 16000,
			            window: str = 'hamming',
			            n_mels: int = 80,
			            frame_length: int = 25,
			            frame_shift: int = 10,
			            filter_length_min: int = -1,
			            filter_length_max: float = -1,
			            lfr_m: int = 1,
			            lfr_n: int = 1,
			            dither: float = 1.0
			    ) -> None:
			        """Store the extraction settings and load CMVN statistics if requested.

			        Args:
			            cmvn_file: Path to a text CMVN file understood by ``load_cmvn``.
			                When falsy, no CMVN is applied.
			            fs: Sample frequency passed to ``compute_fbank_feats``.
			            window: Window type passed to ``compute_fbank_feats``.
			            n_mels: Number of mel bins passed to ``compute_fbank_feats``.
			            frame_length: Frame length passed to ``compute_fbank_feats``.
			            frame_shift: Frame shift passed to ``compute_fbank_feats``.
			            filter_length_min: Stored on the instance; not used by this class.
			            filter_length_max: Stored on the instance; not used by this class.
			            lfr_m: Number of frames stacked into one LFR output row.
			            lfr_n: Hop, in frames, between consecutive LFR output rows.
			            dither: Dither value passed to ``compute_fbank_feats``.
			        """
			        check_argument_types()

			        self.fs = fs
			        self.window = window
			        self.n_mels = n_mels
			        self.frame_length = frame_length
			        self.frame_shift = frame_shift
			        self.filter_length_min = filter_length_min
			        self.filter_length_max = filter_length_max
			        self.lfr_m = lfr_m
			        self.lfr_n = lfr_n
			        self.cmvn_file = cmvn_file
			        self.dither = dither

			        if self.cmvn_file:
			            self.cmvn = self.load_cmvn()

			    def fbank(self,
			              input_content: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			        """Compute fbank features for the first row of ``input_content``.

			        Args:
			            input_content: 2-D array; only row 0 is used. The samples are
			                multiplied by ``1 << 15`` before extraction (presumably floats
			                in [-1, 1] -- confirm with the caller).

			        Returns:
			            ``(feat, feat_len)``: ``feat`` is the float32 feature matrix and
			            ``feat_len`` its number of rows as a 0-d int32 array.
			        """
			        waveform_len = input_content.shape[1]
			        waveform = input_content[0][:waveform_len]
			        waveform = waveform * (1 << 15)
			        mat = compute_fbank_feats(waveform,
			                                  num_mel_bins=self.n_mels,
			                                  frame_length=self.frame_length,
			                                  frame_shift=self.frame_shift,
			                                  dither=self.dither,
			                                  energy_floor=0.0,
			                                  sample_frequency=self.fs,
			                                  window_type=self.window)
			        feat = mat.astype(np.float32)
			        feat_len = np.array(mat.shape[0]).astype(np.int32)
			        return feat, feat_len

			    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			        """Apply LFR stacking (unless ``lfr_m == lfr_n == 1``) and then CMVN.

			        Args:
			            feat: 2-D feature matrix, one frame per row.

			        Returns:
			            ``(feat, feat_len)`` where ``feat_len`` is the resulting number of
			            rows as a 0-d int32 array.
			        """
			        if self.lfr_m != 1 or self.lfr_n != 1:
			            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)

			        if self.cmvn_file:
			            feat = self.apply_cmvn(feat)

			        feat_len = np.array(feat.shape[0]).astype(np.int32)
			        return feat, feat_len

			    @staticmethod
			    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
			        """Stack ``lfr_m`` consecutive frames, advancing ``lfr_n`` frames per row.

			        The input is left-padded with ``(lfr_m - 1) // 2`` copies of its first
			        frame; a final window that runs past the end is completed with copies
			        of the last frame.

			        Args:
			            inputs: 2-D array, one frame per row; must contain at least one row.
			            lfr_m: Number of frames concatenated into each output row.
			            lfr_n: Number of input frames skipped between output rows.

			        Returns:
			            float32 array with ``ceil(T / lfr_n)`` rows (``T`` being the number
			            of input rows) and ``lfr_m * dim`` columns.
			        """
			        LFR_inputs = []

			        T = inputs.shape[0]
			        # The number of output rows is fixed from the unpadded length.
			        T_lfr = int(np.ceil(T / lfr_n))
			        left_pad = (lfr_m - 1) // 2
			        left_padding = np.tile(inputs[0], (left_pad, 1))
			        inputs = np.vstack((left_padding, inputs))
			        T = T + left_pad
			        for i in range(T_lfr):
			            start = i * lfr_n
			            if lfr_m <= T - start:
			                LFR_inputs.append(
			                    inputs[start:start + lfr_m].reshape(1, -1))
			            else:
			                # Last window is short: fill it up by repeating the last frame.
			                num_padding = lfr_m - (T - start)
			                frame = np.hstack((inputs[start:].reshape(-1),
			                                   np.tile(inputs[-1], num_padding)))
			                LFR_inputs.append(frame)
			        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
			        return LFR_outputs

			    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
			        """Apply the loaded CMVN statistics as ``(inputs + shift) * scale``.

			        Row 0 of ``self.cmvn`` is added to every frame and row 1 multiplies the
			        result; only the first ``dim`` columns of the statistics are used.

			        Args:
			            inputs: 2-D feature matrix of shape ``(frame, dim)``.

			        Returns:
			            Array of the same shape as ``inputs``.
			        """
			        _, dim = inputs.shape
			        # (1, dim) rows broadcast over all frames; no explicit tiling needed.
			        shift = self.cmvn[0:1, :dim]
			        scale = self.cmvn[1:2, :dim]
			        return (inputs + shift) * scale

			    def load_cmvn(self,) -> np.ndarray:
			        """Read the shift and scale vectors from ``self.cmvn_file``.

			        The file is scanned for an ``<AddShift>`` line and a ``<Rescale>`` line,
			        each immediately followed by a ``<LearnRateCoef>`` line. On that line
			        the first three tokens and the final token are dropped and the rest are
			        parsed as floats. Blank lines and a header on the very last line are
			        ignored instead of raising ``IndexError``.

			        Returns:
			            float64 array whose row 0 holds the ``<AddShift>`` values and whose
			            row 1 holds the ``<Rescale>`` values.
			        """
			        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
			            lines = f.readlines()

			        means_list = []
			        scales_list = []
			        # Walk over (header, following line) pairs so the look-ahead can never
			        # run past the end of the file.
			        for header, payload in zip(lines, lines[1:]):
			            header_items = header.split()
			            payload_items = payload.split()
			            if not header_items or not payload_items:
			                continue
			            if payload_items[0] != '<LearnRateCoef>':
			                continue
			            values = payload_items[3:-1]
			            if header_items[0] == '<AddShift>':
			                means_list = values
			            elif header_items[0] == '<Rescale>':
			                scales_list = values

			        means = np.array(means_list).astype(np.float64)
			        scales = np.array(scales_list).astype(np.float64)
			        cmvn = np.array([means, scales])
			        return cmvn


			class Hypothesis(NamedTuple):