From 1aba91a4b8c60926d97de18ec2e9470c11d416b7 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期一, 13 二月 2023 19:55:46 +0800
Subject: [PATCH] export model

---
 /dev/null                                                                        |   43 -------
 .gitignore                                                                       |    1 
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py     |  123 --------------------
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py  |  137 ++++++++++++++++++++++
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py |    5 
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt   |    3 
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py        |    3 
 7 files changed, 144 insertions(+), 171 deletions(-)

diff --git a/.gitignore b/.gitignore
index f8b55d7..7a43407 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@
 .DS_Store
 init_model/
 *.tar.gz
+test_local/
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py
index f1b5c29..d071b07 100644
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py
@@ -1,4 +1,3 @@
 # -*- encoding: utf-8 -*-
 # @Author: SWHL
-# @Contact: liekkaskono@163.com
-from .paraformer_onnx import Paraformer
+# @Contact: liekkaskono@163.com
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/LICENSE b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/LICENSE
deleted file mode 100644
index 261eeb9..0000000
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/LICENSE
+++ /dev/null
@@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/__init__.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/__init__.py
deleted file mode 100644
index f9cf273..0000000
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# -*- encoding: utf-8 -*-
-from .feature import compute_fbank_feats, compute_mfcc_feats, apply_cmvn_sliding
-from .ivector import compute_vad
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/feature.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/feature.py
deleted file mode 100644
index a6c6a6c..0000000
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/feature.py
+++ /dev/null
@@ -1,459 +0,0 @@
-import numpy as np
-from scipy.fftpack import dct
-
-
-# ---------- feature-window ----------
-
-def sliding_window(x, window_size, window_shift):
-    shape = x.shape[:-1] + (x.shape[-1] - window_size + 1, window_size)
-    strides = x.strides + (x.strides[-1],)
-    return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)[::window_shift]
-
-
-def func_num_frames(num_samples, window_size, window_shift, snip_edges):
-    if snip_edges:
-        if num_samples < window_size:
-            return 0
-        else:
-            return 1 + ((num_samples - window_size) // window_shift)
-    else:
-        return (num_samples + (window_shift // 2)) // window_shift
-
-
-def func_dither(waveform, dither_value):
-    if dither_value == 0.0:
-        return waveform
-    waveform += np.random.normal(size=waveform.shape).astype(waveform.dtype) * dither_value
-    return waveform
-
-
-def func_remove_dc_offset(waveform):
-    return waveform - np.mean(waveform)
-
-
-def func_log_energy(waveform):
-    return np.log(np.dot(waveform, waveform).clip(min=np.finfo(waveform.dtype).eps))
-
-
-def func_preemphasis(waveform, preemph_coeff):
-    if preemph_coeff == 0.0:
-        return waveform
-    assert 0 < preemph_coeff <= 1
-    waveform[1:] -= preemph_coeff * waveform[:-1]
-    waveform[0] -= preemph_coeff * waveform[0]
-    return waveform
-
-
-def sine(M):
-    if M < 1:
-        return np.array([])
-    if M == 1:
-        return np.ones(1, float)
-    n = np.arange(0, M)
-    return np.sin(np.pi*n/(M-1))
-
-
-def povey(M):
-    if M < 1:
-        return np.array([])
-    if M == 1:
-        return np.ones(1, float)
-    n = np.arange(0, M)
-    return (0.5 - 0.5*np.cos(2.0*np.pi*n/(M-1)))**0.85
-
-
-def feature_window_function(window_type, window_size, blackman_coeff):
-    assert window_size > 0
-    if window_type == 'hanning':
-        return np.hanning(window_size)
-    elif window_type == 'sine':
-        return sine(window_size)
-    elif window_type == 'hamming':
-        return np.hamming(window_size)
-    elif window_type == 'povey':
-        return povey(window_size)
-    elif window_type == 'rectangular':
-        return np.ones(window_size)
-    elif window_type == 'blackman':
-        window_func = np.blackman(window_size)
-        if blackman_coeff == 0.42:
-            return window_func
-        else:
-            return window_func - 0.42 + blackman_coeff
-    else:
-        raise ValueError('Invalid window type {}'.format(window_type))
-
-
-def process_window(window, dither, remove_dc_offset, preemphasis_coefficient, window_function, raw_energy):
-    if dither != 0.0:
-        window = func_dither(window, dither)
-    if remove_dc_offset:
-        window = func_remove_dc_offset(window)
-    if raw_energy:
-        log_energy = func_log_energy(window)
-    if preemphasis_coefficient != 0.0:
-        window = func_preemphasis(window, preemphasis_coefficient)
-    window *= window_function
-    if not raw_energy:
-        log_energy = func_log_energy(window)
-    return window, log_energy
-
-
-def extract_window(waveform, blackman_coeff, dither, window_size, window_shift,
-                   preemphasis_coefficient, raw_energy, remove_dc_offset,
-                   snip_edges, window_type, dtype):
-    num_samples = len(waveform)
-    num_frames = func_num_frames(num_samples, window_size, window_shift, snip_edges)
-    num_samples_ = (num_frames - 1) * window_shift + window_size
-    if snip_edges:
-        waveform = waveform[:num_samples_]
-    else:
-        offset = window_shift // 2 - window_size // 2
-        waveform = np.concatenate([
-            waveform[-offset - 1::-1],
-            waveform,
-            waveform[:-(offset + num_samples_ - num_samples + 1):-1]
-        ])
-    frames = sliding_window(waveform, window_size=window_size, window_shift=window_shift)
-    frames = frames.astype(dtype)
-    log_enery = np.empty(frames.shape[0], dtype=dtype)
-    for i in range(frames.shape[0]):
-        frames[i], log_enery[i] = process_window(
-            window=frames[i],
-            dither=dither,
-            remove_dc_offset=remove_dc_offset,
-            preemphasis_coefficient=preemphasis_coefficient,
-            window_function=feature_window_function(
-                window_type=window_type,
-                window_size=window_size,
-                blackman_coeff=blackman_coeff
-            ).astype(dtype),
-            raw_energy=raw_energy
-        )
-    return frames, log_enery
-
-# ---------- feature-window ----------
-
-
-# ---------- feature-functions ----------
-
-def compute_spectrum(frames, n):
-    complex_spec = np.fft.rfft(frames, n)
-    return np.absolute(complex_spec)
-
-
-def compute_power_spectrum(frames, n):
-    return np.square(compute_spectrum(frames, n))
-
-
-def apply_cmvn_sliding_internal(feat, center=False, window=600, min_window=100, norm_vars=False):
-    num_frames, feat_dim = feat.shape
-    std = 1
-    if center:
-        if num_frames <= window:
-            mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0)
-            if norm_vars:
-                std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0)
-        else:
-            feat1 = feat[:window]
-            feat2 = sliding_window(feat.T, window, 1)
-            feat3 = feat[-window:]
-            mean1 = feat1.mean(axis=0, keepdims=True).repeat(window // 2, axis=0)
-            mean2 = feat2.mean(axis=2).T
-            mean3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0)
-            mean = np.concatenate([mean1, mean2, mean3])
-            if norm_vars:
-                std1 = feat1.std(axis=0, keepdims=True).repeat(window // 2, axis=0)
-                std2 = feat2.std(axis=2).T
-                std3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0)
-                std = np.concatenate([std1, std2, std3])
-    else:
-        if num_frames <= min_window:
-            mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0)
-            if norm_vars:
-                std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0)
-        else:
-            feat1 = feat[:min_window]
-            mean1 = feat1.mean(axis=0, keepdims=True).repeat(min_window, axis=0)
-            feat2_cumsum = np.cumsum(feat[:window], axis=0)[min_window:]
-            cumcnt = np.arange(min_window + 1, min(window, num_frames) + 1, dtype=feat.dtype)[:, np.newaxis]
-            mean2 = feat2_cumsum / cumcnt
-            mean = np.concatenate([mean1, mean2])
-            if norm_vars:
-                std1 = feat1.std(axis=0, keepdims=True).repeat(min_window, axis=0)
-                feat2_power_cumsum = np.cumsum(np.square(feat[:window]), axis=0)[min_window:]
-                std2 = np.sqrt(feat2_power_cumsum / cumcnt - np.square(mean2))
-                std = np.concatenate([std1, std2])
-            if num_frames > window:
-                feat3 = sliding_window(feat.T, window, 1)
-                mean3 = feat3.mean(axis=2).T
-                mean = np.concatenate([mean, mean3[1:]])
-                if norm_vars:
-                    std3 = feat3.std(axis=2).T
-                    std = np.concatenate([std, std3[1:]])
-    feat = (feat - mean) / std
-    return feat
-
-# ---------- feature-functions ----------
-
-
-# ---------- mel-computations ----------
-
-def inverse_mel_scale(mel_freq):
-    return 700.0 * (np.exp(mel_freq / 1127.0) - 1.0)
-
-
-def mel_scale(freq):
-    return 1127.0 * np.log(1.0 + freq / 700.0)
-
-
-def compute_mel_banks(num_bins, sample_frequency, low_freq, high_freq, n):
-    """ Compute Mel banks.
-
-    :param num_bins: Number of triangular mel-frequency bins
-    :param sample_frequency: Waveform data sample frequency
-    :param low_freq: Low cutoff frequency for mel bins
-    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
-    :param n: Window size
-    :return: Mel banks.
-    """
-    assert num_bins >= 3, 'Must have at least 3 mel bins'
-    num_fft_bins = n // 2
-
-    nyquist = 0.5 * sample_frequency
-    if high_freq <= 0:
-        high_freq = nyquist + high_freq
-    assert 0 <= low_freq < high_freq <= nyquist
-
-    fft_bin_width = sample_frequency / n
-
-    mel_low_freq = mel_scale(low_freq)
-    mel_high_freq = mel_scale(high_freq)
-    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)
-
-    mel_banks = np.zeros([num_bins, num_fft_bins + 1])
-    for i in range(num_bins):
-        left_mel = mel_low_freq + mel_freq_delta * i
-        center_mel = left_mel + mel_freq_delta
-        right_mel = center_mel + mel_freq_delta
-        for j in range(num_fft_bins):
-            mel = mel_scale(fft_bin_width * j)
-            if left_mel < mel < right_mel:
-                if mel <= center_mel:
-                    mel_banks[i, j] = (mel - left_mel) / (center_mel - left_mel)
-                else:
-                    mel_banks[i, j] = (right_mel - mel) / (right_mel - center_mel)
-    return mel_banks
-
-
-def compute_lifter_coeffs(q, M):
-    """ Compute liftering coefficients (scaling on cepstral coeffs)
-        the zeroth index is C0, which is not affected.
-
-    :param q: Number of lifters
-    :param M: Number of coefficients
-    :return: Lifters.
-    """
-    if M < 1:
-        return np.array([])
-    if M == 1:
-        return np.ones(1, float)
-    n = np.arange(0, M)
-    return 1 + 0.5*np.sin(np.pi*n/q)*q
-
-# ---------- mel-computations ----------
-
-
-# ---------- compute-fbank-feats ----------
-
-def compute_fbank_feats(
-        waveform,
-        blackman_coeff=0.42,
-        dither=1.0,
-        energy_floor=1.0,
-        frame_length=25,
-        frame_shift=10,
-        high_freq=0,
-        low_freq=20,
-        num_mel_bins=23,
-        preemphasis_coefficient=0.97,
-        raw_energy=True,
-        remove_dc_offset=True,
-        round_to_power_of_two=True,
-        sample_frequency=16000,
-        snip_edges=True,
-        use_energy=False,
-        use_log_fbank=True,
-        use_power=True,
-        window_type='povey',
-        dtype=np.float32):
-    """ Compute (log) Mel filter bank energies
-
-    :param waveform: Input waveform.
-    :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42)
-    :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
-    :param energy_floor: Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0.  Suggested values: 0.1 or 1.0 (float, default = 0)
-    :param frame_length: Frame length in milliseconds (float, default = 25)
-    :param frame_shift: Frame shift in milliseconds (float, default = 10)
-    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
-    :param low_freq: Low cutoff frequency for mel bins (float, default = 20)
-    :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23)
-    :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97)
-    :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true)
-    :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true)
-    :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
-    :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
-    :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length.  If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
-    :param use_energy: Add an extra energy output. (bool, default = false)
-    :param use_log_fbank: If true, produce log-filterbank, else produce linear. (bool, default = true)
-    :param use_power: If true, use power, else use magnitude. (bool, default = true)
-    :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
-    :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32)
-    :return: (Log) Mel filter bank energies.
-    """
-    window_size = int(frame_length * sample_frequency * 0.001)
-    window_shift = int(frame_shift * sample_frequency * 0.001)
-    frames, log_energy = extract_window(
-        waveform=waveform,
-        blackman_coeff=blackman_coeff,
-        dither=dither,
-        window_size=window_size,
-        window_shift=window_shift,
-        preemphasis_coefficient=preemphasis_coefficient,
-        raw_energy=raw_energy,
-        remove_dc_offset=remove_dc_offset,
-        snip_edges=snip_edges,
-        window_type=window_type,
-        dtype=dtype
-    )
-    if round_to_power_of_two:
-        n = 1
-        while n < window_size:
-            n *= 2
-    else:
-        n = window_size
-    if use_power:
-        spectrum = compute_power_spectrum(frames, n)
-    else:
-        spectrum = compute_spectrum(frames, n)
-    mel_banks = compute_mel_banks(
-        num_bins=num_mel_bins,
-        sample_frequency=sample_frequency,
-        low_freq=low_freq,
-        high_freq=high_freq,
-        n=n
-    ).astype(dtype)
-    feat = np.dot(spectrum, mel_banks.T)
-    if use_log_fbank:
-        feat = np.log(feat.clip(min=np.finfo(dtype).eps))
-    if use_energy:
-        if energy_floor > 0.0:
-            log_energy.clip(min=np.math.log(energy_floor))
-        return feat, log_energy
-    return feat
-
-# ---------- compute-fbank-feats ----------
-
-
-# ---------- compute-mfcc-feats ----------
-
-def compute_mfcc_feats(
-        waveform,
-        blackman_coeff=0.42,
-        cepstral_lifter=22,
-        dither=1.0,
-        energy_floor=0.0,
-        frame_length=25,
-        frame_shift=10,
-        high_freq=0,
-        low_freq=20,
-        num_ceps=13,
-        num_mel_bins=23,
-        preemphasis_coefficient=0.97,
-        raw_energy=True,
-        remove_dc_offset=True,
-        round_to_power_of_two=True,
-        sample_frequency=16000,
-        snip_edges=True,
-        use_energy=True,
-        window_type='povey',
-        dtype=np.float32):
-    """ Compute mel-frequency cepstral coefficients
-
-    :param waveform: Input waveform.
-    :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42)
-    :param cepstral_lifter: Constant that controls scaling of MFCCs (float, default = 22)
-    :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
-    :param energy_floor: Floor on energy (absolute, not relative) in MFCC computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0.  Suggested values: 0.1 or 1.0 (float, default = 0)
-    :param frame_length: Frame length in milliseconds (float, default = 25)
-    :param frame_shift: Frame shift in milliseconds (float, default = 10)
-    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
-    :param low_freq: Low cutoff frequency for mel bins (float, default = 20)
-    :param num_ceps: Number of cepstra in MFCC computation (including C0) (int, default = 13)
-    :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23)
-    :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97)
-    :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true)
-    :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true)
-    :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
-    :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
-    :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length.  If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
-    :param use_energy: Use energy (not C0) in MFCC computation (bool, default = true)
-    :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
-    :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32)
-    :return: Mel-frequency cespstral coefficients.
-    """
-    feat, log_energy = compute_fbank_feats(
-        waveform=waveform,
-        blackman_coeff=blackman_coeff,
-        dither=dither,
-        energy_floor=energy_floor,
-        frame_length=frame_length,
-        frame_shift=frame_shift,
-        high_freq=high_freq,
-        low_freq=low_freq,
-        num_mel_bins=num_mel_bins,
-        preemphasis_coefficient=preemphasis_coefficient,
-        raw_energy=raw_energy,
-        remove_dc_offset=remove_dc_offset,
-        round_to_power_of_two=round_to_power_of_two,
-        sample_frequency=sample_frequency,
-        snip_edges=snip_edges,
-        use_energy=use_energy,
-        use_log_fbank=True,
-        use_power=True,
-        window_type=window_type,
-        dtype=dtype
-    )
-    feat = dct(feat, type=2, axis=1, norm='ortho')[:, :num_ceps]
-    lifter_coeffs = compute_lifter_coeffs(cepstral_lifter, num_ceps).astype(dtype)
-    feat = feat * lifter_coeffs
-    if use_energy:
-        feat[:, 0] = log_energy
-    return feat
-
-# ---------- compute-mfcc-feats ----------
-
-
-# ---------- apply-cmvn-sliding ----------
-
-def apply_cmvn_sliding(feat, center=False, window=600, min_window=100, norm_vars=False):
-    """ Apply sliding-window cepstral mean (and optionally variance) normalization
-
-    :param feat: Cepstrum.
-    :param center: If true, use a window centered on the current frame (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false)
-    :param window: Window in frames for running average CMN computation (int, default = 600)
-    :param min_window: Minimum CMN window used at start of decoding (adds latency only at start). Only applicable if center == false, ignored if center==true (int, default = 100)
-    :param norm_vars: If true, normalize variance to one. (bool, default = false)
-    :return: Normalized cepstrum.
-    """
-    # double-precision
-    feat = apply_cmvn_sliding_internal(
-        feat=feat.astype(np.float64),
-        center=center,
-        window=window,
-        min_window=min_window,
-        norm_vars=norm_vars
-    ).astype(feat.dtype)
-    return feat
-
-# ---------- apply-cmvn-sliding ----------
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/ivector.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/ivector.py
deleted file mode 100644
index 5577be1..0000000
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/ivector.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import numpy as np
-
-from .feature import sliding_window
-
-
-# ---------- compute-vad ----------
-
-def compute_vad(log_energy, energy_mean_scale=0.5, energy_threshold=0.5, frames_context=0, proportion_threshold=0.6):
-    """ Apply voice activity detection
-
-    :param log_energy: Log mel energy.
-    :param energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5)
-    :param energy_threshold: Constant term in energy threshold for VAD (also see energy_mean_scale) (float, default = 5)
-    :param frames_context: Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0)
-    :param proportion_threshold: Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6)
-    :return: A vector of boolean that are True if we judge the frame voiced and False otherwise.
-    """
-    assert len(log_energy.shape) == 1
-    assert energy_mean_scale >= 0
-    assert frames_context >= 0
-    assert 0 < proportion_threshold < 1
-    dtype = log_energy.dtype
-    energy_threshold += energy_mean_scale * log_energy.mean()
-    if frames_context > 0:
-        num_frames = len(log_energy)
-        window_size = frames_context * 2 + 1
-        log_energy_pad = np.concatenate([
-            np.zeros(frames_context, dtype=dtype),
-            log_energy,
-            np.zeros(frames_context, dtype=dtype)
-        ])
-        log_energy_window = sliding_window(log_energy_pad, window_size, 1)
-        num_count = np.count_nonzero(log_energy_window > energy_threshold, axis=1)
-        den_count = np.ones(num_frames, dtype=dtype) * window_size
-        max_den_count = np.arange(frames_context + 1, min(window_size, num_frames) + 1, dtype=dtype)
-        den_count[:-(frames_context + 2):-1] = max_den_count
-        den_count[:frames_context + 1] = np.min([den_count[:frames_context + 1], max_den_count], axis=0)
-        vad = num_count / den_count >= proportion_threshold
-    else:
-        vad = log_energy > energy_threshold
-    return vad
-
-# ---------- compute-vad ----------
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py
index f8fe063..b6ff0cd 100644
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py
@@ -10,9 +10,10 @@
 import numpy as np
 
 from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
-                    OrtInferSession, TokenIDConverter, WavFrontend, get_logger,
+                    OrtInferSession, TokenIDConverter, get_logger,
                     read_yaml)
 from .utils.postprocess_utils import sentence_postprocess
+from .utils.frontend import WavFrontend
 
 logging = get_logger()
 
@@ -65,7 +66,7 @@
                   wav_content: Union[str, np.ndarray, List[str]]) -> List:
         def load_wav(path: str) -> np.ndarray:
             waveform, _ = librosa.load(path, sr=None)
-            return waveform[None, ...]
+            return waveform
 
         if isinstance(wav_content, np.ndarray):
             return [wav_content]
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt
index ed40eb4..aeb085a 100644
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt
@@ -2,4 +2,5 @@
 numpy
 onnxruntime
 scipy
-typeguard
\ No newline at end of file
+typeguard
+kaldi-native-fbank
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py
new file mode 100644
index 0000000..f973f65
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py
@@ -0,0 +1,137 @@
+# -*- encoding: utf-8 -*-
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
+
+import numpy as np
+from typeguard import check_argument_types
+import kaldi_native_fbank as knf
+
+root_dir = Path(__file__).resolve().parent
+
+logger_initialized = {}
+
+
+class WavFrontend():
+    """Fbank + LFR + CMVN feature frontend for ASR (uses kaldi-native-fbank)."""
+
+    def __init__(
+            self,
+            cmvn_file: str = None,
+            fs: int = 16000,
+            window: str = 'hamming',
+            n_mels: int = 80,
+            frame_length: float = 25.0,
+            frame_shift: int = 10,
+            filter_length_min: int = -1,
+            filter_length_max: float = -1,
+            lfr_m: int = 1,
+            lfr_n: int = 1,
+            dither: float = 1.0
+    ) -> None:
+        """Configure the fbank options and optionally load CMVN statistics.
+
+        :param cmvn_file: CMVN statistics file; a falsy value disables CMVN.
+        :param fs: Sampling frequency of the waveforms passed to ``fbank``.
+        :param window: Window type for framing.
+        :param n_mels: Number of mel bins.
+        :param frame_length: Frame length in milliseconds.
+        :param frame_shift: Frame shift in milliseconds.
+        :param filter_length_min: Stored only, like ``filter_length_max``.
+        :param lfr_m: Number of consecutive frames stacked per LFR frame.
+        :param lfr_n: Frame step between consecutive LFR frames.
+        :param dither: Dither value for the fbank frame options.
+        """
+        # A float annotation (ints still pass) lets the 25.0 default type-check.
+        check_argument_types()
+
+        opts = knf.FbankOptions()
+        opts.frame_opts.samp_freq = fs
+        opts.frame_opts.dither = dither
+        opts.frame_opts.window_type = window
+        opts.frame_opts.frame_shift_ms = float(frame_shift)
+        opts.frame_opts.frame_length_ms = float(frame_length)
+        opts.mel_opts.num_bins = n_mels
+        opts.energy_floor = 0
+        opts.frame_opts.snip_edges = True
+        opts.mel_opts.debug_mel = False
+        self.opts = opts
+        # Kept for backward compatibility; fbank() replaces it on every call.
+        self.compute_fbank_feats = knf.OnlineFbank(self.opts)
+
+        self.filter_length_min = filter_length_min
+        self.filter_length_max = filter_length_max
+        self.lfr_m = lfr_m
+        self.lfr_n = lfr_n
+        self.cmvn_file = cmvn_file
+        if self.cmvn_file:
+            self.cmvn = self.load_cmvn()
+
+    def fbank(self,
+              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        """Compute float32 fbank features for one complete waveform.
+
+        :param waveform: Audio samples, presumably 1-D floats in [-1, 1).
+        :return: ``(feat, feat_len)``: (frames, n_mels) matrix and int32 count.
+        """
+        # OnlineFbank keeps every sample it has accepted, so a fresh extractor
+        # is needed per call; reusing one would prepend earlier utterances.
+        extractor = self.compute_fbank_feats = knf.OnlineFbank(self.opts)
+        sample_rate = self.opts.frame_opts.samp_freq
+        extractor.accept_waveform(sample_rate, (waveform * (1 << 15)).tolist())
+        num_frames = extractor.num_frames_ready
+        feat = np.empty([num_frames, self.opts.mel_opts.num_bins], dtype=np.float32)
+        for idx in range(num_frames):
+            feat[idx, :] = extractor.get_frame(idx)
+        return feat, np.array(num_frames).astype(np.int32)
+
+    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        """Apply LFR (unless lfr_m == lfr_n == 1), then CMVN if configured."""
+        if self.lfr_m != 1 or self.lfr_n != 1:
+            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
+        if self.cmvn_file:
+            feat = self.apply_cmvn(feat)
+        return feat, np.array(feat.shape[0]).astype(np.int32)
+
+    @staticmethod
+    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
+        """Stack ``lfr_m`` frames every ``lfr_n`` frames (low frame rate).
+
+        The first frame is repeated ``(lfr_m - 1) // 2`` times on the left and
+        the last frame is repeated on the right until every window is full.
+        Expects shape (T, dim) with T >= 1; returns a float32 matrix of shape
+        (ceil(T / lfr_n), lfr_m * dim).
+        """
+        left = (lfr_m - 1) // 2
+        num_out = int(np.ceil(inputs.shape[0] / lfr_n))
+        # Rows still missing for the last window after the left padding.
+        right = max(0, (num_out - 1) * lfr_n + lfr_m - (inputs.shape[0] + left))
+        padded = np.vstack((np.tile(inputs[0], (left, 1)), inputs,
+                            np.tile(inputs[-1], (right, 1))))
+        windows = [padded[i * lfr_n:i * lfr_n + lfr_m].reshape(-1)
+                   for i in range(num_out)]
+        return np.vstack(windows).astype(np.float32)
+
+    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
+        """Return ``(inputs + shift) * scale`` using the loaded CMVN rows."""
+        _, dim = inputs.shape
+        # Row 0 is the additive shift, row 1 the scale; both broadcast per frame.
+        return (inputs + self.cmvn[0:1, :dim]) * self.cmvn[1:2, :dim]
+
+    def load_cmvn(self,) -> np.ndarray:
+        """Parse the shift and scale vectors from ``self.cmvn_file``.
+
+        Values come from the ``<LearnRateCoef>`` line directly after an
+        ``<AddShift>`` / ``<Rescale>`` line. Blank lines and a tag on the very
+        last line are skipped rather than raising ``IndexError``.
+        :return: float64 array; row 0 is the shift, row 1 the scale.
+        """
+        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
+            rows = [line.split() for line in f]
+        stats = {'<AddShift>': [], '<Rescale>': []}
+        for tag_row, value_row in zip(rows, rows[1:]):
+            if (tag_row and tag_row[0] in stats
+                    and value_row and value_row[0] == '<LearnRateCoef>'):
+                # Drop "<LearnRateCoef> <coef> [" in front and the closing "]".
+                stats[tag_row[0]] = value_row[3:-1]
+        return np.array([np.array(stats[tag]).astype(np.float64)
+                         for tag in ('<AddShift>', '<Rescale>')])
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py
index aa9d665..8e220e0 100644
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py
@@ -13,7 +13,6 @@
                          SessionOptions, get_available_providers, get_device)
 from typeguard import check_argument_types
 
-from funasr.runtime.python.onnxruntime.paraformer.rapid_paraformer.kaldifeat import compute_fbank_feats
 import warnings
 
 root_dir = Path(__file__).resolve().parent
@@ -120,128 +119,6 @@
             f")"
         )
 
-
-class WavFrontend():
-    """Conventional frontend structure for ASR.
-    """
-
-    def __init__(
-            self,
-            cmvn_file: str = None,
-            fs: int = 16000,
-            window: str = 'hamming',
-            n_mels: int = 80,
-            frame_length: int = 25,
-            frame_shift: int = 10,
-            filter_length_min: int = -1,
-            filter_length_max: float = -1,
-            lfr_m: int = 1,
-            lfr_n: int = 1,
-            dither: float = 1.0
-    ) -> None:
-        check_argument_types()
-
-        self.fs = fs
-        self.window = window
-        self.n_mels = n_mels
-        self.frame_length = frame_length
-        self.frame_shift = frame_shift
-        self.filter_length_min = filter_length_min
-        self.filter_length_max = filter_length_max
-        self.lfr_m = lfr_m
-        self.lfr_n = lfr_n
-        self.cmvn_file = cmvn_file
-        self.dither = dither
-
-        if self.cmvn_file:
-            self.cmvn = self.load_cmvn()
-
-    def fbank(self,
-              input_content: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
-        waveform_len = input_content.shape[1]
-        waveform = input_content[0][:waveform_len]
-        waveform = waveform * (1 << 15)
-        mat = compute_fbank_feats(waveform,
-                                  num_mel_bins=self.n_mels,
-                                  frame_length=self.frame_length,
-                                  frame_shift=self.frame_shift,
-                                  dither=self.dither,
-                                  energy_floor=0.0,
-                                  sample_frequency=self.fs,
-                                  window_type=self.window)
-        feat = mat.astype(np.float32)
-        feat_len = np.array(mat.shape[0]).astype(np.int32)
-        return feat, feat_len
-
-    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
-        if self.lfr_m != 1 or self.lfr_n != 1:
-            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
-
-        if self.cmvn_file:
-            feat = self.apply_cmvn(feat)
-
-        feat_len = np.array(feat.shape[0]).astype(np.int32)
-        return feat, feat_len
-
-    @staticmethod
-    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
-        LFR_inputs = []
-
-        T = inputs.shape[0]
-        T_lfr = int(np.ceil(T / lfr_n))
-        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
-        inputs = np.vstack((left_padding, inputs))
-        T = T + (lfr_m - 1) // 2
-        for i in range(T_lfr):
-            if lfr_m <= T - i * lfr_n:
-                LFR_inputs.append(
-                    (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
-            else:
-                # process last LFR frame
-                num_padding = lfr_m - (T - i * lfr_n)
-                frame = inputs[i * lfr_n:].reshape(-1)
-                for _ in range(num_padding):
-                    frame = np.hstack((frame, inputs[-1]))
-
-                LFR_inputs.append(frame)
-        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
-        return LFR_outputs
-
-    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
-        """
-        Apply CMVN with mvn data
-        """
-        frame, dim = inputs.shape
-        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
-        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
-        inputs = (inputs + means) * vars
-        return inputs
-
-    def load_cmvn(self,) -> np.ndarray:
-        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
-            lines = f.readlines()
-
-        means_list = []
-        vars_list = []
-        for i in range(len(lines)):
-            line_item = lines[i].split()
-            if line_item[0] == '<AddShift>':
-                line_item = lines[i + 1].split()
-                if line_item[0] == '<LearnRateCoef>':
-                    add_shift_line = line_item[3:(len(line_item) - 1)]
-                    means_list = list(add_shift_line)
-                    continue
-            elif line_item[0] == '<Rescale>':
-                line_item = lines[i + 1].split()
-                if line_item[0] == '<LearnRateCoef>':
-                    rescale_line = line_item[3:(len(line_item) - 1)]
-                    vars_list = list(rescale_line)
-                    continue
-
-        means = np.array(means_list).astype(np.float64)
-        vars = np.array(vars_list).astype(np.float64)
-        cmvn = np.array([means, vars])
-        return cmvn
 
 
 class Hypothesis(NamedTuple):

--
Gitblit v1.9.1