From 1aba91a4b8c60926d97de18ec2e9470c11d416b7 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Mon, 13 Feb 2023 19:55:46 +0800
Subject: [PATCH] export model
---
/dev/null | 43 -------
.gitignore | 1
funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py | 123 --------------------
funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py | 137 ++++++++++++++++++++++
funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py | 5
funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt | 3
funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py | 3
7 files changed, 144 insertions(+), 171 deletions(-)
diff --git a/.gitignore b/.gitignore
index f8b55d7..7a43407 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@
.DS_Store
init_model/
*.tar.gz
+test_local/
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py
index f1b5c29..d071b07 100644
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py
@@ -1,4 +1,3 @@
# -*- encoding: utf-8 -*-
# @Author: SWHL
-# @Contact: liekkaskono@163.com
-from .paraformer_onnx import Paraformer
+# @Contact: liekkaskono@163.com
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/LICENSE b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/LICENSE
deleted file mode 100644
index 261eeb9..0000000
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/LICENSE
+++ /dev/null
@@ -1,201 +0,0 @@
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/__init__.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/__init__.py
deleted file mode 100644
index f9cf273..0000000
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# -*- encoding: utf-8 -*-
-from .feature import compute_fbank_feats, compute_mfcc_feats, apply_cmvn_sliding
-from .ivector import compute_vad
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/feature.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/feature.py
deleted file mode 100644
index a6c6a6c..0000000
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/feature.py
+++ /dev/null
@@ -1,459 +0,0 @@
-import numpy as np
-from scipy.fftpack import dct
-
-
-# ---------- feature-window ----------
-
-def sliding_window(x, window_size, window_shift):
- shape = x.shape[:-1] + (x.shape[-1] - window_size + 1, window_size)
- strides = x.strides + (x.strides[-1],)
- return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)[::window_shift]
-
-
-def func_num_frames(num_samples, window_size, window_shift, snip_edges):
- if snip_edges:
- if num_samples < window_size:
- return 0
- else:
- return 1 + ((num_samples - window_size) // window_shift)
- else:
- return (num_samples + (window_shift // 2)) // window_shift
-
-
-def func_dither(waveform, dither_value):
- if dither_value == 0.0:
- return waveform
- waveform += np.random.normal(size=waveform.shape).astype(waveform.dtype) * dither_value
- return waveform
-
-
-def func_remove_dc_offset(waveform):
- return waveform - np.mean(waveform)
-
-
-def func_log_energy(waveform):
- return np.log(np.dot(waveform, waveform).clip(min=np.finfo(waveform.dtype).eps))
-
-
-def func_preemphasis(waveform, preemph_coeff):
- if preemph_coeff == 0.0:
- return waveform
- assert 0 < preemph_coeff <= 1
- waveform[1:] -= preemph_coeff * waveform[:-1]
- waveform[0] -= preemph_coeff * waveform[0]
- return waveform
-
-
-def sine(M):
- if M < 1:
- return np.array([])
- if M == 1:
- return np.ones(1, float)
- n = np.arange(0, M)
- return np.sin(np.pi*n/(M-1))
-
-
-def povey(M):
- if M < 1:
- return np.array([])
- if M == 1:
- return np.ones(1, float)
- n = np.arange(0, M)
- return (0.5 - 0.5*np.cos(2.0*np.pi*n/(M-1)))**0.85
-
-
-def feature_window_function(window_type, window_size, blackman_coeff):
- assert window_size > 0
- if window_type == 'hanning':
- return np.hanning(window_size)
- elif window_type == 'sine':
- return sine(window_size)
- elif window_type == 'hamming':
- return np.hamming(window_size)
- elif window_type == 'povey':
- return povey(window_size)
- elif window_type == 'rectangular':
- return np.ones(window_size)
- elif window_type == 'blackman':
- window_func = np.blackman(window_size)
- if blackman_coeff == 0.42:
- return window_func
- else:
- return window_func - 0.42 + blackman_coeff
- else:
- raise ValueError('Invalid window type {}'.format(window_type))
-
-
-def process_window(window, dither, remove_dc_offset, preemphasis_coefficient, window_function, raw_energy):
- if dither != 0.0:
- window = func_dither(window, dither)
- if remove_dc_offset:
- window = func_remove_dc_offset(window)
- if raw_energy:
- log_energy = func_log_energy(window)
- if preemphasis_coefficient != 0.0:
- window = func_preemphasis(window, preemphasis_coefficient)
- window *= window_function
- if not raw_energy:
- log_energy = func_log_energy(window)
- return window, log_energy
-
-
-def extract_window(waveform, blackman_coeff, dither, window_size, window_shift,
- preemphasis_coefficient, raw_energy, remove_dc_offset,
- snip_edges, window_type, dtype):
- num_samples = len(waveform)
- num_frames = func_num_frames(num_samples, window_size, window_shift, snip_edges)
- num_samples_ = (num_frames - 1) * window_shift + window_size
- if snip_edges:
- waveform = waveform[:num_samples_]
- else:
- offset = window_shift // 2 - window_size // 2
- waveform = np.concatenate([
- waveform[-offset - 1::-1],
- waveform,
- waveform[:-(offset + num_samples_ - num_samples + 1):-1]
- ])
- frames = sliding_window(waveform, window_size=window_size, window_shift=window_shift)
- frames = frames.astype(dtype)
- log_enery = np.empty(frames.shape[0], dtype=dtype)
- for i in range(frames.shape[0]):
- frames[i], log_enery[i] = process_window(
- window=frames[i],
- dither=dither,
- remove_dc_offset=remove_dc_offset,
- preemphasis_coefficient=preemphasis_coefficient,
- window_function=feature_window_function(
- window_type=window_type,
- window_size=window_size,
- blackman_coeff=blackman_coeff
- ).astype(dtype),
- raw_energy=raw_energy
- )
- return frames, log_enery
-
-# ---------- feature-window ----------
-
-
-# ---------- feature-functions ----------
-
-def compute_spectrum(frames, n):
- complex_spec = np.fft.rfft(frames, n)
- return np.absolute(complex_spec)
-
-
-def compute_power_spectrum(frames, n):
- return np.square(compute_spectrum(frames, n))
-
-
-def apply_cmvn_sliding_internal(feat, center=False, window=600, min_window=100, norm_vars=False):
- num_frames, feat_dim = feat.shape
- std = 1
- if center:
- if num_frames <= window:
- mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0)
- if norm_vars:
- std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0)
- else:
- feat1 = feat[:window]
- feat2 = sliding_window(feat.T, window, 1)
- feat3 = feat[-window:]
- mean1 = feat1.mean(axis=0, keepdims=True).repeat(window // 2, axis=0)
- mean2 = feat2.mean(axis=2).T
- mean3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0)
- mean = np.concatenate([mean1, mean2, mean3])
- if norm_vars:
- std1 = feat1.std(axis=0, keepdims=True).repeat(window // 2, axis=0)
- std2 = feat2.std(axis=2).T
- std3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0)
- std = np.concatenate([std1, std2, std3])
- else:
- if num_frames <= min_window:
- mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0)
- if norm_vars:
- std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0)
- else:
- feat1 = feat[:min_window]
- mean1 = feat1.mean(axis=0, keepdims=True).repeat(min_window, axis=0)
- feat2_cumsum = np.cumsum(feat[:window], axis=0)[min_window:]
- cumcnt = np.arange(min_window + 1, min(window, num_frames) + 1, dtype=feat.dtype)[:, np.newaxis]
- mean2 = feat2_cumsum / cumcnt
- mean = np.concatenate([mean1, mean2])
- if norm_vars:
- std1 = feat1.std(axis=0, keepdims=True).repeat(min_window, axis=0)
- feat2_power_cumsum = np.cumsum(np.square(feat[:window]), axis=0)[min_window:]
- std2 = np.sqrt(feat2_power_cumsum / cumcnt - np.square(mean2))
- std = np.concatenate([std1, std2])
- if num_frames > window:
- feat3 = sliding_window(feat.T, window, 1)
- mean3 = feat3.mean(axis=2).T
- mean = np.concatenate([mean, mean3[1:]])
- if norm_vars:
- std3 = feat3.std(axis=2).T
- std = np.concatenate([std, std3[1:]])
- feat = (feat - mean) / std
- return feat
-
-# ---------- feature-functions ----------
-
-
-# ---------- mel-computations ----------
-
-def inverse_mel_scale(mel_freq):
- return 700.0 * (np.exp(mel_freq / 1127.0) - 1.0)
-
-
-def mel_scale(freq):
- return 1127.0 * np.log(1.0 + freq / 700.0)
-
-
-def compute_mel_banks(num_bins, sample_frequency, low_freq, high_freq, n):
- """ Compute Mel banks.
-
- :param num_bins: Number of triangular mel-frequency bins
- :param sample_frequency: Waveform data sample frequency
- :param low_freq: Low cutoff frequency for mel bins
- :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
- :param n: Window size
- :return: Mel banks.
- """
- assert num_bins >= 3, 'Must have at least 3 mel bins'
- num_fft_bins = n // 2
-
- nyquist = 0.5 * sample_frequency
- if high_freq <= 0:
- high_freq = nyquist + high_freq
- assert 0 <= low_freq < high_freq <= nyquist
-
- fft_bin_width = sample_frequency / n
-
- mel_low_freq = mel_scale(low_freq)
- mel_high_freq = mel_scale(high_freq)
- mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)
-
- mel_banks = np.zeros([num_bins, num_fft_bins + 1])
- for i in range(num_bins):
- left_mel = mel_low_freq + mel_freq_delta * i
- center_mel = left_mel + mel_freq_delta
- right_mel = center_mel + mel_freq_delta
- for j in range(num_fft_bins):
- mel = mel_scale(fft_bin_width * j)
- if left_mel < mel < right_mel:
- if mel <= center_mel:
- mel_banks[i, j] = (mel - left_mel) / (center_mel - left_mel)
- else:
- mel_banks[i, j] = (right_mel - mel) / (right_mel - center_mel)
- return mel_banks
-
-
-def compute_lifter_coeffs(q, M):
- """ Compute liftering coefficients (scaling on cepstral coeffs)
- the zeroth index is C0, which is not affected.
-
- :param q: Number of lifters
- :param M: Number of coefficients
- :return: Lifters.
- """
- if M < 1:
- return np.array([])
- if M == 1:
- return np.ones(1, float)
- n = np.arange(0, M)
- return 1 + 0.5*np.sin(np.pi*n/q)*q
-
-# ---------- mel-computations ----------
-
-
-# ---------- compute-fbank-feats ----------
-
-def compute_fbank_feats(
- waveform,
- blackman_coeff=0.42,
- dither=1.0,
- energy_floor=1.0,
- frame_length=25,
- frame_shift=10,
- high_freq=0,
- low_freq=20,
- num_mel_bins=23,
- preemphasis_coefficient=0.97,
- raw_energy=True,
- remove_dc_offset=True,
- round_to_power_of_two=True,
- sample_frequency=16000,
- snip_edges=True,
- use_energy=False,
- use_log_fbank=True,
- use_power=True,
- window_type='povey',
- dtype=np.float32):
- """ Compute (log) Mel filter bank energies
-
- :param waveform: Input waveform.
- :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42)
- :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
- :param energy_floor: Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0)
- :param frame_length: Frame length in milliseconds (float, default = 25)
- :param frame_shift: Frame shift in milliseconds (float, default = 10)
- :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
- :param low_freq: Low cutoff frequency for mel bins (float, default = 20)
- :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23)
- :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97)
- :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true)
- :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true)
- :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
- :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
- :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
- :param use_energy: Add an extra energy output. (bool, default = false)
- :param use_log_fbank: If true, produce log-filterbank, else produce linear. (bool, default = true)
- :param use_power: If true, use power, else use magnitude. (bool, default = true)
- :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
- :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32)
- :return: (Log) Mel filter bank energies.
- """
- window_size = int(frame_length * sample_frequency * 0.001)
- window_shift = int(frame_shift * sample_frequency * 0.001)
- frames, log_energy = extract_window(
- waveform=waveform,
- blackman_coeff=blackman_coeff,
- dither=dither,
- window_size=window_size,
- window_shift=window_shift,
- preemphasis_coefficient=preemphasis_coefficient,
- raw_energy=raw_energy,
- remove_dc_offset=remove_dc_offset,
- snip_edges=snip_edges,
- window_type=window_type,
- dtype=dtype
- )
- if round_to_power_of_two:
- n = 1
- while n < window_size:
- n *= 2
- else:
- n = window_size
- if use_power:
- spectrum = compute_power_spectrum(frames, n)
- else:
- spectrum = compute_spectrum(frames, n)
- mel_banks = compute_mel_banks(
- num_bins=num_mel_bins,
- sample_frequency=sample_frequency,
- low_freq=low_freq,
- high_freq=high_freq,
- n=n
- ).astype(dtype)
- feat = np.dot(spectrum, mel_banks.T)
- if use_log_fbank:
- feat = np.log(feat.clip(min=np.finfo(dtype).eps))
- if use_energy:
- if energy_floor > 0.0:
- log_energy.clip(min=np.math.log(energy_floor))
- return feat, log_energy
- return feat
-
-# ---------- compute-fbank-feats ----------
-
-
-# ---------- compute-mfcc-feats ----------
-
-def compute_mfcc_feats(
- waveform,
- blackman_coeff=0.42,
- cepstral_lifter=22,
- dither=1.0,
- energy_floor=0.0,
- frame_length=25,
- frame_shift=10,
- high_freq=0,
- low_freq=20,
- num_ceps=13,
- num_mel_bins=23,
- preemphasis_coefficient=0.97,
- raw_energy=True,
- remove_dc_offset=True,
- round_to_power_of_two=True,
- sample_frequency=16000,
- snip_edges=True,
- use_energy=True,
- window_type='povey',
- dtype=np.float32):
- """ Compute mel-frequency cepstral coefficients
-
- :param waveform: Input waveform.
- :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42)
- :param cepstral_lifter: Constant that controls scaling of MFCCs (float, default = 22)
- :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1)
- :param energy_floor: Floor on energy (absolute, not relative) in MFCC computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0)
- :param frame_length: Frame length in milliseconds (float, default = 25)
- :param frame_shift: Frame shift in milliseconds (float, default = 10)
- :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
- :param low_freq: Low cutoff frequency for mel bins (float, default = 20)
- :param num_ceps: Number of cepstra in MFCC computation (including C0) (int, default = 13)
- :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23)
- :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97)
- :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true)
- :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true)
- :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
- :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
- :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
- :param use_energy: Use energy (not C0) in MFCC computation (bool, default = true)
- :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey")
- :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32)
- :return: Mel-frequency cespstral coefficients.
- """
- feat, log_energy = compute_fbank_feats(
- waveform=waveform,
- blackman_coeff=blackman_coeff,
- dither=dither,
- energy_floor=energy_floor,
- frame_length=frame_length,
- frame_shift=frame_shift,
- high_freq=high_freq,
- low_freq=low_freq,
- num_mel_bins=num_mel_bins,
- preemphasis_coefficient=preemphasis_coefficient,
- raw_energy=raw_energy,
- remove_dc_offset=remove_dc_offset,
- round_to_power_of_two=round_to_power_of_two,
- sample_frequency=sample_frequency,
- snip_edges=snip_edges,
- use_energy=use_energy,
- use_log_fbank=True,
- use_power=True,
- window_type=window_type,
- dtype=dtype
- )
- feat = dct(feat, type=2, axis=1, norm='ortho')[:, :num_ceps]
- lifter_coeffs = compute_lifter_coeffs(cepstral_lifter, num_ceps).astype(dtype)
- feat = feat * lifter_coeffs
- if use_energy:
- feat[:, 0] = log_energy
- return feat
-
-# ---------- compute-mfcc-feats ----------
-
-
-# ---------- apply-cmvn-sliding ----------
-
-def apply_cmvn_sliding(feat, center=False, window=600, min_window=100, norm_vars=False):
- """ Apply sliding-window cepstral mean (and optionally variance) normalization
-
- :param feat: Cepstrum.
- :param center: If true, use a window centered on the current frame (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false)
- :param window: Window in frames for running average CMN computation (int, default = 600)
- :param min_window: Minimum CMN window used at start of decoding (adds latency only at start). Only applicable if center == false, ignored if center==true (int, default = 100)
- :param norm_vars: If true, normalize variance to one. (bool, default = false)
- :return: Normalized cepstrum.
- """
- # double-precision
- feat = apply_cmvn_sliding_internal(
- feat=feat.astype(np.float64),
- center=center,
- window=window,
- min_window=min_window,
- norm_vars=norm_vars
- ).astype(feat.dtype)
- return feat
-
-# ---------- apply-cmvn-sliding ----------
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/ivector.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/ivector.py
deleted file mode 100644
index 5577be1..0000000
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/kaldifeat/ivector.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import numpy as np
-
-from .feature import sliding_window
-
-
-# ---------- compute-vad ----------
-
-def compute_vad(log_energy, energy_mean_scale=0.5, energy_threshold=0.5, frames_context=0, proportion_threshold=0.6):
- """ Apply voice activity detection
-
- :param log_energy: Log mel energy.
- :param energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5)
- :param energy_threshold: Constant term in energy threshold for VAD (also see energy_mean_scale) (float, default = 5)
- :param frames_context: Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0)
- :param proportion_threshold: Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6)
- :return: A vector of boolean that are True if we judge the frame voiced and False otherwise.
- """
- assert len(log_energy.shape) == 1
- assert energy_mean_scale >= 0
- assert frames_context >= 0
- assert 0 < proportion_threshold < 1
- dtype = log_energy.dtype
- energy_threshold += energy_mean_scale * log_energy.mean()
- if frames_context > 0:
- num_frames = len(log_energy)
- window_size = frames_context * 2 + 1
- log_energy_pad = np.concatenate([
- np.zeros(frames_context, dtype=dtype),
- log_energy,
- np.zeros(frames_context, dtype=dtype)
- ])
- log_energy_window = sliding_window(log_energy_pad, window_size, 1)
- num_count = np.count_nonzero(log_energy_window > energy_threshold, axis=1)
- den_count = np.ones(num_frames, dtype=dtype) * window_size
- max_den_count = np.arange(frames_context + 1, min(window_size, num_frames) + 1, dtype=dtype)
- den_count[:-(frames_context + 2):-1] = max_den_count
- den_count[:frames_context + 1] = np.min([den_count[:frames_context + 1], max_den_count], axis=0)
- vad = num_count / den_count >= proportion_threshold
- else:
- vad = log_energy > energy_threshold
- return vad
-
-# ---------- compute-vad ----------
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py
index f8fe063..b6ff0cd 100644
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py
@@ -10,9 +10,10 @@
import numpy as np
from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
- OrtInferSession, TokenIDConverter, WavFrontend, get_logger,
+ OrtInferSession, TokenIDConverter, get_logger,
read_yaml)
from .utils.postprocess_utils import sentence_postprocess
+from .utils.frontend import WavFrontend
logging = get_logger()
@@ -65,7 +66,7 @@
wav_content: Union[str, np.ndarray, List[str]]) -> List:
def load_wav(path: str) -> np.ndarray:
waveform, _ = librosa.load(path, sr=None)
- return waveform[None, ...]
+ return waveform
if isinstance(wav_content, np.ndarray):
return [wav_content]
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt
index ed40eb4..aeb085a 100644
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt
@@ -2,4 +2,5 @@
numpy
onnxruntime
scipy
-typeguard
\ No newline at end of file
+typeguard
+kaldi-native-fbank
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py
new file mode 100644
index 0000000..f973f65
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py
@@ -0,0 +1,189 @@
+# -*- encoding: utf-8 -*-
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
+
+import numpy as np
+from typeguard import check_argument_types
+import kaldi_native_fbank as knf
+
+root_dir = Path(__file__).resolve().parent
+
+logger_initialized = {}
+
+
+class WavFrontend():
+    """Conventional frontend structure for ASR.
+
+    Extracts filterbank features with ``kaldi_native_fbank`` (``fbank``) and
+    optionally stacks frames (LFR) and applies CMVN (``lfr_cmvn``).
+    """
+
+    def __init__(
+            self,
+            cmvn_file: str = None,
+            fs: int = 16000,
+            window: str = 'hamming',
+            n_mels: int = 80,
+            frame_length: float = 25.0,
+            frame_shift: int = 10,
+            filter_length_min: int = -1,
+            filter_length_max: float = -1,
+            lfr_m: int = 1,
+            lfr_n: int = 1,
+            dither: float = 1.0
+    ) -> None:
+        """Configure the fbank extractor and load CMVN statistics if given.
+
+        :param cmvn_file: Path of the CMVN text file read by ``load_cmvn``;
+            when empty or None no CMVN is loaded or applied.
+        :param fs: Sample frequency handed to the fbank extractor.
+        :param window: Window type name handed to the fbank extractor.
+        :param n_mels: Number of mel bins.
+        :param frame_length: Frame length in milliseconds. Annotated as float
+            because the default is 25.0: ``check_argument_types()`` also
+            validates defaults, so an ``int`` annotation rejected the default.
+        :param frame_shift: Frame shift in milliseconds.
+        :param filter_length_min: Stored on the instance; not used by this class.
+        :param filter_length_max: Stored on the instance; not used by this class.
+        :param lfr_m: Number of consecutive frames stacked into one LFR frame.
+        :param lfr_n: Step, in frames, between successive LFR frames.
+        :param dither: Dither value handed to the fbank extractor.
+        """
+        check_argument_types()
+
+        opts = knf.FbankOptions()
+        opts.frame_opts.samp_freq = fs
+        opts.frame_opts.dither = dither
+        opts.frame_opts.window_type = window
+        opts.frame_opts.frame_shift_ms = float(frame_shift)
+        opts.frame_opts.frame_length_ms = float(frame_length)
+        opts.mel_opts.num_bins = n_mels
+        opts.energy_floor = 0
+        opts.frame_opts.snip_edges = True
+        opts.mel_opts.debug_mel = False
+        self.opts = opts
+
+        # Kept for backward compatibility; fbank() builds its own extractor
+        # per call because OnlineFbank accumulates everything it is fed.
+        self.compute_fbank_feats = knf.OnlineFbank(self.opts)
+
+        self.filter_length_min = filter_length_min
+        self.filter_length_max = filter_length_max
+        self.lfr_m = lfr_m
+        self.lfr_n = lfr_n
+        self.cmvn_file = cmvn_file
+
+        if self.cmvn_file:
+            self.cmvn = self.load_cmvn()
+
+    def fbank(self,
+              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        """Compute filterbank features for a single waveform.
+
+        :param waveform: Sample array, scaled by ``1 << 15`` before extraction
+            (presumably 1-D float audio in [-1, 1] -- confirm with callers).
+        :return: ``(feat, feat_len)``: float32 features of shape
+            ``(num_frames, n_mels)`` and the frame count as an int32 0-d array.
+        """
+        waveform = waveform * (1 << 15)
+        # A fresh extractor per call: the shared OnlineFbank kept the samples
+        # of earlier calls, so later utterances got earlier frames prepended.
+        fbank_fn = knf.OnlineFbank(self.opts)
+        fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
+        frames = fbank_fn.num_frames_ready
+        mat = np.empty([frames, self.opts.mel_opts.num_bins])
+        for i in range(frames):
+            mat[i, :] = fbank_fn.get_frame(i)
+        feat = mat.astype(np.float32)
+        feat_len = np.array(mat.shape[0]).astype(np.int32)
+        return feat, feat_len
+
+    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        """Apply LFR stacking and CMVN to ``feat`` where configured.
+
+        :param feat: Feature matrix of shape ``(num_frames, dim)``.
+        :return: ``(feat, feat_len)``: the processed features and their frame
+            count as an int32 0-d array.
+        """
+        # Skipped when both LFR factors are 1.
+        if self.lfr_m != 1 or self.lfr_n != 1:
+            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
+
+        if self.cmvn_file:
+            feat = self.apply_cmvn(feat)
+
+        feat_len = np.array(feat.shape[0]).astype(np.int32)
+        return feat, feat_len
+
+    @staticmethod
+    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
+        """Stack ``lfr_m`` consecutive frames with a stride of ``lfr_n`` frames.
+
+        The first frame is repeated ``(lfr_m - 1) // 2`` times on the left and
+        the last frame is repeated on the right wherever a window runs past
+        the end, giving ``ceil(T / lfr_n)`` output frames.
+
+        :param inputs: Feature matrix of shape ``(T, dim)`` with ``T >= 1``.
+        :return: float32 array of shape ``(ceil(T / lfr_n), lfr_m * dim)``.
+        """
+        num_in = inputs.shape[0]
+        num_out = int(np.ceil(num_in / lfr_n))
+        left_pad = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
+        padded = np.vstack((left_pad, inputs))
+        stacked = []
+        for start in range(0, num_out * lfr_n, lfr_n):
+            window = padded[start:start + lfr_m]
+            missing = lfr_m - window.shape[0]
+            if missing > 0:
+                # Window runs past the end: repeat the final frame to fill it.
+                window = np.vstack((window, np.tile(padded[-1], (missing, 1))))
+            stacked.append(window.reshape(1, -1))
+        return np.vstack(stacked).astype(np.float32)
+
+    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
+        """Apply CMVN with mvn data.
+
+        Computes ``(inputs + cmvn[0]) * cmvn[1]`` per feature dimension, so
+        row 0 of ``self.cmvn`` acts as an additive shift and row 1 as a scale.
+
+        :param inputs: Feature matrix of shape ``(num_frames, dim)``.
+        :return: Normalised features with the same shape as ``inputs``.
+        """
+        _, dim = inputs.shape
+        # The (1, dim) rows broadcast over all frames.
+        shift = self.cmvn[0:1, :dim]
+        scale = self.cmvn[1:2, :dim]
+        return (inputs + shift) * scale
+
+    def load_cmvn(self,) -> np.ndarray:
+        """Parse ``self.cmvn_file`` into a ``(2, dim)`` float64 array.
+
+        The values are read from the ``<LearnRateCoef>`` line directly after an
+        ``<AddShift>`` line (row 0) and after a ``<Rescale>`` line (row 1); the
+        first three tokens and the last token of that line are dropped
+        (presumably a header and the closing bracket -- confirm against a real
+        file). Blank lines and a marker on the final line are skipped instead
+        of raising IndexError.
+
+        :return: Array whose row 0 holds the shifts and row 1 the scales.
+        """
+        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
+            rows = [line.split() for line in f]
+
+        means_list = []
+        vars_list = []
+        # Pair every line with its successor; the final line has none.
+        for tokens, following in zip(rows, rows[1:]):
+            if not tokens or not following:
+                continue
+            if following[0] != '<LearnRateCoef>':
+                continue
+            if tokens[0] == '<AddShift>':
+                means_list = following[3:-1]
+            elif tokens[0] == '<Rescale>':
+                vars_list = following[3:-1]
+
+        means = np.array(means_list).astype(np.float64)
+        scales = np.array(vars_list).astype(np.float64)
+        cmvn = np.array([means, scales])
+        return cmvn
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py
index aa9d665..8e220e0 100644
--- a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py
@@ -13,7 +13,6 @@
SessionOptions, get_available_providers, get_device)
from typeguard import check_argument_types
-from funasr.runtime.python.onnxruntime.paraformer.rapid_paraformer.kaldifeat import compute_fbank_feats
import warnings
root_dir = Path(__file__).resolve().parent
@@ -120,128 +119,6 @@
f")"
)
-
-class WavFrontend():
- """Conventional frontend structure for ASR.
- """
-
- def __init__(
- self,
- cmvn_file: str = None,
- fs: int = 16000,
- window: str = 'hamming',
- n_mels: int = 80,
- frame_length: int = 25,
- frame_shift: int = 10,
- filter_length_min: int = -1,
- filter_length_max: float = -1,
- lfr_m: int = 1,
- lfr_n: int = 1,
- dither: float = 1.0
- ) -> None:
- check_argument_types()
-
- self.fs = fs
- self.window = window
- self.n_mels = n_mels
- self.frame_length = frame_length
- self.frame_shift = frame_shift
- self.filter_length_min = filter_length_min
- self.filter_length_max = filter_length_max
- self.lfr_m = lfr_m
- self.lfr_n = lfr_n
- self.cmvn_file = cmvn_file
- self.dither = dither
-
- if self.cmvn_file:
- self.cmvn = self.load_cmvn()
-
- def fbank(self,
- input_content: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
- waveform_len = input_content.shape[1]
- waveform = input_content[0][:waveform_len]
- waveform = waveform * (1 << 15)
- mat = compute_fbank_feats(waveform,
- num_mel_bins=self.n_mels,
- frame_length=self.frame_length,
- frame_shift=self.frame_shift,
- dither=self.dither,
- energy_floor=0.0,
- sample_frequency=self.fs,
- window_type=self.window)
- feat = mat.astype(np.float32)
- feat_len = np.array(mat.shape[0]).astype(np.int32)
- return feat, feat_len
-
- def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
- if self.lfr_m != 1 or self.lfr_n != 1:
- feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
-
- if self.cmvn_file:
- feat = self.apply_cmvn(feat)
-
- feat_len = np.array(feat.shape[0]).astype(np.int32)
- return feat, feat_len
-
- @staticmethod
- def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
- LFR_inputs = []
-
- T = inputs.shape[0]
- T_lfr = int(np.ceil(T / lfr_n))
- left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
- inputs = np.vstack((left_padding, inputs))
- T = T + (lfr_m - 1) // 2
- for i in range(T_lfr):
- if lfr_m <= T - i * lfr_n:
- LFR_inputs.append(
- (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
- else:
- # process last LFR frame
- num_padding = lfr_m - (T - i * lfr_n)
- frame = inputs[i * lfr_n:].reshape(-1)
- for _ in range(num_padding):
- frame = np.hstack((frame, inputs[-1]))
-
- LFR_inputs.append(frame)
- LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
- return LFR_outputs
-
- def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
- """
- Apply CMVN with mvn data
- """
- frame, dim = inputs.shape
- means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
- vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
- inputs = (inputs + means) * vars
- return inputs
-
- def load_cmvn(self,) -> np.ndarray:
- with open(self.cmvn_file, 'r', encoding='utf-8') as f:
- lines = f.readlines()
-
- means_list = []
- vars_list = []
- for i in range(len(lines)):
- line_item = lines[i].split()
- if line_item[0] == '<AddShift>':
- line_item = lines[i + 1].split()
- if line_item[0] == '<LearnRateCoef>':
- add_shift_line = line_item[3:(len(line_item) - 1)]
- means_list = list(add_shift_line)
- continue
- elif line_item[0] == '<Rescale>':
- line_item = lines[i + 1].split()
- if line_item[0] == '<LearnRateCoef>':
- rescale_line = line_item[3:(len(line_item) - 1)]
- vars_list = list(rescale_line)
- continue
-
- means = np.array(means_list).astype(np.float64)
- vars = np.array(vars_list).astype(np.float64)
- cmvn = np.array([means, vars])
- return cmvn
class Hypothesis(NamedTuple):
--
Gitblit v1.9.1