游雁
2023-08-30 c2e4e3c2e9be855277d9f4fa9cd0544892ff829a
funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py
@@ -4,7 +4,6 @@
import copy
import numpy as np
from typeguard import check_argument_types
import kaldi_native_fbank as knf
root_dir = Path(__file__).resolve().parent
@@ -29,7 +28,6 @@
            dither: float = 1.0,
            **kwargs,
    ) -> None:
        check_argument_types()
        opts = knf.FbankOptions()
        opts.frame_opts.samp_freq = fs
@@ -217,7 +215,7 @@
        frame_num = self.compute_frame_num(input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length)
        # update self.in_cache
        self.input_cache = input[:, -(input.shape[-1] - frame_num * self.frame_shift_sample_length):]
        waveforms = np.empty(0, dtype=np.int16)
        waveforms = np.empty(0, dtype=np.float32)
        feats_pad = np.empty(0, dtype=np.float32)
        feats_lens = np.empty(0, dtype=np.int32)
        if frame_num:
@@ -237,7 +235,7 @@
                    mat[i, :] = self.fbank_fn.get_frame(i)
                feat = mat.astype(np.float32)
                feat_len = np.array(mat.shape[0]).astype(np.int32)
                feats.append(mat)
                feats.append(feat)
                feats_lens.append(feat_len)
            waveforms = np.stack(waveforms)
@@ -351,6 +349,28 @@
    return array
class SinusoidalPositionEncoderOnline():
    """Streaming sinusoidal positional encoding.

    Produces the classic sin/cos position embedding, offset by a running
    ``start_idx`` so that successive chunks of a stream receive continuous
    positions.
    """

    def encode(self, positions: np.ndarray = None, depth: int = None, dtype: np.dtype = np.float32):
        """Build the (1, len(positions), depth) sin/cos encoding table.

        NOTE(review): the intermediate reshape uses ``positions.shape[0]``,
        which only works when positions has a leading batch dim of 1 — the
        shape ``forward`` always passes.
        """
        num_batches = positions.shape[0]
        positions = positions.astype(dtype)
        # Geometric progression of timescales, from 1 up to 10000.
        log_increment = np.log(np.array([10000], dtype=dtype)) / (depth / 2 - 1)
        inv_scales = np.exp(np.arange(depth / 2).astype(dtype) * (-log_increment))
        inv_scales = np.reshape(inv_scales, [num_batches, -1])
        # phase[b, t, k] = position_t * inv_scale_k
        phase = np.reshape(positions, [1, -1, 1]) * np.reshape(inv_scales, [1, 1, -1])
        # First half of the feature dim is sin, second half is cos.
        table = np.concatenate((np.sin(phase), np.cos(phase)), axis=2)
        return table.astype(dtype)

    def forward(self, x, start_idx=0):
        """Add positional encoding to ``x`` (batch, time, dim), starting at ``start_idx``."""
        _, num_frames, feat_dim = x.shape
        # Positions are 1-based and continue across chunks via start_idx.
        frame_positions = np.arange(1, num_frames + 1 + start_idx)[None, :]
        pe = self.encode(frame_positions, feat_dim, x.dtype)
        return x + pe[:, start_idx: start_idx + num_frames]
def test():
    path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
    import librosa