From c2e4e3c2e9be855277d9f4fa9cd0544892ff829a Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Wed, 30 Aug 2023 09:57:30 +0800
Subject: [PATCH] Merge branch 'main' of github.com:alibaba-damo-academy/FunASR add

---
 funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py |   28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py
index c92db4e..295e7b5 100644
--- a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py
@@ -4,7 +4,6 @@
 import copy
 
 import numpy as np
-from typeguard import check_argument_types
 import kaldi_native_fbank as knf
 
 root_dir = Path(__file__).resolve().parent
@@ -29,7 +28,6 @@
             dither: float = 1.0,
             **kwargs,
     ) -> None:
-        check_argument_types()
 
         opts = knf.FbankOptions()
         opts.frame_opts.samp_freq = fs
@@ -217,7 +215,7 @@
         frame_num = self.compute_frame_num(input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length)
         # update self.in_cache
         self.input_cache = input[:, -(input.shape[-1] - frame_num * self.frame_shift_sample_length):]
-        waveforms = np.empty(0, dtype=np.int16)
+        waveforms = np.empty(0, dtype=np.float32)
         feats_pad = np.empty(0, dtype=np.float32)
         feats_lens = np.empty(0, dtype=np.int32)
         if frame_num:
@@ -237,7 +235,7 @@
                     mat[i, :] = self.fbank_fn.get_frame(i)
                 feat = mat.astype(np.float32)
                 feat_len = np.array(mat.shape[0]).astype(np.int32)
-                feats.append(mat)
+                feats.append(feat)
                 feats_lens.append(feat_len)
 
             waveforms = np.stack(waveforms)
@@ -351,6 +349,28 @@
     return array
 
 
+class SinusoidalPositionEncoderOnline():
+    '''Streaming Positional encoding.
+    '''
+
+    def encode(self, positions: np.ndarray = None, depth: int = None, dtype: np.dtype = np.float32):
+        batch_size = positions.shape[0]
+        positions = positions.astype(dtype)
+        log_timescale_increment = np.log(np.array([10000], dtype=dtype)) / (depth / 2 - 1)
+        inv_timescales = np.exp(np.arange(depth / 2).astype(dtype) * (-log_timescale_increment))
+        inv_timescales = np.reshape(inv_timescales, [batch_size, -1])
+        scaled_time = np.reshape(positions, [1, -1, 1]) * np.reshape(inv_timescales, [1, 1, -1])
+        encoding = np.concatenate((np.sin(scaled_time), np.cos(scaled_time)), axis=2)
+        return encoding.astype(dtype)
+
+    def forward(self, x, start_idx=0):
+        batch_size, timesteps, input_dim = x.shape
+        positions = np.arange(1, timesteps+1+start_idx)[None, :]
+        position_encoding = self.encode(positions, input_dim, x.dtype)
+
+        return x + position_encoding[:, start_idx: start_idx + timesteps]
+
+
 def test():
     path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
     import librosa

--
Gitblit v1.9.1