游雁
2023-08-30 c2e4e3c2e9be855277d9f4fa9cd0544892ff829a
funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py
@@ -4,7 +4,6 @@
import copy
import numpy as np
from typeguard import check_argument_types
import kaldi_native_fbank as knf
root_dir = Path(__file__).resolve().parent
@@ -29,7 +28,6 @@
            dither: float = 1.0,
            **kwargs,
    ) -> None:
        check_argument_types()
        opts = knf.FbankOptions()
        opts.frame_opts.samp_freq = fs
@@ -217,7 +215,7 @@
        frame_num = self.compute_frame_num(input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length)
        # update self.in_cache
        self.input_cache = input[:, -(input.shape[-1] - frame_num * self.frame_shift_sample_length):]
        waveforms = np.empty(0, dtype=np.int16)
        waveforms = np.empty(0, dtype=np.float32)
        feats_pad = np.empty(0, dtype=np.float32)
        feats_lens = np.empty(0, dtype=np.int32)
        if frame_num:
@@ -237,7 +235,7 @@
                    mat[i, :] = self.fbank_fn.get_frame(i)
                feat = mat.astype(np.float32)
                feat_len = np.array(mat.shape[0]).astype(np.int32)
                feats.append(mat)
                feats.append(feat)
                feats_lens.append(feat_len)
            waveforms = np.stack(waveforms)
@@ -351,6 +349,28 @@
    return array
class SinusoidalPositionEncoderOnline():
    """Streaming sinusoidal positional encoding.

    Produces the classic sin/cos position embedding, offset by a running
    ``start_idx`` so that successive chunks of a stream receive continuous
    positions.
    """

    def encode(self, positions: np.ndarray = None, depth: int = None, dtype: np.dtype = np.float32):
        """Build the (1, len(positions), depth) sin/cos encoding table.

        NOTE(review): the intermediate reshape uses ``positions.shape[0]``,
        which only works when positions has a leading batch dim of 1 — the
        shape ``forward`` always passes.
        """
        num_batches = positions.shape[0]
        positions = positions.astype(dtype)
        # Geometric progression of timescales, from 1 up to 10000.
        log_increment = np.log(np.array([10000], dtype=dtype)) / (depth / 2 - 1)
        inv_scales = np.exp(np.arange(depth / 2).astype(dtype) * (-log_increment))
        inv_scales = np.reshape(inv_scales, [num_batches, -1])
        # phase[b, t, k] = position_t * inv_scale_k
        phase = np.reshape(positions, [1, -1, 1]) * np.reshape(inv_scales, [1, 1, -1])
        # First half of the feature dim is sin, second half is cos.
        table = np.concatenate((np.sin(phase), np.cos(phase)), axis=2)
        return table.astype(dtype)

    def forward(self, x, start_idx=0):
        """Add positional encoding to ``x`` (batch, time, dim), starting at ``start_idx``."""
        _, num_frames, feat_dim = x.shape
        # Positions are 1-based and continue across chunks via start_idx.
        frame_positions = np.arange(1, num_frames + 1 + start_idx)[None, :]
        pe = self.encode(frame_positions, feat_dim, x.dtype)
        return x + pe[:, start_idx: start_idx + num_frames]
def test():
    path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
    import librosa