python/FunASR-XL.git

New file
			@@ -0,0 +1,51 @@
			# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
			# Licensed under the MIT license.
			#
			# This module is for computing audio features

			import librosa
			import numpy as np


			def transform(Y, dtype=np.float32, sr=8000, n_mels=23):
			    """Convert an STFT matrix into mean-normalized log-mel features.

			    Args:
			        Y: STFT array indexed as (frames, frequency bins). The FFT size is
			            recovered from the bin count as ``2 * (Y.shape[1] - 1)``.
			        dtype: dtype of the returned array.
			        sr: sampling rate in Hz passed to the mel filterbank.
			        n_mels: number of mel bands in the filterbank.

			    Returns:
			        Array of shape (frames, n_mels) holding log10 mel power with the
			        per-band mean over all frames subtracted, cast to ``dtype``.
			    """
			    magnitude = np.abs(Y)
			    n_fft = 2 * (magnitude.shape[1] - 1)
			    # Keyword arguments are required by librosa >= 0.10 (keyword-only
			    # signature) and are also accepted by older releases.
			    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
			    mel_power = np.dot(magnitude ** 2, mel_basis.T)
			    # Floor the power so that log10 never sees zero.
			    log_mel = np.log10(np.maximum(mel_power, 1e-10))
			    log_mel = log_mel - np.mean(log_mel, axis=0)
			    return log_mel.astype(dtype)


			def subsample(Y, T, subsampling=1):
			    """Keep every ``subsampling``-th entry of both Y and T.

			    Args:
			        Y: sliceable sequence (e.g. a feature array).
			        T: sliceable sequence subsampled with the same step as Y.
			        subsampling: step between retained entries; 1 keeps everything.

			    Returns:
			        Tuple ``(Y_ss, T_ss)`` of the strided slices of Y and T.
			    """
			    every_nth = slice(None, None, subsampling)
			    return Y[every_nth], T[every_nth]


			def splice(Y, context_size=0):
			    """Concatenate each frame with its neighbouring context frames.

			    The input is zero-padded by ``context_size`` rows at both ends of the
			    first axis, then every output row t is the flattened window of
			    ``2 * context_size + 1`` consecutive padded rows starting at row t.

			    Args:
			        Y: 2-D array indexed as (frames, feature dims).
			        context_size: number of frames taken on each side of a frame.

			    Returns:
			        Read-only strided view of shape
			        (frames, dims * (2 * context_size + 1)) over the padded copy.
			    """
			    n_frames, n_dims = Y.shape[0], Y.shape[1]
			    window = 2 * context_size + 1
			    padded = np.ascontiguousarray(
			        np.pad(Y, [(context_size, context_size), (0, 0)], 'constant'))
			    # Advancing one output row moves one input frame forward, so
			    # consecutive rows overlap in memory instead of being copied.
			    row_stride = Y.itemsize * n_dims
			    return np.lib.stride_tricks.as_strided(
			        padded,
			        shape=(n_frames, n_dims * window),
			        strides=(row_stride, Y.itemsize),
			        writeable=False)


			def stft(
			        data,
			        frame_size=1024,
			        frame_shift=256):
			    """Compute the STFT of ``data`` with frames along the first axis.

			    Args:
			        data: input signal passed to ``librosa.stft``.
			        frame_size: window length in samples; the FFT size is the
			            smallest power of two that is >= ``frame_size``.
			        frame_shift: hop length in samples.

			    Returns:
			        Transposed ``librosa.stft`` output. When ``len(data)`` is an exact
			        multiple of ``frame_shift`` the final frame is dropped.
			    """
			    # Round the window length up to the next power of two.
			    fft_size = 1 << (frame_size - 1).bit_length()
			    drop_last = len(data) % frame_shift == 0
			    frames = librosa.stft(
			        data,
			        n_fft=fft_size,
			        win_length=frame_size,
			        hop_length=frame_shift).T
			    if drop_last:
			        return frames[:-1]
			    return frames