speech_asr
2023-03-15 f691014c8a97f2ea27dc72c9d3b374bdd05aa6c9
funasr/models/frontend/wav_frontend.py
@@ -11,6 +11,8 @@
import funasr.models.frontend.eend_ola_feature as eend_ola_feature
from funasr.models.frontend.abs_frontend import AbsFrontend
from modelscope.utils.logger import get_logger
# Module-level logger, created via the get_logger helper imported from
# modelscope.utils.logger above.
logger = get_logger()
def load_cmvn(cmvn_file):
    with open(cmvn_file, 'r', encoding='utf-8') as f:
@@ -485,13 +487,16 @@
        batch_size = input.size(0)
        feats = []
        feats_lens = []
        logger.info("batch_size: {}".format(batch_size))
        logger.info("input: {}".format(input))
        logger.info("input_lengths: {}".format(input_lengths))
        for i in range(batch_size):
            waveform_length = input_lengths[i]
            waveform = input[i][:waveform_length]
            waveform = waveform.unsqueeze(0).numpy()
            waveform = waveform.numpy()
            mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift)
            mat = eend_ola_feature.transform(mat)
            mat = mat.splice(mat, context_size=self.lfr_m)
            mat = eend_ola_feature.splice(mat, context_size=self.lfr_m)
            mat = mat[::self.lfr_n]
            mat = torch.from_numpy(mat)
            feat_length = mat.size(0)