python/FunASR-XL.git

			@@ -1,14 +1,13 @@
			#!/usr/bin/env python3
			# -- encoding: utf-8 --
			# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
			# MIT License (https://opensource.org/licenses/MIT)

			import logging
			from typing import Dict
			from typing import List
			from typing import Optional
			from typing import Tuple
			from typing import Union
			import tempfile
			import codecs
			import requests
			import re
			import copy
			import torch
			import torch.nn as nn
			@@ -23,7 +22,7 @@

			from funasr.models.paraformer.search import Hypothesis

			from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
			from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
			from funasr.utils import postprocess_utils
			from funasr.utils.datadir_writer import DatadirWriter
			from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
			@@ -243,7 +242,7 @@
			else:
			# extract fbank feats
			time1 = time.perf_counter()
			audio_sample_list = load_audio_and_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
			audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
			time2 = time.perf_counter()
			meta_data["load_data"] = f"{time2 - time1:0.3f}"
			speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
			@@ -252,7 +251,8 @@
			meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
			meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000

			speech.to(device=kwargs["device"]), speech_lengths.to(device=kwargs["device"])
			speech = speech.to(device=kwargs["device"])
			speech_lengths = speech_lengths.to(device=kwargs["device"])

			# Encoder
			encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)