python/FunASR-XL.git

			@@ -4,15 +4,16 @@
			from torch.cuda.amp import autocast
			from typing import Union, Dict, List, Tuple, Optional

			from funasr.register import tables
			from funasr.utils import postprocess_utils
			from funasr.utils.datadir_writer import DatadirWriter
			from funasr.models.paraformer.cif_predictor import mae_loss
			from funasr.train_utils.device_funcs import force_gatherable
			from funasr.models.transformer.utils.add_sos_eos import add_sos_eos
			from funasr.models.transformer.utils.nets_utils import make_pad_mask
			from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
			from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank, load_audio_and_text_image_video
			from funasr.utils import postprocess_utils
			from funasr.utils.datadir_writer import DatadirWriter
			from funasr.register import tables
			from funasr.models.ctc.ctc import CTC
			from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank


			@tables.register("model_classes", "monotonicaligner")
			@@ -153,7 +154,7 @@
			meta_data = {}
			# extract fbank feats
			time1 = time.perf_counter()
			audio_list, text_token_int_list = load_audio_and_text_image_video(data_in,
			audio_list, text_token_int_list = load_audio_text_image_video(data_in,
			fs=frontend.fs,
			audio_fs=kwargs.get("fs", 16000),
			data_type=kwargs.get("data_type", "sound"),
			@@ -165,7 +166,8 @@
			meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
			meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000

			speech.to(device=kwargs["device"]), speech_lengths.to(device=kwargs["device"])
			speech = speech.to(device=kwargs["device"])
			speech_lengths = speech_lengths.to(device=kwargs["device"])

			# Encoder
			encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
			@@ -187,9 +189,12 @@
			text_postprocessed, time_stamp_postprocessed, _ = postprocess_utils.sentence_postprocess(token, timestamp)
			result_i = {"key": key[i], "text": text_postprocessed,
			"timestamp": time_stamp_postprocessed,
			}
			# ibest_writer["token"][key[i]] = " ".join(token)
			ibest_writer["timestamp_list"][key[i]] = time_stamp_postprocessed
			ibest_writer["timestamp_str"][key[i]] = timestamp_str
			}
			results.append(result_i)

			if ibest_writer:
			# ibest_writer["token"][key[i]] = " ".join(token)
			ibest_writer["timestamp_list"][key[i]] = time_stamp_postprocessed
			ibest_writer["timestamp_str"][key[i]] = timestamp_str

			return results, meta_data