python/FunASR-XL.git

			@@ -5,17 +5,15 @@
			from typing import Union, Dict, List, Tuple, Optional

			from funasr.models.paraformer.cif_predictor import mae_loss
			from funasr.models.transformer.utils.add_sos_eos import add_sos_eos
			from funasr.models.transformer.utils.nets_utils import make_pad_mask, pad_list
			from funasr.metrics.compute_acc import th_accuracy
			from funasr.train_utils.device_funcs import force_gatherable
			from funasr.models.transformer.utils.add_sos_eos import add_sos_eos
			from funasr.models.transformer.utils.nets_utils import make_pad_mask
			from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
			from funasr.utils import postprocess_utils
			from funasr.utils.datadir_writer import DatadirWriter
			from funasr.register import tables
			from funasr.models.ctc.ctc import CTC
			from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank, load_audio_and_text_image_video

			from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank


			@tables.register("model_classes", "monotonicaligner")
			@@ -25,7 +23,6 @@
			Achieving timestamp prediction while recognizing with non-autoregressive end-to-end ASR model
			https://arxiv.org/abs/2301.12343
			"""

			def __init__(
			self,
			input_size: int = 80,
			@@ -41,7 +38,6 @@
			length_normalized_loss: bool = False,
			**kwargs,
			):

			super().__init__()

			if specaug is not None:
			@@ -155,11 +151,10 @@
			frontend=None,
			**kwargs,
			):

			meta_data = {}
			# extract fbank feats
			time1 = time.perf_counter()
			audio_list, text_token_int_list = load_audio_and_text_image_video(data_in,
			audio_list, text_token_int_list = load_audio_text_image_video(data_in,
			fs=frontend.fs,
			audio_fs=kwargs.get("fs", 16000),
			data_type=kwargs.get("data_type", "sound"),
			@@ -171,7 +166,8 @@
			meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
			meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000

			speech.to(device=kwargs["device"]), speech_lengths.to(device=kwargs["device"])
			speech = speech.to(device=kwargs["device"])
			speech_lengths = speech_lengths.to(device=kwargs["device"])

			# Encoder
			encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
			@@ -190,13 +186,15 @@
			timestamp_str, timestamp = ts_prediction_lfr6_standard(us_alpha[:encoder_out_lens[i] * 3],
			us_peak[:encoder_out_lens[i] * 3],
			copy.copy(token))
			text_postprocessed, time_stamp_postprocessed, word_lists = postprocess_utils.sentence_postprocess(
			token, timestamp)
			text_postprocessed, time_stamp_postprocessed, _ = postprocess_utils.sentence_postprocess(token, timestamp)
			result_i = {"key": key[i], "text": text_postprocessed,
			"timestamp": time_stamp_postprocessed,
			}
			# ibest_writer["token"][key[i]] = " ".join(token)
			ibest_writer["timestamp_list"][key[i]] = time_stamp_postprocessed
			ibest_writer["timestamp_str"][key[i]] = timestamp_str
			}
			results.append(result_i)

			if ibest_writer:
			# ibest_writer["token"][key[i]] = " ".join(token)
			ibest_writer["timestamp_list"][key[i]] = time_stamp_postprocessed
			ibest_writer["timestamp_str"][key[i]] = timestamp_str

			return results, meta_data