python/FunASR-XL.git

			@@ -7,7 +7,11 @@
			import torch.nn.functional as F
			from torch import Tensor
			from torch import nn

			import whisper

			# Alternative backend (disabled): import whisper_timestamped as whisper

			from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank

			from funasr.register import tables
			@@ -24,6 +28,7 @@
			@tables.register("model_classes", "Whisper-large-v1")
			@tables.register("model_classes", "Whisper-large-v2")
			@tables.register("model_classes", "Whisper-large-v3")
			@tables.register("model_classes", "Whisper-large-v3-turbo")
			@tables.register("model_classes", "WhisperWarp")
			class WhisperWarp(nn.Module):
			def __init__(self, args, *kwargs):
			@@ -108,7 +113,9 @@

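			# Decode the audio.
			# NOTE(review): whisper.decode is invoked twice below with the same model,
			# input and options; the second call overwrites `result`. This looks like
			# leftover old/new lines from a patch -- confirm and keep only one call.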
			options = whisper.DecodingOptions(**kwargs.get("DecodingOptions", {}))
			result = whisper.decode(self.model, speech, options)

			result = whisper.decode(self.model, speech, options=options)
			# Alternative (disabled): result = whisper.transcribe(self.model, speech)

			results = []
			result_i = {"key": key[0], "text": result.text}