from torch import nn

import whisper

# import whisper_timestamped as whisper

from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank


@tables.register("model_classes", "Whisper-large-v1")
@tables.register("model_classes", "Whisper-large-v2")
@tables.register("model_classes", "Whisper-large-v3")
@tables.register("model_classes", "Whisper-large-v3-turbo")
@tables.register("model_classes", "WhisperWarp")
class WhisperWarp(nn.Module):
    def __init__(self, *args, **kwargs):

        # decode the audio
        options = whisper.DecodingOptions(**kwargs.get("DecodingOptions", {}))

        # NOTE(review): this result is immediately overwritten below, so this call is
        # dead code, and the hard-coded language='english' bypasses the user-supplied
        # DecodingOptions — confirm whether it can be removed.
        result = whisper.decode(self.model, speech, language='english')

        result = whisper.decode(self.model, speech, options=options)
        # result = whisper.transcribe(self.model, speech)

        results = []
        result_i = {"key": key[0], "text": result.text}