zhifu gao
2024-04-02 48693b45c021a842ea964c9dc99479b61eac062f
funasr/models/sense_voice/model.py
@@ -73,18 +73,24 @@
        speech = speech.to(device=kwargs["device"])[0, :, :]
        speech_lengths = speech_lengths.to(device=kwargs["device"])
        task = kwargs.get("task", "ASR")
        if isinstance(task, str):
            task = [task]
        task = "".join([f"<|{x}|>" for x in task])
        initial_prompt = kwargs.get("initial_prompt", f"<|startoftranscript|>{task}")
        language = kwargs.get("language", None)
        initial_prompt = kwargs.get("initial_prompt", "<|startoftranscript|><|ASR|>")
        # # detect the spoken language
        # _, probs = self.model.detect_language(speech, initial_prompt=initial_prompt)
        # print(f"Detected language: {max(probs, key=probs.get)}")
        # language = max(probs, key=probs.get)
        # language = language if kwargs.get("language", None) is None else kwargs.get("language")
        language = None if language == "auto" else language
        # if language is None:
        #     # detect the spoken language
        #     _, probs = self.model.detect_language(speech, initial_prompt=initial_prompt)
        #     print(f"Detected language: {max(probs, key=probs.get)}")
        #     language = max(probs, key=probs.get)
        #     language = language if kwargs.get("language", None) is None else kwargs.get("language")
        
        # decode the audio
        prompt = ""
        initial_prompt = kwargs.get("initial_prompt", "<|startoftranscript|><|ASR|>")
        # initial_prompt = kwargs.get("initial_prompt", "<|startoftranscript|><|ASR|>")
        options = whisper.DecodingOptions(language=language, fp16=False, without_timestamps=True, initial_prompt=initial_prompt)
        result = whisper.decode(self.model, speech, options)