add
游雁
2024-04-12 da340e6a6cf8680878a083f5c1b18775dc0c686f
add
3个文件已修改
49 ■■■■■ 已修改文件
examples/industrial_data_pretraining/sense_voice/demo.py 15 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/auto/auto_model.py 3 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/sense_voice/model.py 31 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
examples/industrial_data_pretraining/sense_voice/demo.py
@@ -9,8 +9,15 @@
                  vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                  vad_kwargs={"max_single_segment_time": 30000},
                  )
# First invocation: task and language are passed as top-level keyword
# arguments to model.generate().
# NOTE(review): `res` from this call is reassigned below before it is ever
# printed, so this result is discarded — confirm whether this call is still
# wanted.
# NOTE(review): the sense_voice/model.py hunk in this same change reads
# `task` and `language` from the `DecodingOptions` dict — verify that these
# top-level `task` / `language` kwargs are still honored.
task = "ASR"
language = None
input_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
res = model.generate(task=task, language=language, input=input_wav, batch_size_s=0,)
# Second invocation: a different input file, with the decoding settings
# bundled into a single dict that is forwarded through the `DecodingOptions`
# keyword argument.
input_wav = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/aed_ser/asr_bgm.wav"
DecodingOptions = {
    # Several task tags may be given as a tuple; a single string is also
    # accepted by the model.py hunk in this change, which wraps each tag as
    # "<|TAG|>" when building the initial prompt.
    "task": ("ASR", "AED", "SER"),
    # "auto" is mapped to None by the model.py hunk in this change
    # (presumably leaving language selection to the decoder — TODO confirm).
    "language": "auto",
    "fp16": True,
    }
res = model.generate(input=input_wav, batch_size_s=0, DecodingOptions=DecodingOptions)
# Only the result of the second generate() call is printed.
print(res)
funasr/auto/auto_model.py
@@ -211,6 +211,9 @@
            else:
                print(f"error, init_param does not exist!: {init_param}")
        
        # fp16
        if kwargs.get("fp16", False):
            model.to(torch.float16)
        return model, kwargs
    
    def __call__(self, *args, **cfg):
funasr/models/sense_voice/model.py
@@ -73,28 +73,27 @@
        speech = speech.to(device=kwargs["device"])[0, :, :]
        speech_lengths = speech_lengths.to(device=kwargs["device"])
        task = kwargs.get("task", "ASR")
        DecodingOptions = kwargs.get("DecodingOptions", {})
        task = DecodingOptions.get("task", "ASR")
        if isinstance(task, str):
            task = [task]
        task = "".join([f"<|{x}|>" for x in task])
        initial_prompt = kwargs.get("initial_prompt", f"<|startoftranscript|>{task}")
        language = kwargs.get("language", None)
        DecodingOptions["initial_prompt"] = initial_prompt
        language = DecodingOptions.get("language", None)
        language = None if language == "auto" else language
        # if language is None:
        #     # detect the spoken language
        #     _, probs = self.model.detect_language(speech, initial_prompt=initial_prompt)
        #     print(f"Detected language: {max(probs, key=probs.get)}")
        #     language = max(probs, key=probs.get)
        #     language = language if kwargs.get("language", None) is None else kwargs.get("language")
        # decode the audio
        # initial_prompt = kwargs.get("initial_prompt", "<|startoftranscript|><|ASR|>")
        vocab_path = kwargs.get("vocab_path", None)
        options = whisper.DecodingOptions(language=language, fp16=False, without_timestamps=True, initial_prompt=initial_prompt, vocab_path=vocab_path)
        DecodingOptions["language"] = language
        DecodingOptions["vocab_path"] = kwargs.get("vocab_path", None)
        if "without_timestamps" not in DecodingOptions:
            DecodingOptions["without_timestamps"] = True
        options = whisper.DecodingOptions(**DecodingOptions)
        
        result = whisper.decode(self.model, speech, options)