update egs_modelscope paraformer-large-en
| | |
| | | |
| | | if __name__ == "__main__": |
| | | parser = argparse.ArgumentParser() |
| | | parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") |
| | | parser.add_argument('--model', type=str, default="damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020") |
| | | parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp") |
| | | parser.add_argument('--output_dir', type=str, default="./results/") |
| | | parser.add_argument('--decoding_mode', type=str, default="normal") |
| | |
| | | |
| | | stage=1 |
| | | stop_stage=2 |
| | | model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model="damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020" |
| | | data_dir="./data/test" |
| | | output_dir="./results" |
| | | batch_size=64 |
| | |
| | | nbest: int = 1, |
| | | streaming: bool = False, |
| | | frontend_conf: dict = None, |
| | | language: str = None, |
| | | task: str = "transcribe", |
| | | **kwargs, |
| | | ): |
| | | |
| | |
| | | self.device = device |
| | | self.dtype = dtype |
| | | self.frontend = frontend |
| | | self.language = language |
| | | self.task = task |
| | | |
| | | @torch.no_grad() |
| | | def __call__( |
| | |
| | | mel = log_mel_spectrogram(speech).to(self.device) |
| | | |
| | | if self.asr_model.is_multilingual: |
| | | options = DecodingOptions(fp16=False) |
| | | options = DecodingOptions(fp16=False, language=self.language, task=self.task) |
| | | asr_res = decode(self.asr_model, mel, options) |
| | | text = asr_res.text |
| | | language = asr_res.language |
| | | language = self.language if self.language else asr_res.language |
| | | else: |
| | | asr_res = transcribe(self.asr_model, speech, fp16=False) |
| | | text = asr_res["text"] |
| | |
| | | |
| | | ncpu = kwargs.get("ncpu", 1) |
| | | torch.set_num_threads(ncpu) |
| | | language = param_dict.get("language", None) |
| | | task = param_dict.get("task", "transcribe") |
| | | if batch_size > 1: |
| | | raise NotImplementedError("batch decoding is not implemented") |
| | | if word_lm_train_config is not None: |
| | |
| | | penalty=penalty, |
| | | nbest=nbest, |
| | | streaming=streaming, |
| | | language=language, |
| | | task=task, |
| | | ) |
| | | logging.info("speech2text_kwargs: {}".format(speech2text_kwargs)) |
| | | speech2text = Speech2TextWhisper(**speech2text_kwargs) |