From b5d3df75cf6462aa3bf42fd3c86fa2aa7f1c8a15 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Fri, 24 Nov 2023 00:54:44 +0800
Subject: [PATCH] setup jamo

---
 funasr/bin/asr_infer.py |   52 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/funasr/bin/asr_infer.py b/funasr/bin/asr_infer.py
index 43da8bf..a1cede1 100644
--- a/funasr/bin/asr_infer.py
+++ b/funasr/bin/asr_infer.py
@@ -34,21 +34,19 @@
 from funasr.modules.scorers.ctc import CTCPrefixScorer
 from funasr.modules.scorers.length_bonus import LengthBonus
 from funasr.build_utils.build_asr_model import frontend_choices
-from funasr.text.build_tokenizer import build_tokenizer
-from funasr.text.token_id_converter import TokenIDConverter
+from funasr.tokenizer.build_tokenizer import build_tokenizer
+from funasr.tokenizer.token_id_converter import TokenIDConverter
 from funasr.torch_utils.device_funcs import to_device
 from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
-from funasr.utils.whisper_utils.decoding import DecodingOptions, detect_language, decode
-from funasr.utils.whisper_utils.transcribe import transcribe
-from funasr.utils.whisper_utils.audio import pad_or_trim, log_mel_spectrogram
+
 
 class Speech2Text:
     """Speech2Text class
 
     Examples:
-        >>> import soundfile
+        >>> import librosa
         >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
-        >>> audio, rate = soundfile.read("speech.wav")
+        >>> audio, rate = librosa.load("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
 
@@ -253,9 +251,9 @@
     """Speech2Text class
 
     Examples:
-            >>> import soundfile
+            >>> import librosa
             >>> speech2text = Speech2TextParaformer("asr_config.yml", "asr.pb")
-            >>> audio, rate = soundfile.read("speech.wav")
+            >>> audio, rate = librosa.load("speech.wav")
             >>> speech2text(audio)
             [(text, token, token_int, hypothesis object), ...]
 
@@ -627,9 +625,9 @@
     """Speech2Text class
 
     Examples:
-            >>> import soundfile
+            >>> import librosa
             >>> speech2text = Speech2TextParaformerOnline("asr_config.yml", "asr.pth")
-            >>> audio, rate = soundfile.read("speech.wav")
+            >>> audio, rate = librosa.load("speech.wav")
             >>> speech2text(audio)
             [(text, token, token_int, hypothesis object), ...]
 
@@ -878,9 +876,9 @@
     """Speech2Text class
 
     Examples:
-        >>> import soundfile
+        >>> import librosa
         >>> speech2text = Speech2TextUniASR("asr_config.yml", "asr.pb")
-        >>> audio, rate = soundfile.read("speech.wav")
+        >>> audio, rate = librosa.load("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
 
@@ -1108,9 +1106,9 @@
     """Speech2Text class
 
     Examples:
-        >>> import soundfile
+        >>> import librosa
         >>> speech2text = Speech2TextMFCCA("asr_config.yml", "asr.pb")
-        >>> audio, rate = soundfile.read("speech.wav")
+        >>> audio, rate = librosa.load("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
 
@@ -1607,7 +1605,6 @@
         feats_lengths = to_device(feats_lengths, device=self.device)
 
         enc_out, _, _ = self.asr_model.encoder(feats, feats_lengths)
-
         nbest_hyps = self.beam_search(enc_out[0])
 
         return nbest_hyps
@@ -1640,9 +1637,9 @@
     """Speech2Text class
 
     Examples:
-        >>> import soundfile
+        >>> import librosa
         >>> speech2text = Speech2TextSAASR("asr_config.yml", "asr.pb")
-        >>> audio, rate = soundfile.read("speech.wav")
+        >>> audio, rate = librosa.load("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
 
@@ -1888,9 +1885,9 @@
     """Speech2Text class
 
     Examples:
-        >>> import soundfile
+        >>> import librosa
         >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
-        >>> audio, rate = soundfile.read("speech.wav")
+        >>> audio, rate = librosa.load("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
 
@@ -1918,12 +1915,15 @@
             nbest: int = 1,
             streaming: bool = False,
             frontend_conf: dict = None,
+            language: str = None,
+            task: str = "transcribe",
             **kwargs,
     ):
 
+        from funasr.tasks.whisper import ASRTask
+
         # 1. Build ASR model
         scorers = {}
-        from funasr.tasks.whisper import ASRTask
         asr_model, asr_train_args = ASRTask.build_model_from_file(
             asr_train_config, asr_model_file, cmvn_file, device
         )
@@ -1960,6 +1960,8 @@
         self.device = device
         self.dtype = dtype
         self.frontend = frontend
+        self.language = language
+        self.task = task
 
     @torch.no_grad()
     def __call__(
@@ -1981,15 +1983,19 @@
 
         """
 
+        from funasr.utils.whisper_utils.transcribe import transcribe
+        from funasr.utils.whisper_utils.audio import pad_or_trim, log_mel_spectrogram
+        from funasr.utils.whisper_utils.decoding import DecodingOptions, detect_language, decode
+
         speech = speech[0]
         speech = pad_or_trim(speech)
         mel = log_mel_spectrogram(speech).to(self.device)
 
         if self.asr_model.is_multilingual:
-            options = DecodingOptions(fp16=False)
+            options = DecodingOptions(fp16=False, language=self.language, task=self.task)
             asr_res = decode(self.asr_model, mel, options)
             text = asr_res.text
-            language = asr_res.language
+            language = self.language if self.language else asr_res.language
         else:
             asr_res = transcribe(self.asr_model, speech, fp16=False)
             text = asr_res["text"]

--
Gitblit v1.9.1