| | |
| | | plt.savefig(plotname, bbox_inches="tight") |
| | | |
    def load_data(self, wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        """Load audio input for inference.

        NOTE(review): this chunk appears truncated — the code that actually
        dispatches on `wav_content` (str path / raw ndarray / list of paths,
        per the annotation) is not visible here; only the nested helpers are.

        Args:
            wav_content: audio file path, raw waveform array, or list of paths
                (per the signature annotation; dispatch code not visible).
            fs: target sample rate forwarded to ``librosa.load``; ``None``
                keeps librosa's default resampling behaviour.

        Returns:
            List (presumably of loaded waveforms — confirm against full source).
        """

        def convert_to_wav(input_path, output_path):
            # Convert an mp3 file to wav via pydub. Imported lazily so pydub
            # is only required when a non-wav input is actually given.
            from pydub import AudioSegment
            try:
                audio = AudioSegment.from_mp3(input_path)
                audio.export(output_path, format="wav")
                # Runtime message (Chinese): "audio was mp3, converted to wav".
                print("音频文件为mp3格式,已转换为wav格式")

            except Exception as e:
                # Best-effort conversion: failure is only reported, not raised,
                # so the subsequent librosa.load on the .wav path may still fail.
                print(f"转换失败:{e}")  # "conversion failed: ..."

        def load_wav(path: str) -> np.ndarray:
            # Read one audio file as a waveform; closes over `fs` above.
            if not path.lower().endswith('.wav'):
                import os
                input_path = path
                path = os.path.splitext(path)[0]+'.wav'
                # Convert mp3 input to wav format. Assumes any non-.wav
                # input is mp3 — TODO confirm (other formats would fail).
                convert_to_wav(input_path,path)

            waveform, _ = librosa.load(path, sr=fs)
            return waveform
| | | |
| | |
| | | self.pred_bias = config["model_conf"]["predictor_bias"] |
| | | else: |
| | | self.pred_bias = 0 |
| | | if "lang" in config: |
| | | self.language = config["lang"] |
| | | else: |
| | | self.language = None |
| | | |
    def __call__(
        self, wav_content: Union[str, np.ndarray, List[str]], hotwords: str, **kwargs
    ) -> List:
        """Run hotword-biased ASR inference and return decoded results.

        NOTE(review): this chunk looks truncated — `feats`, `feats_len`,
        `asr_res` and `waveform_list` are used below but their setup
        (feature extraction, result-list initialization) is not visible here.
        Reconcile with the full upstream source.

        Args:
            wav_content: audio input (path, waveform array, or list of paths).
            hotwords: hotword string, encoded into bias embeddings via
                ``proc_hotword`` / ``eb_infer``.

        Returns:
            List of dicts with key "preds"; when the backbone also returns
            CIF alphas/peaks (BiCifParaformer), entries additionally carry
            "timestamp" and "raw_tokens".
        """
        # Encode the hotword string into ids, then into a bias embedding
        # with the embedding ONNX session.
        hotwords, hotwords_length = self.proc_hotword(hotwords)
        [bias_embed] = self.eb_infer(hotwords, hotwords_length)
        # Reorder axes before indexing — presumably
        # (num_hotwords, batch, dim) -> (batch, num_hotwords, dim); TODO confirm.
        bias_embed = bias_embed.transpose(1, 0, 2)

        try:
            outputs = self.bb_infer(feats, feats_len, bias_embed)
            am_scores, valid_token_lens = outputs[0], outputs[1]

            if len(outputs) == 4:
                # for BiCifParaformer Inference: extra CIF alphas/peaks are
                # returned and used below for token-level timestamps.
                us_alphas, us_peaks = outputs[2], outputs[3]
            else:
                us_alphas, us_peaks = None, None

        except ONNXRuntimeError:
            # Backbone rejected degenerate input; emit an empty transcription.
            logging.warning("input wav is silence or noise")
            preds = [""]
        else:
            preds = self.decode(am_scores, valid_token_lens)
            # NOTE(review): this loop appends every prediction to asr_res,
            # and the `us_peaks is None` branch below appends the same
            # predictions AGAIN — results look duplicated. Verify intent
            # against the upstream source before relying on asr_res length.
            for pred in preds:
                pred = sentence_postprocess(pred)
                asr_res.append({"preds": pred})
            if us_peaks is None:
                # No timestamp information: text-only postprocessing, with a
                # sentencepiece variant for English BPE models.
                for pred in preds:
                    if self.language == "en-bpe":
                        pred = sentence_postprocess_sentencepiece(pred)
                    else:
                        pred = sentence_postprocess(pred)
                    asr_res.append({"preds": pred})
            else:
                # Timestamp peaks available: attach per-token timestamps.
                for pred, us_peaks_ in zip(preds, us_peaks):
                    raw_tokens = pred
                    # copy.copy: time_stamp_lfr6_onnx presumably mutates the
                    # token list — TODO confirm.
                    timestamp, timestamp_raw = time_stamp_lfr6_onnx(
                        us_peaks_, copy.copy(raw_tokens)
                    )
                    text_proc, timestamp_proc, _ = sentence_postprocess(
                        raw_tokens, timestamp_raw
                    )
                    if len(self.plot_timestamp_to):
                        # Optional debug plot of the waveform with timestamps
                        # (only when a target path was configured).
                        self.plot_wave_timestamp(
                            waveform_list[0], timestamp, self.plot_timestamp_to
                        )
                    asr_res.append(
                        {
                            "preds": text_proc,
                            "timestamp": timestamp_proc,
                            "raw_tokens": raw_tokens,
                        }
                    )
        return asr_res
| | | |
    def proc_hotword(self, hotwords):
        """Convert a hotword string into padded id arrays for the bias encoder.

        NOTE(review): this chunk appears garbled/truncated — the early
        ``return np.array(hotwords)`` makes everything after it unreachable,
        the setup of ``word_map``/``pad_list`` inputs is not visible, and
        ``hotwords_length`` is never defined in the visible code. The caller
        (``__call__``) unpacks TWO values, which only the unreachable second
        return provides. Reconcile with the full upstream source.
        """

        return np.array(hotwords)

        # --- unreachable as shown (see NOTE above) ---
        # Map each hotword to its token-id array.
        hotword_int = [word_map(i) for i in hotwords]

        # Append sentinel id 1 as an extra "hotword" entry — presumably an
        # <sos>/<eos>-like marker; TODO confirm.
        hotword_int.append(np.array([1]))
        hotwords = pad_list(hotword_int, pad_value=0, max_len=10)

        return hotwords, hotwords_length
| | | |
| | | def bb_infer( |