From 242431452b682b6bf5d711506653605ed8786af0 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期三, 29 三月 2023 00:30:57 +0800
Subject: [PATCH] export
---
/dev/null | 0
1 files changed, 0 insertions(+), 0 deletions(-)
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py
deleted file mode 100644
index 4750479..0000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# -*- encoding: utf-8 -*-
-from .paraformer_bin import Paraformer
-from .vad_bin import Fsmn_vad
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py
deleted file mode 100644
index cbdb8d9..0000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# -*- encoding: utf-8 -*-
-
-import os.path
-from pathlib import Path
-from typing import List, Union, Tuple
-
-import copy
-import librosa
-import numpy as np
-
-from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
- OrtInferSession, TokenIDConverter, get_logger,
- read_yaml)
-from .utils.postprocess_utils import sentence_postprocess
-from .utils.frontend import WavFrontend
-from .utils.timestamp_utils import time_stamp_lfr6_onnx
-
-logging = get_logger()
-
-
-class Paraformer():
- def __init__(self, model_dir: Union[str, Path] = None,
- batch_size: int = 1,
- device_id: Union[str, int] = "-1",
- plot_timestamp_to: str = "",
- pred_bias: int = 1,
- quantize: bool = False,
- intra_op_num_threads: int = 4,
- ):
-
- if not Path(model_dir).exists():
- raise FileNotFoundError(f'{model_dir} does not exist.')
-
- model_file = os.path.join(model_dir, 'model.onnx')
- if quantize:
- model_file = os.path.join(model_dir, 'model_quant.onnx')
- config_file = os.path.join(model_dir, 'config.yaml')
- cmvn_file = os.path.join(model_dir, 'am.mvn')
- config = read_yaml(config_file)
-
- self.converter = TokenIDConverter(config['token_list'])
- self.tokenizer = CharTokenizer()
- self.frontend = WavFrontend(
- cmvn_file=cmvn_file,
- **config['frontend_conf']
- )
- self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
- self.batch_size = batch_size
- self.plot_timestamp_to = plot_timestamp_to
- self.pred_bias = pred_bias
-
- def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
- waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
- waveform_nums = len(waveform_list)
- asr_res = []
- for beg_idx in range(0, waveform_nums, self.batch_size):
-
- end_idx = min(waveform_nums, beg_idx + self.batch_size)
- feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
- try:
- outputs = self.infer(feats, feats_len)
- am_scores, valid_token_lens = outputs[0], outputs[1]
- if len(outputs) == 4:
- # for BiCifParaformer Inference
- us_alphas, us_peaks = outputs[2], outputs[3]
- else:
- us_alphas, us_peaks = None, None
- except ONNXRuntimeError:
- #logging.warning(traceback.format_exc())
- logging.warning("input wav is silence or noise")
- preds = ['']
- else:
- preds = self.decode(am_scores, valid_token_lens)
- if us_peaks is None:
- for pred in preds:
- pred = sentence_postprocess(pred)
- asr_res.append({'preds': pred})
- else:
- for pred, us_peaks_ in zip(preds, us_peaks):
- raw_tokens = pred
- timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
- text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
- # logging.warning(timestamp)
- if len(self.plot_timestamp_to):
- self.plot_wave_timestamp(waveform_list[0], timestamp, self.plot_timestamp_to)
- asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
- return asr_res
-
- def plot_wave_timestamp(self, wav, text_timestamp, dest):
- # TODO: Plot the wav and timestamp results with matplotlib
- import matplotlib
- matplotlib.use('Agg')
- matplotlib.rc("font", family='Alibaba PuHuiTi') # set it to a font that your system supports
- import matplotlib.pyplot as plt
- fig, ax1 = plt.subplots(figsize=(11, 3.5), dpi=320)
- ax2 = ax1.twinx()
- ax2.set_ylim([0, 2.0])
- # plot waveform
- ax1.set_ylim([-0.3, 0.3])
- time = np.arange(wav.shape[0]) / 16000
- ax1.plot(time, wav/wav.max()*0.3, color='gray', alpha=0.4)
- # plot lines and text
- for (char, start, end) in text_timestamp:
- ax1.vlines(start, -0.3, 0.3, ls='--')
- ax1.vlines(end, -0.3, 0.3, ls='--')
- x_adj = 0.045 if char != '<sil>' else 0.12
- ax1.text((start + end) * 0.5 - x_adj, 0, char)
- # plt.legend()
- plotname = "{}/timestamp.png".format(dest)
- plt.savefig(plotname, bbox_inches='tight')
-
- def load_data(self,
- wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
- def load_wav(path: str) -> np.ndarray:
- waveform, _ = librosa.load(path, sr=fs)
- return waveform
-
- if isinstance(wav_content, np.ndarray):
- return [wav_content]
-
- if isinstance(wav_content, str):
- return [load_wav(wav_content)]
-
- if isinstance(wav_content, list):
- return [load_wav(path) for path in wav_content]
-
- raise TypeError(
- f'The type of {wav_content} is not in [str, np.ndarray, list]')
-
- def extract_feat(self,
- waveform_list: List[np.ndarray]
- ) -> Tuple[np.ndarray, np.ndarray]:
- feats, feats_len = [], []
- for waveform in waveform_list:
- speech, _ = self.frontend.fbank(waveform)
- feat, feat_len = self.frontend.lfr_cmvn(speech)
- feats.append(feat)
- feats_len.append(feat_len)
-
- feats = self.pad_feats(feats, np.max(feats_len))
- feats_len = np.array(feats_len).astype(np.int32)
- return feats, feats_len
-
- @staticmethod
- def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
- def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
- pad_width = ((0, max_feat_len - cur_len), (0, 0))
- return np.pad(feat, pad_width, 'constant', constant_values=0)
-
- feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
- feats = np.array(feat_res).astype(np.float32)
- return feats
-
- def infer(self, feats: np.ndarray,
- feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
- outputs = self.ort_infer([feats, feats_len])
- return outputs
-
- def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
- return [self.decode_one(am_score, token_num)
- for am_score, token_num in zip(am_scores, token_nums)]
-
- def decode_one(self,
- am_score: np.ndarray,
- valid_token_num: int) -> List[str]:
- yseq = am_score.argmax(axis=-1)
- score = am_score.max(axis=-1)
- score = np.sum(score, axis=-1)
-
- # pad with mask tokens to ensure compatibility with sos/eos tokens
- # asr_model.sos:1 asr_model.eos:2
- yseq = np.array([1] + yseq.tolist() + [2])
- hyp = Hypothesis(yseq=yseq, score=score)
-
- # remove sos/eos and get results
- last_pos = -1
- token_int = hyp.yseq[1:last_pos].tolist()
-
- # remove blank symbol id, which is assumed to be 0
- token_int = list(filter(lambda x: x not in (0, 2), token_int))
-
- # Change integer-ids to tokens
- token = self.converter.ids2tokens(token_int)
- token = token[:valid_token_num-self.pred_bias]
- # texts = sentence_postprocess(token)
- return token
-
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/punc_bin.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/punc_bin.py
deleted file mode 100644
index e69de29..0000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/punc_bin.py
+++ /dev/null
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/__init__.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/__init__.py
+++ /dev/null
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py
deleted file mode 100644
index 8eed22f..0000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py
+++ /dev/null
@@ -1,607 +0,0 @@
-from enum import Enum
-from typing import List, Tuple, Dict, Any
-
-import math
-import numpy as np
-
-class VadStateMachine(Enum):
- kVadInStateStartPointNotDetected = 1
- kVadInStateInSpeechSegment = 2
- kVadInStateEndPointDetected = 3
-
-
-class FrameState(Enum):
- kFrameStateInvalid = -1
- kFrameStateSpeech = 1
- kFrameStateSil = 0
-
-
-# final voice/unvoice state per frame
-class AudioChangeState(Enum):
- kChangeStateSpeech2Speech = 0
- kChangeStateSpeech2Sil = 1
- kChangeStateSil2Sil = 2
- kChangeStateSil2Speech = 3
- kChangeStateNoBegin = 4
- kChangeStateInvalid = 5
-
-
-class VadDetectMode(Enum):
- kVadSingleUtteranceDetectMode = 0
- kVadMutipleUtteranceDetectMode = 1
-
-
-class VADXOptions:
- def __init__(
- self,
- sample_rate: int = 16000,
- detect_mode: int = VadDetectMode.kVadMutipleUtteranceDetectMode.value,
- snr_mode: int = 0,
- max_end_silence_time: int = 800,
- max_start_silence_time: int = 3000,
- do_start_point_detection: bool = True,
- do_end_point_detection: bool = True,
- window_size_ms: int = 200,
- sil_to_speech_time_thres: int = 150,
- speech_to_sil_time_thres: int = 150,
- speech_2_noise_ratio: float = 1.0,
- do_extend: int = 1,
- lookback_time_start_point: int = 200,
- lookahead_time_end_point: int = 100,
- max_single_segment_time: int = 60000,
- nn_eval_block_size: int = 8,
- dcd_block_size: int = 4,
- snr_thres: int = -100.0,
- noise_frame_num_used_for_snr: int = 100,
- decibel_thres: int = -100.0,
- speech_noise_thres: float = 0.6,
- fe_prior_thres: float = 1e-4,
- silence_pdf_num: int = 1,
- sil_pdf_ids: List[int] = [0],
- speech_noise_thresh_low: float = -0.1,
- speech_noise_thresh_high: float = 0.3,
- output_frame_probs: bool = False,
- frame_in_ms: int = 10,
- frame_length_ms: int = 25,
- ):
- self.sample_rate = sample_rate
- self.detect_mode = detect_mode
- self.snr_mode = snr_mode
- self.max_end_silence_time = max_end_silence_time
- self.max_start_silence_time = max_start_silence_time
- self.do_start_point_detection = do_start_point_detection
- self.do_end_point_detection = do_end_point_detection
- self.window_size_ms = window_size_ms
- self.sil_to_speech_time_thres = sil_to_speech_time_thres
- self.speech_to_sil_time_thres = speech_to_sil_time_thres
- self.speech_2_noise_ratio = speech_2_noise_ratio
- self.do_extend = do_extend
- self.lookback_time_start_point = lookback_time_start_point
- self.lookahead_time_end_point = lookahead_time_end_point
- self.max_single_segment_time = max_single_segment_time
- self.nn_eval_block_size = nn_eval_block_size
- self.dcd_block_size = dcd_block_size
- self.snr_thres = snr_thres
- self.noise_frame_num_used_for_snr = noise_frame_num_used_for_snr
- self.decibel_thres = decibel_thres
- self.speech_noise_thres = speech_noise_thres
- self.fe_prior_thres = fe_prior_thres
- self.silence_pdf_num = silence_pdf_num
- self.sil_pdf_ids = sil_pdf_ids
- self.speech_noise_thresh_low = speech_noise_thresh_low
- self.speech_noise_thresh_high = speech_noise_thresh_high
- self.output_frame_probs = output_frame_probs
- self.frame_in_ms = frame_in_ms
- self.frame_length_ms = frame_length_ms
-
-
-class E2EVadSpeechBufWithDoa(object):
- def __init__(self):
- self.start_ms = 0
- self.end_ms = 0
- self.buffer = []
- self.contain_seg_start_point = False
- self.contain_seg_end_point = False
- self.doa = 0
-
- def Reset(self):
- self.start_ms = 0
- self.end_ms = 0
- self.buffer = []
- self.contain_seg_start_point = False
- self.contain_seg_end_point = False
- self.doa = 0
-
-
-class E2EVadFrameProb(object):
- def __init__(self):
- self.noise_prob = 0.0
- self.speech_prob = 0.0
- self.score = 0.0
- self.frame_id = 0
- self.frm_state = 0
-
-
-class WindowDetector(object):
- def __init__(self, window_size_ms: int, sil_to_speech_time: int,
- speech_to_sil_time: int, frame_size_ms: int):
- self.window_size_ms = window_size_ms
- self.sil_to_speech_time = sil_to_speech_time
- self.speech_to_sil_time = speech_to_sil_time
- self.frame_size_ms = frame_size_ms
-
- self.win_size_frame = int(window_size_ms / frame_size_ms)
- self.win_sum = 0
- self.win_state = [0] * self.win_size_frame # 鍒濆鍖栫獥
-
- self.cur_win_pos = 0
- self.pre_frame_state = FrameState.kFrameStateSil
- self.cur_frame_state = FrameState.kFrameStateSil
- self.sil_to_speech_frmcnt_thres = int(sil_to_speech_time / frame_size_ms)
- self.speech_to_sil_frmcnt_thres = int(speech_to_sil_time / frame_size_ms)
-
- self.voice_last_frame_count = 0
- self.noise_last_frame_count = 0
- self.hydre_frame_count = 0
-
- def Reset(self) -> None:
- self.cur_win_pos = 0
- self.win_sum = 0
- self.win_state = [0] * self.win_size_frame
- self.pre_frame_state = FrameState.kFrameStateSil
- self.cur_frame_state = FrameState.kFrameStateSil
- self.voice_last_frame_count = 0
- self.noise_last_frame_count = 0
- self.hydre_frame_count = 0
-
- def GetWinSize(self) -> int:
- return int(self.win_size_frame)
-
- def DetectOneFrame(self, frameState: FrameState, frame_count: int) -> AudioChangeState:
- cur_frame_state = FrameState.kFrameStateSil
- if frameState == FrameState.kFrameStateSpeech:
- cur_frame_state = 1
- elif frameState == FrameState.kFrameStateSil:
- cur_frame_state = 0
- else:
- return AudioChangeState.kChangeStateInvalid
- self.win_sum -= self.win_state[self.cur_win_pos]
- self.win_sum += cur_frame_state
- self.win_state[self.cur_win_pos] = cur_frame_state
- self.cur_win_pos = (self.cur_win_pos + 1) % self.win_size_frame
-
- if self.pre_frame_state == FrameState.kFrameStateSil and self.win_sum >= self.sil_to_speech_frmcnt_thres:
- self.pre_frame_state = FrameState.kFrameStateSpeech
- return AudioChangeState.kChangeStateSil2Speech
-
- if self.pre_frame_state == FrameState.kFrameStateSpeech and self.win_sum <= self.speech_to_sil_frmcnt_thres:
- self.pre_frame_state = FrameState.kFrameStateSil
- return AudioChangeState.kChangeStateSpeech2Sil
-
- if self.pre_frame_state == FrameState.kFrameStateSil:
- return AudioChangeState.kChangeStateSil2Sil
- if self.pre_frame_state == FrameState.kFrameStateSpeech:
- return AudioChangeState.kChangeStateSpeech2Speech
- return AudioChangeState.kChangeStateInvalid
-
- def FrameSizeMs(self) -> int:
- return int(self.frame_size_ms)
-
-
-class E2EVadModel():
- def __init__(self, vad_post_args: Dict[str, Any]):
- super(E2EVadModel, self).__init__()
- self.vad_opts = VADXOptions(**vad_post_args)
- self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,
- self.vad_opts.sil_to_speech_time_thres,
- self.vad_opts.speech_to_sil_time_thres,
- self.vad_opts.frame_in_ms)
- # self.encoder = encoder
- # init variables
- self.is_final = False
- self.data_buf_start_frame = 0
- self.frm_cnt = 0
- self.latest_confirmed_speech_frame = 0
- self.lastest_confirmed_silence_frame = -1
- self.continous_silence_frame_count = 0
- self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
- self.confirmed_start_frame = -1
- self.confirmed_end_frame = -1
- self.number_end_time_detected = 0
- self.sil_frame = 0
- self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
- self.noise_average_decibel = -100.0
- self.pre_end_silence_detected = False
- self.next_seg = True
-
- self.output_data_buf = []
- self.output_data_buf_offset = 0
- self.frame_probs = []
- self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
- self.speech_noise_thres = self.vad_opts.speech_noise_thres
- self.scores = None
- self.max_time_out = False
- self.decibel = []
- self.data_buf = None
- self.data_buf_all = None
- self.waveform = None
- self.ResetDetection()
-
- def AllResetDetection(self):
- self.is_final = False
- self.data_buf_start_frame = 0
- self.frm_cnt = 0
- self.latest_confirmed_speech_frame = 0
- self.lastest_confirmed_silence_frame = -1
- self.continous_silence_frame_count = 0
- self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
- self.confirmed_start_frame = -1
- self.confirmed_end_frame = -1
- self.number_end_time_detected = 0
- self.sil_frame = 0
- self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
- self.noise_average_decibel = -100.0
- self.pre_end_silence_detected = False
- self.next_seg = True
-
- self.output_data_buf = []
- self.output_data_buf_offset = 0
- self.frame_probs = []
- self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
- self.speech_noise_thres = self.vad_opts.speech_noise_thres
- self.scores = None
- self.max_time_out = False
- self.decibel = []
- self.data_buf = None
- self.data_buf_all = None
- self.waveform = None
- self.ResetDetection()
-
- def ResetDetection(self):
- self.continous_silence_frame_count = 0
- self.latest_confirmed_speech_frame = 0
- self.lastest_confirmed_silence_frame = -1
- self.confirmed_start_frame = -1
- self.confirmed_end_frame = -1
- self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
- self.windows_detector.Reset()
- self.sil_frame = 0
- self.frame_probs = []
-
- def ComputeDecibel(self) -> None:
- frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
- frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
- if self.data_buf_all is None:
- self.data_buf_all = self.waveform[0] # self.data_buf is pointed to self.waveform[0]
- self.data_buf = self.data_buf_all
- else:
- self.data_buf_all = np.concatenate((self.data_buf_all, self.waveform[0]))
- for offset in range(0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length):
- self.decibel.append(
- 10 * math.log10((self.waveform[0][offset: offset + frame_sample_length]).square().sum() + \
- 0.000001))
-
- def ComputeScores(self, scores: np.ndarray) -> None:
- # scores = self.encoder(feats, in_cache) # return B * T * D
- self.vad_opts.nn_eval_block_size = scores.shape[1]
- self.frm_cnt += scores.shape[1] # count total frames
- if self.scores is None:
- self.scores = scores # the first calculation
- else:
- self.scores = np.concatenate((self.scores, scores), axis=1)
-
- def PopDataBufTillFrame(self, frame_idx: int) -> None: # need check again
- while self.data_buf_start_frame < frame_idx:
- if len(self.data_buf) >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
- self.data_buf_start_frame += 1
- self.data_buf = self.data_buf_all[self.data_buf_start_frame * int(
- self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]
-
- def PopDataToOutputBuf(self, start_frm: int, frm_cnt: int, first_frm_is_start_point: bool,
- last_frm_is_end_point: bool, end_point_is_sent_end: bool) -> None:
- self.PopDataBufTillFrame(start_frm)
- expected_sample_number = int(frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000)
- if last_frm_is_end_point:
- extra_sample = max(0, int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000 - \
- self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
- expected_sample_number += int(extra_sample)
- if end_point_is_sent_end:
- expected_sample_number = max(expected_sample_number, len(self.data_buf))
- if len(self.data_buf) < expected_sample_number:
- print('error in calling pop data_buf\n')
-
- if len(self.output_data_buf) == 0 or first_frm_is_start_point:
- self.output_data_buf.append(E2EVadSpeechBufWithDoa())
- self.output_data_buf[-1].Reset()
- self.output_data_buf[-1].start_ms = start_frm * self.vad_opts.frame_in_ms
- self.output_data_buf[-1].end_ms = self.output_data_buf[-1].start_ms
- self.output_data_buf[-1].doa = 0
- cur_seg = self.output_data_buf[-1]
- if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
- print('warning\n')
- out_pos = len(cur_seg.buffer) # cur_seg.buff鐜板湪娌″仛浠讳綍鎿嶄綔
- data_to_pop = 0
- if end_point_is_sent_end:
- data_to_pop = expected_sample_number
- else:
- data_to_pop = int(frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
- if data_to_pop > len(self.data_buf):
- print('VAD data_to_pop is bigger than self.data_buf.size()!!!\n')
- data_to_pop = len(self.data_buf)
- expected_sample_number = len(self.data_buf)
-
- cur_seg.doa = 0
- for sample_cpy_out in range(0, data_to_pop):
- # cur_seg.buffer[out_pos ++] = data_buf_.back();
- out_pos += 1
- for sample_cpy_out in range(data_to_pop, expected_sample_number):
- # cur_seg.buffer[out_pos++] = data_buf_.back()
- out_pos += 1
- if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
- print('Something wrong with the VAD algorithm\n')
- self.data_buf_start_frame += frm_cnt
- cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
- if first_frm_is_start_point:
- cur_seg.contain_seg_start_point = True
- if last_frm_is_end_point:
- cur_seg.contain_seg_end_point = True
-
- def OnSilenceDetected(self, valid_frame: int):
- self.lastest_confirmed_silence_frame = valid_frame
- if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
- self.PopDataBufTillFrame(valid_frame)
- # silence_detected_callback_
- # pass
-
- def OnVoiceDetected(self, valid_frame: int) -> None:
- self.latest_confirmed_speech_frame = valid_frame
- self.PopDataToOutputBuf(valid_frame, 1, False, False, False)
-
- def OnVoiceStart(self, start_frame: int, fake_result: bool = False) -> None:
- if self.vad_opts.do_start_point_detection:
- pass
- if self.confirmed_start_frame != -1:
- print('not reset vad properly\n')
- else:
- self.confirmed_start_frame = start_frame
-
- if not fake_result and self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
- self.PopDataToOutputBuf(self.confirmed_start_frame, 1, True, False, False)
-
- def OnVoiceEnd(self, end_frame: int, fake_result: bool, is_last_frame: bool) -> None:
- for t in range(self.latest_confirmed_speech_frame + 1, end_frame):
- self.OnVoiceDetected(t)
- if self.vad_opts.do_end_point_detection:
- pass
- if self.confirmed_end_frame != -1:
- print('not reset vad properly\n')
- else:
- self.confirmed_end_frame = end_frame
- if not fake_result:
- self.sil_frame = 0
- self.PopDataToOutputBuf(self.confirmed_end_frame, 1, False, True, is_last_frame)
- self.number_end_time_detected += 1
-
- def MaybeOnVoiceEndIfLastFrame(self, is_final_frame: bool, cur_frm_idx: int) -> None:
- if is_final_frame:
- self.OnVoiceEnd(cur_frm_idx, False, True)
- self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
-
- def GetLatency(self) -> int:
- return int(self.LatencyFrmNumAtStartPoint() * self.vad_opts.frame_in_ms)
-
- def LatencyFrmNumAtStartPoint(self) -> int:
- vad_latency = self.windows_detector.GetWinSize()
- if self.vad_opts.do_extend:
- vad_latency += int(self.vad_opts.lookback_time_start_point / self.vad_opts.frame_in_ms)
- return vad_latency
-
- def GetFrameState(self, t: int) -> FrameState:
- frame_state = FrameState.kFrameStateInvalid
- cur_decibel = self.decibel[t]
- cur_snr = cur_decibel - self.noise_average_decibel
- # for each frame, calc log posterior probability of each state
- if cur_decibel < self.vad_opts.decibel_thres:
- frame_state = FrameState.kFrameStateSil
- self.DetectOneFrame(frame_state, t, False)
- return frame_state
-
- sum_score = 0.0
- noise_prob = 0.0
- assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
- if len(self.sil_pdf_ids) > 0:
- assert len(self.scores) == 1 # 鍙敮鎸乥atch_size = 1鐨勬祴璇�
- sil_pdf_scores = [self.scores[0][t][sil_pdf_id] for sil_pdf_id in self.sil_pdf_ids]
- sum_score = sum(sil_pdf_scores)
- noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
- total_score = 1.0
- sum_score = total_score - sum_score
- speech_prob = math.log(sum_score)
- if self.vad_opts.output_frame_probs:
- frame_prob = E2EVadFrameProb()
- frame_prob.noise_prob = noise_prob
- frame_prob.speech_prob = speech_prob
- frame_prob.score = sum_score
- frame_prob.frame_id = t
- self.frame_probs.append(frame_prob)
- if math.exp(speech_prob) >= math.exp(noise_prob) + self.speech_noise_thres:
- if cur_snr >= self.vad_opts.snr_thres and cur_decibel >= self.vad_opts.decibel_thres:
- frame_state = FrameState.kFrameStateSpeech
- else:
- frame_state = FrameState.kFrameStateSil
- else:
- frame_state = FrameState.kFrameStateSil
- if self.noise_average_decibel < -99.9:
- self.noise_average_decibel = cur_decibel
- else:
- self.noise_average_decibel = (cur_decibel + self.noise_average_decibel * (
- self.vad_opts.noise_frame_num_used_for_snr
- - 1)) / self.vad_opts.noise_frame_num_used_for_snr
-
- return frame_state
-
-
- def __call__(self, score: np.ndarray, waveform: np.ndarray,
- is_final: bool = False, max_end_sil: int = 800
- ):
- self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
- self.waveform = waveform # compute decibel for each frame
- self.ComputeDecibel()
- self.ComputeScores(score)
- if not is_final:
- self.DetectCommonFrames()
- else:
- self.DetectLastFrames()
- segments = []
- for batch_num in range(0, score.shape[0]): # only support batch_size = 1 now
- segment_batch = []
- if len(self.output_data_buf) > 0:
- for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
- if not self.output_data_buf[i].contain_seg_start_point:
- continue
- if not self.next_seg and not self.output_data_buf[i].contain_seg_end_point:
- continue
- start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
- if self.output_data_buf[i].contain_seg_end_point:
- end_ms = self.output_data_buf[i].end_ms
- self.next_seg = True
- self.output_data_buf_offset += 1
- else:
- end_ms = -1
- self.next_seg = False
- segment = [start_ms, end_ms]
- segment_batch.append(segment)
- if segment_batch:
- segments.append(segment_batch)
- if is_final:
- # reset class variables and clear the dict for the next query
- self.AllResetDetection()
- return segments
-
- def DetectCommonFrames(self) -> int:
- if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
- return 0
- for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
- frame_state = FrameState.kFrameStateInvalid
- frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
- self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
-
- return 0
-
- def DetectLastFrames(self) -> int:
- if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
- return 0
- for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
- frame_state = FrameState.kFrameStateInvalid
- frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
- if i != 0:
- self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
- else:
- self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)
-
- return 0
-
- def DetectOneFrame(self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool) -> None:
- tmp_cur_frm_state = FrameState.kFrameStateInvalid
- if cur_frm_state == FrameState.kFrameStateSpeech:
- if math.fabs(1.0) > self.vad_opts.fe_prior_thres:
- tmp_cur_frm_state = FrameState.kFrameStateSpeech
- else:
- tmp_cur_frm_state = FrameState.kFrameStateSil
- elif cur_frm_state == FrameState.kFrameStateSil:
- tmp_cur_frm_state = FrameState.kFrameStateSil
- state_change = self.windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx)
- frm_shift_in_ms = self.vad_opts.frame_in_ms
- if AudioChangeState.kChangeStateSil2Speech == state_change:
- silence_frame_count = self.continous_silence_frame_count
- self.continous_silence_frame_count = 0
- self.pre_end_silence_detected = False
- start_frame = 0
- if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
- start_frame = max(self.data_buf_start_frame, cur_frm_idx - self.LatencyFrmNumAtStartPoint())
- self.OnVoiceStart(start_frame)
- self.vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment
- for t in range(start_frame + 1, cur_frm_idx + 1):
- self.OnVoiceDetected(t)
- elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
- for t in range(self.latest_confirmed_speech_frame + 1, cur_frm_idx):
- self.OnVoiceDetected(t)
- if cur_frm_idx - self.confirmed_start_frame + 1 > \
- self.vad_opts.max_single_segment_time / frm_shift_in_ms:
- self.OnVoiceEnd(cur_frm_idx, False, False)
- self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
- elif not is_final_frame:
- self.OnVoiceDetected(cur_frm_idx)
- else:
- self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
- else:
- pass
- elif AudioChangeState.kChangeStateSpeech2Sil == state_change:
- self.continous_silence_frame_count = 0
- if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
- pass
- elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
- if cur_frm_idx - self.confirmed_start_frame + 1 > \
- self.vad_opts.max_single_segment_time / frm_shift_in_ms:
- self.OnVoiceEnd(cur_frm_idx, False, False)
- self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
- elif not is_final_frame:
- self.OnVoiceDetected(cur_frm_idx)
- else:
- self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
- else:
- pass
- elif AudioChangeState.kChangeStateSpeech2Speech == state_change:
- self.continous_silence_frame_count = 0
- if self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
- if cur_frm_idx - self.confirmed_start_frame + 1 > \
- self.vad_opts.max_single_segment_time / frm_shift_in_ms:
- self.max_time_out = True
- self.OnVoiceEnd(cur_frm_idx, False, False)
- self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
- elif not is_final_frame:
- self.OnVoiceDetected(cur_frm_idx)
- else:
- self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
- else:
- pass
- elif AudioChangeState.kChangeStateSil2Sil == state_change:
- self.continous_silence_frame_count += 1
- if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
- # silence timeout, return zero length decision
- if ((self.vad_opts.detect_mode == VadDetectMode.kVadSingleUtteranceDetectMode.value) and (
- self.continous_silence_frame_count * frm_shift_in_ms > self.vad_opts.max_start_silence_time)) \
- or (is_final_frame and self.number_end_time_detected == 0):
- for t in range(self.lastest_confirmed_silence_frame + 1, cur_frm_idx):
- self.OnSilenceDetected(t)
- self.OnVoiceStart(0, True)
- self.OnVoiceEnd(0, True, False);
- self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
- else:
- if cur_frm_idx >= self.LatencyFrmNumAtStartPoint():
- self.OnSilenceDetected(cur_frm_idx - self.LatencyFrmNumAtStartPoint())
- elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
- if self.continous_silence_frame_count * frm_shift_in_ms >= self.max_end_sil_frame_cnt_thresh:
- lookback_frame = int(self.max_end_sil_frame_cnt_thresh / frm_shift_in_ms)
- if self.vad_opts.do_extend:
- lookback_frame -= int(self.vad_opts.lookahead_time_end_point / frm_shift_in_ms)
- lookback_frame -= 1
- lookback_frame = max(0, lookback_frame)
- self.OnVoiceEnd(cur_frm_idx - lookback_frame, False, False)
- self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
- elif cur_frm_idx - self.confirmed_start_frame + 1 > \
- self.vad_opts.max_single_segment_time / frm_shift_in_ms:
- self.OnVoiceEnd(cur_frm_idx, False, False)
- self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
- elif self.vad_opts.do_extend and not is_final_frame:
- if self.continous_silence_frame_count <= int(
- self.vad_opts.lookahead_time_end_point / frm_shift_in_ms):
- self.OnVoiceDetected(cur_frm_idx)
- else:
- self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
- else:
- pass
-
- if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected and \
- self.vad_opts.detect_mode == VadDetectMode.kVadMutipleUtteranceDetectMode.value:
- self.ResetDetection()
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py
deleted file mode 100644
index 11a8644..0000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# -*- encoding: utf-8 -*-
-from pathlib import Path
-from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
-
-import numpy as np
-from typeguard import check_argument_types
-import kaldi_native_fbank as knf
-
-root_dir = Path(__file__).resolve().parent
-
-logger_initialized = {}
-
-
-class WavFrontend():
- """Conventional frontend structure for ASR.
- """
-
- def __init__(
- self,
- cmvn_file: str = None,
- fs: int = 16000,
- window: str = 'hamming',
- n_mels: int = 80,
- frame_length: int = 25,
- frame_shift: int = 10,
- lfr_m: int = 1,
- lfr_n: int = 1,
- dither: float = 1.0,
- **kwargs,
- ) -> None:
- check_argument_types()
-
- opts = knf.FbankOptions()
- opts.frame_opts.samp_freq = fs
- opts.frame_opts.dither = dither
- opts.frame_opts.window_type = window
- opts.frame_opts.frame_shift_ms = float(frame_shift)
- opts.frame_opts.frame_length_ms = float(frame_length)
- opts.mel_opts.num_bins = n_mels
- opts.energy_floor = 0
- opts.frame_opts.snip_edges = True
- opts.mel_opts.debug_mel = False
- self.opts = opts
-
- self.lfr_m = lfr_m
- self.lfr_n = lfr_n
- self.cmvn_file = cmvn_file
-
- if self.cmvn_file:
- self.cmvn = self.load_cmvn()
- self.fbank_fn = None
- self.fbank_beg_idx = 0
- self.reset_status()
-
- def fbank(self,
- waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
- waveform = waveform * (1 << 15)
- self.fbank_fn = knf.OnlineFbank(self.opts)
- self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
- frames = self.fbank_fn.num_frames_ready
- mat = np.empty([frames, self.opts.mel_opts.num_bins])
- for i in range(frames):
- mat[i, :] = self.fbank_fn.get_frame(i)
- feat = mat.astype(np.float32)
- feat_len = np.array(mat.shape[0]).astype(np.int32)
- return feat, feat_len
-
- def fbank_online(self,
- waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
- waveform = waveform * (1 << 15)
- # self.fbank_fn = knf.OnlineFbank(self.opts)
- self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
- frames = self.fbank_fn.num_frames_ready
- mat = np.empty([frames, self.opts.mel_opts.num_bins])
- for i in range(self.fbank_beg_idx, frames):
- mat[i, :] = self.fbank_fn.get_frame(i)
- # self.fbank_beg_idx += (frames-self.fbank_beg_idx)
- feat = mat.astype(np.float32)
- feat_len = np.array(mat.shape[0]).astype(np.int32)
- return feat, feat_len
-
- def reset_status(self):
- self.fbank_fn = knf.OnlineFbank(self.opts)
- self.fbank_beg_idx = 0
-
- def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
- if self.lfr_m != 1 or self.lfr_n != 1:
- feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
-
- if self.cmvn_file:
- feat = self.apply_cmvn(feat)
-
- feat_len = np.array(feat.shape[0]).astype(np.int32)
- return feat, feat_len
-
- @staticmethod
- def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
- LFR_inputs = []
-
- T = inputs.shape[0]
- T_lfr = int(np.ceil(T / lfr_n))
- left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
- inputs = np.vstack((left_padding, inputs))
- T = T + (lfr_m - 1) // 2
- for i in range(T_lfr):
- if lfr_m <= T - i * lfr_n:
- LFR_inputs.append(
- (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
- else:
- # process last LFR frame
- num_padding = lfr_m - (T - i * lfr_n)
- frame = inputs[i * lfr_n:].reshape(-1)
- for _ in range(num_padding):
- frame = np.hstack((frame, inputs[-1]))
-
- LFR_inputs.append(frame)
- LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
- return LFR_outputs
-
- def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
- """
- Apply CMVN with mvn data
- """
- frame, dim = inputs.shape
- means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
- vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
- inputs = (inputs + means) * vars
- return inputs
-
- def load_cmvn(self,) -> np.ndarray:
- with open(self.cmvn_file, 'r', encoding='utf-8') as f:
- lines = f.readlines()
-
- means_list = []
- vars_list = []
- for i in range(len(lines)):
- line_item = lines[i].split()
- if line_item[0] == '<AddShift>':
- line_item = lines[i + 1].split()
- if line_item[0] == '<LearnRateCoef>':
- add_shift_line = line_item[3:(len(line_item) - 1)]
- means_list = list(add_shift_line)
- continue
- elif line_item[0] == '<Rescale>':
- line_item = lines[i + 1].split()
- if line_item[0] == '<LearnRateCoef>':
- rescale_line = line_item[3:(len(line_item) - 1)]
- vars_list = list(rescale_line)
- continue
-
- means = np.array(means_list).astype(np.float64)
- vars = np.array(vars_list).astype(np.float64)
- cmvn = np.array([means, vars])
- return cmvn
-
-def load_bytes(input):
- middle_data = np.frombuffer(input, dtype=np.int16)
- middle_data = np.asarray(middle_data)
- if middle_data.dtype.kind not in 'iu':
- raise TypeError("'middle_data' must be an array of integers")
- dtype = np.dtype('float32')
- if dtype.kind != 'f':
- raise TypeError("'dtype' must be a floating point type")
-
- i = np.iinfo(middle_data.dtype)
- abs_max = 2 ** (i.bits - 1)
- offset = i.min + abs_max
- array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
- return array
-
-
-def test():
- path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
- import librosa
- cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
- config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"
- from funasr.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml
- config = read_yaml(config_file)
- waveform, _ = librosa.load(path, sr=None)
- frontend = WavFrontend(
- cmvn_file=cmvn_file,
- **config['frontend_conf'],
- )
- speech, _ = frontend.fbank_online(waveform) #1d, (sample,), numpy
- feat, feat_len = frontend.lfr_cmvn(speech) # 2d, (frame, 450), np.float32 -> torch, torch.from_numpy(), dtype, (1, frame, 450)
-
- frontend.reset_status() # clear cache
- return feat, feat_len
-
-if __name__ == '__main__':
- test()
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py
deleted file mode 100644
index 575fb90..0000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py
+++ /dev/null
@@ -1,240 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import string
-import logging
-from typing import Any, List, Union
-
-
-def isChinese(ch: str):
- if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039':
- return True
- return False
-
-
-def isAllChinese(word: Union[List[Any], str]):
- word_lists = []
- for i in word:
- cur = i.replace(' ', '')
- cur = cur.replace('</s>', '')
- cur = cur.replace('<s>', '')
- word_lists.append(cur)
-
- if len(word_lists) == 0:
- return False
-
- for ch in word_lists:
- if isChinese(ch) is False:
- return False
- return True
-
-
-def isAllAlpha(word: Union[List[Any], str]):
- word_lists = []
- for i in word:
- cur = i.replace(' ', '')
- cur = cur.replace('</s>', '')
- cur = cur.replace('<s>', '')
- word_lists.append(cur)
-
- if len(word_lists) == 0:
- return False
-
- for ch in word_lists:
- if ch.isalpha() is False and ch != "'":
- return False
- elif ch.isalpha() is True and isChinese(ch) is True:
- return False
-
- return True
-
-
-# def abbr_dispose(words: List[Any]) -> List[Any]:
-def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
- words_size = len(words)
- word_lists = []
- abbr_begin = []
- abbr_end = []
- last_num = -1
- ts_lists = []
- ts_nums = []
- ts_index = 0
- for num in range(words_size):
- if num <= last_num:
- continue
-
- if len(words[num]) == 1 and words[num].encode('utf-8').isalpha():
- if num + 1 < words_size and words[
- num + 1] == ' ' and num + 2 < words_size and len(
- words[num +
- 2]) == 1 and words[num +
- 2].encode('utf-8').isalpha():
- # found the begin of abbr
- abbr_begin.append(num)
- num += 2
- abbr_end.append(num)
- # to find the end of abbr
- while True:
- num += 1
- if num < words_size and words[num] == ' ':
- num += 1
- if num < words_size and len(
- words[num]) == 1 and words[num].encode(
- 'utf-8').isalpha():
- abbr_end.pop()
- abbr_end.append(num)
- last_num = num
- else:
- break
- else:
- break
-
- for num in range(words_size):
- if words[num] == ' ':
- ts_nums.append(ts_index)
- else:
- ts_nums.append(ts_index)
- ts_index += 1
- last_num = -1
- for num in range(words_size):
- if num <= last_num:
- continue
-
- if num in abbr_begin:
- if time_stamp is not None:
- begin = time_stamp[ts_nums[num]][0]
- word_lists.append(words[num].upper())
- num += 1
- while num < words_size:
- if num in abbr_end:
- word_lists.append(words[num].upper())
- last_num = num
- break
- else:
- if words[num].encode('utf-8').isalpha():
- word_lists.append(words[num].upper())
- num += 1
- if time_stamp is not None:
- end = time_stamp[ts_nums[num]][1]
- ts_lists.append([begin, end])
- else:
- word_lists.append(words[num])
- if time_stamp is not None and words[num] != ' ':
- begin = time_stamp[ts_nums[num]][0]
- end = time_stamp[ts_nums[num]][1]
- ts_lists.append([begin, end])
- begin = end
-
- if time_stamp is not None:
- return word_lists, ts_lists
- else:
- return word_lists
-
-
-def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
- middle_lists = []
- word_lists = []
- word_item = ''
- ts_lists = []
-
- # wash words lists
- for i in words:
- word = ''
- if isinstance(i, str):
- word = i
- else:
- word = i.decode('utf-8')
-
- if word in ['<s>', '</s>', '<unk>']:
- continue
- else:
- middle_lists.append(word)
-
- # all chinese characters
- if isAllChinese(middle_lists):
- for i, ch in enumerate(middle_lists):
- word_lists.append(ch.replace(' ', ''))
- if time_stamp is not None:
- ts_lists = time_stamp
-
- # all alpha characters
- elif isAllAlpha(middle_lists):
- ts_flag = True
- for i, ch in enumerate(middle_lists):
- if ts_flag and time_stamp is not None:
- begin = time_stamp[i][0]
- end = time_stamp[i][1]
- word = ''
- if '@@' in ch:
- word = ch.replace('@@', '')
- word_item += word
- if time_stamp is not None:
- ts_flag = False
- end = time_stamp[i][1]
- else:
- word_item += ch
- word_lists.append(word_item)
- word_lists.append(' ')
- word_item = ''
- if time_stamp is not None:
- ts_flag = True
- end = time_stamp[i][1]
- ts_lists.append([begin, end])
- begin = end
-
- # mix characters
- else:
- alpha_blank = False
- ts_flag = True
- begin = -1
- end = -1
- for i, ch in enumerate(middle_lists):
- if ts_flag and time_stamp is not None:
- begin = time_stamp[i][0]
- end = time_stamp[i][1]
- word = ''
- if isAllChinese(ch):
- if alpha_blank is True:
- word_lists.pop()
- word_lists.append(ch)
- alpha_blank = False
- if time_stamp is not None:
- ts_flag = True
- ts_lists.append([begin, end])
- begin = end
- elif '@@' in ch:
- word = ch.replace('@@', '')
- word_item += word
- alpha_blank = False
- if time_stamp is not None:
- ts_flag = False
- end = time_stamp[i][1]
- elif isAllAlpha(ch):
- word_item += ch
- word_lists.append(word_item)
- word_lists.append(' ')
- word_item = ''
- alpha_blank = True
- if time_stamp is not None:
- ts_flag = True
- end = time_stamp[i][1]
- ts_lists.append([begin, end])
- begin = end
- else:
- raise ValueError('invalid character: {}'.format(ch))
-
- if time_stamp is not None:
- word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
- real_word_lists = []
- for ch in word_lists:
- if ch != ' ':
- real_word_lists.append(ch)
- sentence = ' '.join(real_word_lists).strip()
- return sentence, ts_lists, real_word_lists
- else:
- word_lists = abbr_dispose(word_lists)
- real_word_lists = []
- for ch in word_lists:
- if ch != ' ':
- real_word_lists.append(ch)
- sentence = ''.join(word_lists).strip()
- return sentence, real_word_lists
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py
deleted file mode 100644
index 3a01812..0000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import numpy as np
-
-
-def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0, total_offset=-1.5):
- if not len(char_list):
- return []
- START_END_THRESHOLD = 5
- MAX_TOKEN_DURATION = 30
- TIME_RATE = 10.0 * 6 / 1000 / 3 # 3 times upsampled
- cif_peak = us_cif_peak.reshape(-1)
- num_frames = cif_peak.shape[-1]
- if char_list[-1] == '</s>':
- char_list = char_list[:-1]
- # char_list = [i for i in text]
- timestamp_list = []
- new_char_list = []
- # for bicif model trained with large data, cif2 actually fires when a character starts
- # so treat the frames between two peaks as the duration of the former token
- fire_place = np.where(cif_peak>1.0-1e-4)[0] + total_offset # np format
- num_peak = len(fire_place)
- assert num_peak == len(char_list) + 1 # number of peaks is supposed to be number of tokens + 1
- # begin silence
- if fire_place[0] > START_END_THRESHOLD:
- # char_list.insert(0, '<sil>')
- timestamp_list.append([0.0, fire_place[0]*TIME_RATE])
- new_char_list.append('<sil>')
- # tokens timestamp
- for i in range(len(fire_place)-1):
- new_char_list.append(char_list[i])
- if i == len(fire_place)-2 or MAX_TOKEN_DURATION < 0 or fire_place[i+1] - fire_place[i] < MAX_TOKEN_DURATION:
- timestamp_list.append([fire_place[i]*TIME_RATE, fire_place[i+1]*TIME_RATE])
- else:
- # cut the duration to token and sil of the 0-weight frames last long
- _split = fire_place[i] + MAX_TOKEN_DURATION
- timestamp_list.append([fire_place[i]*TIME_RATE, _split*TIME_RATE])
- timestamp_list.append([_split*TIME_RATE, fire_place[i+1]*TIME_RATE])
- new_char_list.append('<sil>')
- # tail token and end silence
- if num_frames - fire_place[-1] > START_END_THRESHOLD:
- _end = (num_frames + fire_place[-1]) / 2
- timestamp_list[-1][1] = _end*TIME_RATE
- timestamp_list.append([_end*TIME_RATE, num_frames*TIME_RATE])
- new_char_list.append("<sil>")
- else:
- timestamp_list[-1][1] = num_frames*TIME_RATE
- if begin_time: # add offset time in model with vad
- for i in range(len(timestamp_list)):
- timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
- timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
- assert len(new_char_list) == len(timestamp_list)
- res_str = ""
- for char, timestamp in zip(new_char_list, timestamp_list):
- res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
- res = []
- for char, timestamp in zip(new_char_list, timestamp_list):
- if char != '<sil>':
- res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
- return res_str, res
-
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py
deleted file mode 100644
index 2edde11..0000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# -*- encoding: utf-8 -*-
-
-import functools
-import logging
-import pickle
-from pathlib import Path
-from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
-
-import numpy as np
-import yaml
-from onnxruntime import (GraphOptimizationLevel, InferenceSession,
- SessionOptions, get_available_providers, get_device)
-from typeguard import check_argument_types
-
-import warnings
-
-root_dir = Path(__file__).resolve().parent
-
-logger_initialized = {}
-
-
-class TokenIDConverter():
- def __init__(self, token_list: Union[List, str],
- ):
- check_argument_types()
-
- # self.token_list = self.load_token(token_path)
- self.token_list = token_list
- self.unk_symbol = token_list[-1]
-
- # @staticmethod
- # def load_token(file_path: Union[Path, str]) -> List:
- # if not Path(file_path).exists():
- # raise TokenIDConverterError(f'The {file_path} does not exist.')
- #
- # with open(str(file_path), 'rb') as f:
- # token_list = pickle.load(f)
- #
- # if len(token_list) != len(set(token_list)):
- # raise TokenIDConverterError('The Token exists duplicated symbol.')
- # return token_list
-
- def get_num_vocabulary_size(self) -> int:
- return len(self.token_list)
-
- def ids2tokens(self,
- integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
- if isinstance(integers, np.ndarray) and integers.ndim != 1:
- raise TokenIDConverterError(
- f"Must be 1 dim ndarray, but got {integers.ndim}")
- return [self.token_list[i] for i in integers]
-
- def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
- token2id = {v: i for i, v in enumerate(self.token_list)}
- if self.unk_symbol not in token2id:
- raise TokenIDConverterError(
- f"Unknown symbol '{self.unk_symbol}' doesn't exist in the token_list"
- )
- unk_id = token2id[self.unk_symbol]
- return [token2id.get(i, unk_id) for i in tokens]
-
-
-class CharTokenizer():
- def __init__(
- self,
- symbol_value: Union[Path, str, Iterable[str]] = None,
- space_symbol: str = "<space>",
- remove_non_linguistic_symbols: bool = False,
- ):
- check_argument_types()
-
- self.space_symbol = space_symbol
- self.non_linguistic_symbols = self.load_symbols(symbol_value)
- self.remove_non_linguistic_symbols = remove_non_linguistic_symbols
-
- @staticmethod
- def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
- if value is None:
- return set()
-
- if isinstance(value, Iterable[str]):
- return set(value)
-
- file_path = Path(value)
- if not file_path.exists():
- logging.warning("%s doesn't exist.", file_path)
- return set()
-
- with file_path.open("r", encoding="utf-8") as f:
- return set(line.rstrip() for line in f)
-
- def text2tokens(self, line: Union[str, list]) -> List[str]:
- tokens = []
- while len(line) != 0:
- for w in self.non_linguistic_symbols:
- if line.startswith(w):
- if not self.remove_non_linguistic_symbols:
- tokens.append(line[: len(w)])
- line = line[len(w):]
- break
- else:
- t = line[0]
- if t == " ":
- t = "<space>"
- tokens.append(t)
- line = line[1:]
- return tokens
-
- def tokens2text(self, tokens: Iterable[str]) -> str:
- tokens = [t if t != self.space_symbol else " " for t in tokens]
- return "".join(tokens)
-
- def __repr__(self):
- return (
- f"{self.__class__.__name__}("
- f'space_symbol="{self.space_symbol}"'
- f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
- f")"
- )
-
-
-
-class Hypothesis(NamedTuple):
- """Hypothesis data type."""
-
- yseq: np.ndarray
- score: Union[float, np.ndarray] = 0
- scores: Dict[str, Union[float, np.ndarray]] = dict()
- states: Dict[str, Any] = dict()
-
- def asdict(self) -> dict:
- """Convert data to JSON-friendly dict."""
- return self._replace(
- yseq=self.yseq.tolist(),
- score=float(self.score),
- scores={k: float(v) for k, v in self.scores.items()},
- )._asdict()
-
-
-class TokenIDConverterError(Exception):
- pass
-
-
-class ONNXRuntimeError(Exception):
- pass
-
-
-class OrtInferSession():
- def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
- device_id = str(device_id)
- sess_opt = SessionOptions()
- sess_opt.intra_op_num_threads = intra_op_num_threads
- sess_opt.log_severity_level = 4
- sess_opt.enable_cpu_mem_arena = False
- sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
-
- cuda_ep = 'CUDAExecutionProvider'
- cuda_provider_options = {
- "device_id": device_id,
- "arena_extend_strategy": "kNextPowerOfTwo",
- "cudnn_conv_algo_search": "EXHAUSTIVE",
- "do_copy_in_default_stream": "true",
- }
- cpu_ep = 'CPUExecutionProvider'
- cpu_provider_options = {
- "arena_extend_strategy": "kSameAsRequested",
- }
-
- EP_list = []
- if device_id != "-1" and get_device() == 'GPU' \
- and cuda_ep in get_available_providers():
- EP_list = [(cuda_ep, cuda_provider_options)]
- EP_list.append((cpu_ep, cpu_provider_options))
-
- self._verify_model(model_file)
- self.session = InferenceSession(model_file,
- sess_options=sess_opt,
- providers=EP_list)
-
- if device_id != "-1" and cuda_ep not in self.session.get_providers():
- warnings.warn(f'{cuda_ep} is not avaiable for current env, the inference part is automatically shifted to be executed under {cpu_ep}.\n'
- 'Please ensure the installed onnxruntime-gpu version matches your cuda and cudnn version, '
- 'you can check their relations from the offical web site: '
- 'https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html',
- RuntimeWarning)
-
- def __call__(self,
- input_content: List[Union[np.ndarray, np.ndarray]]) -> np.ndarray:
- input_dict = dict(zip(self.get_input_names(), input_content))
- try:
- return self.session.run(None, input_dict)
- except Exception as e:
- raise ONNXRuntimeError('ONNXRuntime inferece failed.') from e
-
- def get_input_names(self, ):
- return [v.name for v in self.session.get_inputs()]
-
- def get_output_names(self,):
- return [v.name for v in self.session.get_outputs()]
-
- def get_character_list(self, key: str = 'character'):
- return self.meta_dict[key].splitlines()
-
- def have_key(self, key: str = 'character') -> bool:
- self.meta_dict = self.session.get_modelmeta().custom_metadata_map
- if key in self.meta_dict.keys():
- return True
- return False
-
- @staticmethod
- def _verify_model(model_path):
- model_path = Path(model_path)
- if not model_path.exists():
- raise FileNotFoundError(f'{model_path} does not exists.')
- if not model_path.is_file():
- raise FileExistsError(f'{model_path} is not a file.')
-
-
-def read_yaml(yaml_path: Union[str, Path]) -> Dict:
- if not Path(yaml_path).exists():
- raise FileExistsError(f'The {yaml_path} does not exist.')
-
- with open(str(yaml_path), 'rb') as f:
- data = yaml.load(f, Loader=yaml.Loader)
- return data
-
-
-@functools.lru_cache()
-def get_logger(name='rapdi_paraformer'):
- """Initialize and get a logger by name.
- If the logger has not been initialized, this method will initialize the
- logger by adding one or two handlers, otherwise the initialized logger will
- be directly returned. During initialization, a StreamHandler will always be
- added.
- Args:
- name (str): Logger name.
- Returns:
- logging.Logger: The expected logger.
- """
- logger = logging.getLogger(name)
- if name in logger_initialized:
- return logger
-
- for logger_name in logger_initialized:
- if name.startswith(logger_name):
- return logger
-
- formatter = logging.Formatter(
- '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
- datefmt="%Y/%m/%d %H:%M:%S")
-
- sh = logging.StreamHandler()
- sh.setFormatter(formatter)
- logger.addHandler(sh)
- logger_initialized[name] = True
- logger.propagate = False
- return logger
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py
deleted file mode 100644
index 58913bb..0000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# -*- encoding: utf-8 -*-
-
-import os.path
-from pathlib import Path
-from typing import List, Union, Tuple
-
-import copy
-import librosa
-import numpy as np
-
-from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
- OrtInferSession, TokenIDConverter, get_logger,
- read_yaml)
-from .utils.postprocess_utils import sentence_postprocess
-from .utils.frontend import WavFrontend
-from .utils.timestamp_utils import time_stamp_lfr6_onnx
-from .utils.e2e_vad import E2EVadModel
-
-logging = get_logger()
-
-
-class Fsmn_vad():
- def __init__(self, model_dir: Union[str, Path] = None,
- batch_size: int = 1,
- device_id: Union[str, int] = "-1",
- quantize: bool = False,
- intra_op_num_threads: int = 4,
- max_end_sil: int = 800,
- ):
-
- if not Path(model_dir).exists():
- raise FileNotFoundError(f'{model_dir} does not exist.')
-
- model_file = os.path.join(model_dir, 'model.onnx')
- if quantize:
- model_file = os.path.join(model_dir, 'model_quant.onnx')
- config_file = os.path.join(model_dir, 'vad.yaml')
- cmvn_file = os.path.join(model_dir, 'vad.mvn')
- config = read_yaml(config_file)
-
- self.frontend = WavFrontend(
- cmvn_file=cmvn_file,
- **config['frontend_conf']
- )
- self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
- self.batch_size = batch_size
- self.vad_scorer = E2EVadModel(**config)
- self.max_end_sil = max_end_sil
-
- def prepare_cache(self, in_cache: list = []):
- if len(in_cache) > 0:
- return in_cache
-
- for i in range(4):
- cache = np.random.rand(1, 128, 19, 1).astype(np.float32)
- in_cache.append(cache)
- return in_cache
-
-
- def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
- waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
- waveform_nums = len(waveform_list)
- is_final = kwargs.get('kwargs', False)
-
- asr_res = []
- for beg_idx in range(0, waveform_nums, self.batch_size):
-
- end_idx = min(waveform_nums, beg_idx + self.batch_size)
- waveform = waveform_list[beg_idx:end_idx]
- feats, feats_len = self.extract_feat(waveform)
- param_dict = kwargs.get('param_dict', dict())
- in_cache = param_dict.get('cache', list())
- in_cache = self.prepare_cache(in_cache)
- try:
-
- scores, out_caches = self.infer(feats, *in_cache)
- param_dict['cache'] = out_caches
- segments = self.vad_scorer(scores, waveform, is_final=is_final, max_end_sil=self.max_end_sil)
-
- except ONNXRuntimeError:
- # logging.warning(traceback.format_exc())
- logging.warning("input wav is silence or noise")
- segments = ''
- asr_res.append(segments)
- # else:
- # preds = self.decode(am_scores, valid_token_lens)
- #
- # asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
-
- return asr_res
-
- def load_data(self,
- wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
- def load_wav(path: str) -> np.ndarray:
- waveform, _ = librosa.load(path, sr=fs)
- return waveform
-
- if isinstance(wav_content, np.ndarray):
- return [wav_content]
-
- if isinstance(wav_content, str):
- return [load_wav(wav_content)]
-
- if isinstance(wav_content, list):
- return [load_wav(path) for path in wav_content]
-
- raise TypeError(
- f'The type of {wav_content} is not in [str, np.ndarray, list]')
-
- def extract_feat(self,
- waveform_list: List[np.ndarray]
- ) -> Tuple[np.ndarray, np.ndarray]:
- feats, feats_len = [], []
- for waveform in waveform_list:
- speech, _ = self.frontend.fbank(waveform)
- feat, feat_len = self.frontend.lfr_cmvn(speech)
- feats.append(feat)
- feats_len.append(feat_len)
-
- feats = self.pad_feats(feats, np.max(feats_len))
- feats_len = np.array(feats_len).astype(np.int32)
- return feats, feats_len
-
- @staticmethod
- def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
- def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
- pad_width = ((0, max_feat_len - cur_len), (0, 0))
- return np.pad(feat, pad_width, 'constant', constant_values=0)
-
- feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
- feats = np.array(feat_res).astype(np.float32)
- return feats
-
- def infer(self, feats: np.ndarray,
- feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
- outputs = self.ort_infer([feats, feats_len])
- return outputs
-
- def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
- return [self.decode_one(am_score, token_num)
- for am_score, token_num in zip(am_scores, token_nums)]
-
- def decode_one(self,
- am_score: np.ndarray,
- valid_token_num: int) -> List[str]:
- yseq = am_score.argmax(axis=-1)
- score = am_score.max(axis=-1)
- score = np.sum(score, axis=-1)
-
- # pad with mask tokens to ensure compatibility with sos/eos tokens
- # asr_model.sos:1 asr_model.eos:2
- yseq = np.array([1] + yseq.tolist() + [2])
- hyp = Hypothesis(yseq=yseq, score=score)
-
- # remove sos/eos and get results
- last_pos = -1
- token_int = hyp.yseq[1:last_pos].tolist()
-
- # remove blank symbol id, which is assumed to be 0
- token_int = list(filter(lambda x: x not in (0, 2), token_int))
-
- # Change integer-ids to tokens
- token = self.converter.ids2tokens(token_int)
- token = token[:valid_token_num - self.pred_bias]
- # texts = sentence_postprocess(token)
- return token
diff --git a/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg b/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg
deleted file mode 100644
index b24107b..0000000
--- a/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg
+++ /dev/null
Binary files differ
diff --git a/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg b/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg
deleted file mode 100644
index a7ccaf5..0000000
--- a/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg
+++ /dev/null
Binary files differ
--
Gitblit v1.9.1