python/FunASR-XL.git

			@@ -22,7 +22,7 @@
			sample_offset = 0

			step = 160 * 10
			param_dict = {'in_cache': dict()}
			param_dict = {'in_cache': dict(), 'max_end_sil': 800}
			for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
			if sample_offset + step >= speech_length - 1:
			step = speech_length - sample_offset

			@@ -22,7 +22,7 @@
			sample_offset = 0

			step = 80 * 10
			param_dict = {'in_cache': dict()}
			param_dict = {'in_cache': dict(), 'max_end_sil': 800}
			for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
			if sample_offset + step >= speech_length - 1:
			step = speech_length - sample_offset

			@@ -30,7 +30,8 @@
			from funasr.models.frontend.wav_frontend import WavFrontend
			from funasr.bin.vad_inference import Speech2VadSegment


			header_colors = '\033[95m'
			end_colors = '\033[0m'


			class Speech2VadSegmentOnline(Speech2VadSegment):
			@@ -55,7 +56,7 @@
			@torch.no_grad()
			def __call__(
			self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
			in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False
			in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800
			) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]:
			"""Inference

			@@ -86,7 +87,8 @@
			"feats": feats,
			"waveform": waveforms,
			"in_cache": in_cache,
			"is_final": is_final
			"is_final": is_final,
			"max_end_sil": max_end_sil
			}
			# a. To device
			batch = to_device(batch, device=self.device)
			@@ -217,6 +219,7 @@
			vad_results = []
			batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict()
			is_final = param_dict['is_final'] if param_dict is not None else False
			max_end_sil = param_dict['max_end_sil'] if param_dict is not None else 800
			for keys, batch in loader:
			assert isinstance(batch, dict), type(batch)
			assert all(isinstance(s, str) for s in keys), keys
			@@ -224,6 +227,7 @@
			assert len(keys) == _bs, f"{len(keys)} != {_bs}"
			batch['in_cache'] = batch_in_cache
			batch['is_final'] = is_final
			batch['max_end_sil'] = max_end_sil

			# do vad segment
			_, results, param_dict['in_cache'] = speech2vadsegment(**batch)

old mode 100755 new mode 100644

			@@ -473,8 +473,9 @@
			return segments, in_cache

			def forward_online(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(),
			is_final: bool = False
			is_final: bool = False, max_end_sil: int = 800
			) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]:
			self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
			self.waveform = waveform # compute decibel for each frame
			self.ComputeDecibel()
			self.ComputeScores(feats, in_cache)

	egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/vad_inference_online.py	10 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/e2e_vad.py	3 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史