python/FunASR-XL.git

			@@ -21,6 +21,7 @@
			from funasr.utils.timestamp_tools import timestamp_sentence
			from funasr.download.download_from_hub import download_model
			from funasr.utils.vad_utils import slice_padding_audio_samples
			from funasr.utils.vad_utils import merge_vad
			from funasr.utils.load_utils import load_audio_text_image_video
			from funasr.train_utils.set_all_random_seed import set_all_random_seed
			from funasr.train_utils.load_pretrained_model import load_pretrained_model
			@@ -295,6 +296,10 @@
			res = self.inference(input, input_len=input_len, model=self.vad_model, kwargs=self.vad_kwargs, **cfg)
			end_vad = time.time()

			# FIX(gcf): concatenate the VAD clips for the SenseVoice model for better AED
			if kwargs.get("merge_vad", False):
			for i in range(len(res)):
			res[i]['value'] = merge_vad(res[i]['value'], kwargs.get("merge_length", 15000))

			# step.2 compute asr model
			model = self.model

			@@ -119,9 +119,9 @@
			suppress_blank: bool = True # this will suppress blank outputs

			gain_event: bool = False # this will suppress blank outputs
			gain_tokens_bg: Optional[Union[str, List[int]]] = "<\|Applause\|><\|Laughter\|>"
			gain_tokens_ed: Optional[Union[str, List[int]]] = "<\|/Applause\|><\|/Laughter\|>"
			gain_tokens_score: List[float] = field(default_factory=lambda: [25.0, 5.0]) #[25, 5]
			gain_tokens_bg: Optional[Union[str, List[int]]] = "<\|Speech\|><\|BGM\|><\|Applause\|><\|Laughter\|>"
			gain_tokens_ed: Optional[Union[str, List[int]]] = "<\|/Speech\|><\|/BGM\|><\|/Applause\|><\|/Laughter\|>"
			gain_tokens_score: List[float] = field(default_factory=lambda: [1, 1, 25.0, 5.0]) #[25, 5]

			use_emo_threshold: bool = False # this will suppress blank outputs
			emo_unk_token: Optional[Union[str, List[int]]] = "<\|SPECIAL_TOKEN_1\|>"

			@@ -28,4 +28,27 @@
			speech_list.append(speech_i)
			speech_lengths_list.append(speech_lengths_i)

			return speech_list, speech_lengths_list
			return speech_list, speech_lengths_list

			def merge_vad(vad_result, max_length=15000):
			    """Merge adjacent VAD segments into longer, contiguous clips.
			
			    All start/end boundaries found in ``vad_result`` are collected,
			    de-duplicated and sorted.  Walking over them, boundaries are accumulated
			    into the current clip (which starts at 0 for the first clip) until adding
			    the next boundary would make the clip reach ``max_length``; the clip is
			    then closed at the current boundary.  A closed clip whose length is at
			    least ``1.5 * max_length`` is cut into equal integer-sized pieces.
			
			    Args:
			        vad_result: sequence of ``[start, end]`` pairs.  Presumably
			            millisecond timestamps as produced by the VAD model -- confirm
			            against the caller.
			        max_length: target upper bound for a merged clip, in the same unit
			            as the timestamps.
			
			    Returns:
			        A list of ``[start, end]`` pairs covering the range from 0 to the
			        last boundary without gaps.  Returns ``[]`` for empty input.
			        NOTE: the trailing clip (from the last cut to the final boundary)
			        is appended as-is and is never split, so it may exceed
			        ``max_length``.
			    """
			    boundaries = sorted(
			        {seg[0] for seg in vad_result} | {seg[1] for seg in vad_result}
			    )
			    if not boundaries:
			        return []
			    merged = []
			    bg = 0  # start of the clip currently being accumulated
			    for cur, nxt in zip(boundaries, boundaries[1:]):
			        # Keep accumulating while the next boundary still fits in the clip.
			        if nxt - bg < max_length:
			            continue
			        span = cur - bg
			        if span <= 0:
			            # Nothing accumulated yet (cut point equals clip start): do not
			            # emit an empty [bg, bg] clip.
			            pass
			        elif span < max_length * 1.5:
			            merged.append([bg, cur])
			        else:
			            # Over-long clip: cut into equal pieces.  The last piece is
			            # pinned to ``cur`` so the floor division remainder is not lost.
			            split_num = int(span) // max_length + 1
			            step = int(span) // split_num
			            for j in range(split_num):
			                piece_end = cur if j == split_num - 1 else bg + (j + 1) * step
			                merged.append([bg + j * step, piece_end])
			        bg = cur
			    merged.append([bg, boundaries[-1]])
			    return merged

	funasr/auto/auto_model.py	5 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/sense_voice/whisper_lib/decoding.py	6 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/utils/vad_utils.py	25 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史