Dev gcf (#1611)
* 添加默认对Speech和BGM的输出格式约束
* 推理时可以合并vad的切分
* fix
---------
Co-authored-by: 常材 <gaochangfeng.gcf@alibaba-inc.com>
| | |
| | | from funasr.utils.timestamp_tools import timestamp_sentence |
| | | from funasr.download.download_from_hub import download_model |
| | | from funasr.utils.vad_utils import slice_padding_audio_samples |
| | | from funasr.utils.vad_utils import merge_vad |
| | | from funasr.utils.load_utils import load_audio_text_image_video |
| | | from funasr.train_utils.set_all_random_seed import set_all_random_seed |
| | | from funasr.train_utils.load_pretrained_model import load_pretrained_model |
| | |
| | | res = self.inference(input, input_len=input_len, model=self.vad_model, kwargs=self.vad_kwargs, **cfg) |
| | | end_vad = time.time() |
| | | |
| | | # FIX(gcf): concat the VAD clips for the SenseVoice model for better AED |
| | | if kwargs.get("merge_vad", False): |
| | | for i in range(len(res)): |
| | | res[i]['value'] = merge_vad(res[i]['value'], kwargs.get("merge_length", 15000)) |
| | | |
| | | # step.2 compute asr model |
| | | model = self.model |
| | |
| | | suppress_blank: bool = True # this will suppress blank outputs |
| | | |
| | | gain_event: bool = False # this will suppress blank outputs |
| | | gain_tokens_bg: Optional[Union[str, List[int]]] = "<|Applause|><|Laughter|>" |
| | | gain_tokens_ed: Optional[Union[str, List[int]]] = "<|/Applause|><|/Laughter|>" |
| | | gain_tokens_score: List[float] = field(default_factory=lambda: [25.0, 5.0]) #[25, 5] |
| | | gain_tokens_bg: Optional[Union[str, List[int]]] = "<|Speech|><|BGM|><|Applause|><|Laughter|>" |
| | | gain_tokens_ed: Optional[Union[str, List[int]]] = "<|/Speech|><|/BGM|><|/Applause|><|/Laughter|>" |
| | | gain_tokens_score: List[float] = field(default_factory=lambda: [1, 1, 25.0, 5.0]) #[25, 5] |
| | | |
| | | use_emo_threshold: bool = False # this will suppress blank outputs |
| | | emo_unk_token: Optional[Union[str, List[int]]] = "<|SPECIAL_TOKEN_1|>" |
| | |
| | | speech_list.append(speech_i) |
| | | speech_lengths_list.append(speech_lengths_i) |
| | | |
| | | return speech_list, speech_lengths_list |
| | | return speech_list, speech_lengths_list |
| | | |
def merge_vad(vad_result, max_length=15000):
    """Merge adjacent VAD segments into longer chunks of bounded length.

    All segment start/end points are pooled into one sorted list of unique
    boundaries. Walking that list, boundaries are absorbed into the current
    chunk for as long as the *next* boundary would still keep the chunk
    shorter than ``max_length``; otherwise the chunk is closed at the current
    boundary. The first chunk always starts at 0, so any audio before the
    first detected segment is included in it.

    Args:
        vad_result: Sequence of ``[start, end]`` pairs (indexable with
            ``[0]`` and ``[1]``). Units are whatever the caller uses; they
            only need to be consistent with ``max_length``.
        max_length: Target upper bound for a merged chunk. A closed chunk
            of at least ``1.5 * max_length`` is cut into equal pieces.

    Returns:
        A list of ``[start, end]`` lists that are contiguous (each chunk
        starts where the previous one ended) and never zero-length. Returns
        ``[]`` when ``vad_result`` is empty.
    """
    boundaries = sorted(
        {seg[0] for seg in vad_result} | {seg[1] for seg in vad_result}
    )
    if not boundaries:
        return []

    merged = []
    bg = 0  # start of the chunk currently being accumulated
    for cur, nxt in zip(boundaries, boundaries[1:]):
        # Keep growing the chunk while the next boundary still fits.
        if nxt - bg < max_length:
            continue

        span = cur - bg
        # Skip empty chunks: this happens when the very first boundary
        # coincides with the initial cursor (e.g. a segment starting at 0).
        if span > 0:
            if span < max_length * 1.5:
                merged.append([bg, cur])
            else:
                # A single stretch much longer than max_length: cut it into
                # equal pieces instead of emitting one oversized chunk.
                split_num = int(span) // max_length + 1
                spl_l = int(span) // split_num
                for j in range(split_num - 1):
                    merged.append([bg + j * spl_l, bg + (j + 1) * spl_l])
                # The last piece ends exactly at ``cur`` so the remainder of
                # the integer division is not dropped from the output.
                merged.append([bg + (split_num - 1) * spl_l, cur])
        bg = cur

    # Whatever is left after the last closed chunk forms the final chunk.
    if boundaries[-1] > bg:
        merged.append([bg, boundaries[-1]])
    return merged
| | | |