游雁
2023-03-24 f5d84f3194cc56dc363678d8018e563f0ce7e9e7
Merge branch 'main' of github.com:alibaba-damo-academy/FunASR
add
4个文件已修改
17 ■■■■■ 已修改文件
egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/vad_inference_online.py 10 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/models/e2e_vad.py 3 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
@@ -22,7 +22,7 @@
    sample_offset = 0
    
    step = 160 * 10
    param_dict = {'in_cache': dict()}
    param_dict = {'in_cache': dict(), 'max_end_sil': 800}
    for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
        if sample_offset + step >= speech_length - 1:
            step = speech_length - sample_offset
egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
@@ -22,7 +22,7 @@
    sample_offset = 0
    
    step = 80 * 10
    param_dict = {'in_cache': dict()}
    param_dict = {'in_cache': dict(), 'max_end_sil': 800}
    for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
        if sample_offset + step >= speech_length - 1:
            step = speech_length - sample_offset
funasr/bin/vad_inference_online.py
@@ -30,7 +30,8 @@
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.bin.vad_inference import Speech2VadSegment
header_colors = '\033[95m'
end_colors = '\033[0m'
class Speech2VadSegmentOnline(Speech2VadSegment):
@@ -55,7 +56,7 @@
    @torch.no_grad()
    def __call__(
            self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
            in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False
            in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800
    ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]:
        """Inference
@@ -86,7 +87,8 @@
                "feats": feats,
                "waveform": waveforms,
                "in_cache": in_cache,
                "is_final": is_final
                "is_final": is_final,
                "max_end_sil": max_end_sil
            }
            # a. To device
            batch = to_device(batch, device=self.device)
@@ -217,6 +219,7 @@
        vad_results = []
        batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict()
        is_final = param_dict['is_final'] if param_dict is not None else False
        max_end_sil = param_dict['max_end_sil'] if param_dict is not None else 800
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
@@ -224,6 +227,7 @@
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            batch['in_cache'] = batch_in_cache
            batch['is_final'] = is_final
            batch['max_end_sil'] = max_end_sil
            # do vad segment
            _, results, param_dict['in_cache'] = speech2vadsegment(**batch)
funasr/models/e2e_vad.py
old mode 100755 new mode 100644
@@ -473,8 +473,9 @@
        return segments, in_cache
    def forward_online(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(),
                is_final: bool = False
                is_final: bool = False, max_end_sil: int = 800
                ) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]:
        self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
        self.waveform = waveform  # compute decibel for each frame
        self.ComputeDecibel()
        self.ComputeScores(feats, in_cache)