New file
# ModelScope Model

## How to finetune and infer using a pretrained ModelScope Model

### Inference

Alternatively, you can use the finetuned model for inference directly.

- Set the parameters in `infer.py`:
    - <strong>audio_in:</strong> supports a wav file path, a URL, raw bytes, or already-parsed audio data.
    - <strong>output_dir:</strong> must be set when the input is a `wav.scp` list; results are written to this directory.
- Then run the pipeline with:

```shell
python infer.py
```

Modify inference-related parameters in `vad.yaml`; a configuration sketch follows the list below.

- max_end_silence_time: the trailing-silence duration used to decide that a sentence has ended; valid range 500 ms to 6000 ms, default 800 ms.
- speech_noise_thres: balances the speech and noise scores; valid range (-1, 1).
    - The closer the value is to -1, the more likely noise is judged as speech.
    - The closer the value is to 1, the more likely speech is judged as noise.
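As a minimal sketch of adjusting these two values, the snippet below rewrites a local `vad.yaml` with PyYAML. The `vad_post_conf` section name is borrowed from the training code's `args.vad_post_conf`; assuming it is the top-level key that holds these parameters is a guess about the file layout, not a documented schema.

```python
import yaml

# Assumed layout: vad.yaml has a `vad_post_conf` section holding the
# post-processing parameters listed above (an assumption, not documented).
with open('vad.yaml', 'r', encoding='utf-8') as f:
    conf = yaml.safe_load(f)

conf['vad_post_conf']['max_end_silence_time'] = 500  # end a sentence after 500 ms of silence
conf['vad_post_conf']['speech_noise_thres'] = 0.6    # bias frames toward being judged as noise

with open('vad.yaml', 'w', encoding='utf-8') as f:
    yaml.safe_dump(conf, f)
```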
New file
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    # A remote wav file; a local path, bytes, or parsed audio also work.
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav'
    output_dir = None
    inference_pipeline = pipeline(
        task=Tasks.voice_activity_detection,
        model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
        model_revision=None,
        output_dir=output_dir,
        batch_size=1,
    )
    segments_result = inference_pipeline(audio_in=audio_in)
    print(segments_result)
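For `wav.scp` input, the same pipeline can be pointed at a Kaldi-style list file; as noted in the parameter list above, `output_dir` must then be set. A sketch with placeholder paths:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    # Placeholder list file; each line is typically "<utt_id> <path/to.wav>".
    audio_in = './data/wav.scp'
    inference_pipeline = pipeline(
        task=Tasks.voice_activity_detection,
        model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
        output_dir='./results',  # required for wav.scp input
        batch_size=1,
    )
    inference_pipeline(audio_in=audio_in)
```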
New file
# ModelScope Model

## How to finetune and infer using a pretrained ModelScope Model

### Inference

Alternatively, you can use the finetuned model for inference directly.

- Set the parameters in `infer.py`:
    - <strong>audio_in:</strong> supports a wav file path, a URL, raw bytes, or already-parsed audio data.
    - <strong>output_dir:</strong> must be set when the input is a `wav.scp` list; results are written to this directory.
- Then run the pipeline with:

```shell
python infer.py
```

Modify inference-related parameters in `vad.yaml`:

- max_end_silence_time: the trailing-silence duration used to decide that a sentence has ended; valid range 500 ms to 6000 ms, default 800 ms.
- speech_noise_thres: balances the speech and noise scores; valid range (-1, 1); a toy illustration follows this list.
    - The closer the value is to -1, the more likely noise is judged as speech.
    - The closer the value is to 1, the more likely speech is judged as noise.
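To make the direction of `speech_noise_thres` concrete, here is a toy decision rule, not the model's actual post-processing: a frame counts as speech when its speech score exceeds its noise score by more than the threshold, so a value near -1 admits almost every frame while a value near 1 rejects almost all of them.

```python
# Toy illustration only: the real FSMN-VAD applies smoothing and a state
# machine on top of the frame scores, not this bare comparison.
def is_speech_frame(speech_score: float, noise_score: float,
                    speech_noise_thres: float) -> bool:
    return speech_score - noise_score > speech_noise_thres

print(is_speech_frame(0.4, 0.6, -0.9))  # True: thres near -1, noise passes as speech
print(is_speech_frame(0.9, 0.1, 0.9))   # False: thres near 1, clear speech still rejected
```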
New file
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    # 8 kHz example audio for the 8k model variant.
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example_8k.wav'
    output_dir = './output_dir'
    inference_pipeline = pipeline(
        task=Tasks.voice_activity_detection,
        model="damo/speech_fsmn_vad_zh-cn-8k-common",
        model_revision='v1.1.1',
        output_dir=output_dir,
        batch_size=1,
    )
    segments_result = inference_pipeline(audio_in=audio_in)
    print(segments_result)
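With `output_dir` set, the pipeline also writes its results under `./output_dir`. The returned `segments_result` holds the detected speech segments as start/end timestamps in milliseconds; a sketch of consuming them, assuming the common `{'text': [[beg_ms, end_ms], ...]}` layout (the exact key may vary across versions):

```python
# Assumed result layout: {'text': [[beg_ms, end_ms], ...]}
for beg_ms, end_ms in segments_result.get('text', []):
    print(f'speech segment: {beg_ms} ms -> {end_ms} ms')
```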
...
        self.device = device
        self.dtype = dtype
        self.frontend = frontend
        self.batch_size = batch_size

    @torch.no_grad()
    def __call__(
...

        # b. Forward encoder in streaming chunks of at most 6000 frames
        t_offset = 0
        step = min(feats_len, 6000)
        # One result list per batch item; a list comprehension avoids the
        # aliasing bug of `[[]] * batch_size`, where every entry would share
        # the same underlying list.
        segments = [[] for _ in range(self.batch_size)]
        for t_offset in range(0, feats_len, min(step, feats_len - t_offset)):
            # Clamp the last chunk so it ends exactly at feats_len
            if t_offset + step >= feats_len - 1:
                step = feats_len - t_offset
...
            batch = to_device(batch, device=self.device)
            segments_part = self.vad_model(**batch)
            if segments_part:
                # Accumulate per-item segments across streaming chunks
                for batch_num in range(0, self.batch_size):
                    segments[batch_num] += segments_part[batch_num]
        return segments

...
    assert all(isinstance(s, str) for s in keys), keys
    _bs = len(next(iter(batch.values())))
    assert len(keys) == _bs, f"{len(keys)} != {_bs}"
    # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}

    # do vad segment
    results = speech2vadsegment(**batch)
...


class E2EVadModel(nn.Module):
    def __init__(self, encoder: FSMN, vad_post_args: Dict[str, Any], streaming=False):
        super(E2EVadModel, self).__init__()
        self.vad_opts = VADXOptions(**vad_post_args)
        self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,
...
        self.data_buf = None
        self.data_buf_all = None
        self.waveform = None
        self.streaming = streaming
        self.ResetDetection()

    def AllResetDetection(self):
...
        if not is_final_send:
            self.DetectCommonFrames()
        else:
            if self.streaming:
                self.DetectLastFrames()
            else:
                self.AllResetDetection()
                self.DetectAllFrames()  # offline decode and is_final_send == True
        segments = []
        for batch_num in range(0, feats.shape[0]):  # only support batch_size = 1 now
            segment_batch = []
...
                    self.output_data_buf_offset += 1  # advance past the consumed segment
            if segment_batch:
                segments.append(segment_batch)

        if is_final_send:
            self.AllResetDetection()
        return segments

    def DetectCommonFrames(self) -> int:
...
            else:
                self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)

        return 0

    def DetectAllFrames(self) -> int:
        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
            return 0
        if self.vad_opts.nn_eval_block_size != self.vad_opts.dcd_block_size:
            frame_state = FrameState.kFrameStateInvalid
            for t in range(0, self.frm_cnt):
                frame_state = self.GetFrameState(t)
                self.DetectOneFrame(frame_state, t, t == self.frm_cnt - 1)
        else:
            pass
        return 0

    def DetectOneFrame(self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool) -> None:
...
        model_class = model_choices.get_class(args.model)
    except AttributeError:
        model_class = model_choices.get_class("e2evad")
    model = model_class(encoder=encoder, vad_post_args=args.vad_post_conf,
                        streaming=args.encoder_conf.get('streaming', False))

    return model