Merge pull request #129 from alibaba-damo-academy/dev_zly
Dev zly
old mode 100755
new mode 100644
File was renamed from egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/infer.py

     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav'
     output_dir = None
     inference_pipline = pipeline(
-        task=Tasks.auto_speech_recognition,
+        task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
         model_revision=None,
         output_dir=output_dir,
copy from egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md
copy to egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
old mode 100755
new mode 100644
copy from egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/infer.py
copy to egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py

 from modelscope.utils.constant import Tasks

 if __name__ == '__main__':
-    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav'
+    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example_8k.wav'
     output_dir = None
     inference_pipline = pipeline(
-        task=Tasks.auto_speech_recognition,
-        model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-        model_revision=None,
-        output_dir=output_dir,
+        task=Tasks.voice_activity_detection,
+        model="damo/speech_fsmn_vad_zh-cn-8k-common",
+        model_revision='v1.1.1',
+        output_dir='./output_dir',
+        batch_size=1,
     )
     segments_result = inference_pipline(audio_in=audio_in)
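A minimal end-to-end sketch of driving the copied 8k script, assuming a working ModelScope install; the local file name is a placeholder, and the segment payload (a list of [begin_ms, end_ms] pairs) is the usual FunASR VAD convention rather than something this diff guarantees:

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Build the pipeline once, then reuse it for any number of inputs.
vad = pipeline(
    task=Tasks.voice_activity_detection,
    model="damo/speech_fsmn_vad_zh-cn-8k-common",
    model_revision='v1.1.1',
)
# 'my_call_8k.wav' is a placeholder; the vad_example_8k.wav URL above works too.
segments_result = vad(audio_in='my_call_8k.wav')
print(segments_result)  # expected by convention: speech segments as [begin_ms, end_ms] pairs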

         if asr_train_args.encoder_conf["input_layer"] == "conv2d":
             self.encoder_downsampling_factor = 4

     @torch.no_grad()
     def __call__(
-            self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, begin_time: int = 0, end_time: int = None,
+            self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+            begin_time: int = 0, end_time: int = None,
     ):
         """Inference

         enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor

         predictor_outs = self.asr_model.calc_predictor(enc, enc_len)
-        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], predictor_outs[2], predictor_outs[3]
+        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
+            predictor_outs[2], predictor_outs[3]
         pre_token_length = pre_token_length.round().long()
         if torch.max(pre_token_length) < 1:
             return []
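The early return above is the empty-hypothesis guard: the CIF-style predictor emits per-frame weights whose sum approximates the output token count, so a rounded length below 1 means there is nothing to decode. A toy illustration of that gate, with made-up alphas (names are illustrative, not taken from this diff):

import torch

# Toy gate: per-frame predictor weights sum to a fractional token count.
alphas = torch.tensor([[0.10, 0.20, 0.10, 0.05]])   # made-up CIF weights
pre_token_length = alphas.sum(dim=-1)               # ~0.45 predicted tokens
pre_token_length = pre_token_length.round().long()  # -> tensor([0])
if torch.max(pre_token_length) < 1:
    print("no tokens predicted, skip decoding")     # mirrors the `return []` above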

         else:
             text = None

         timestamp = time_stamp_lfr6_pl(us_alphas[i], us_cif_peak[i], copy.copy(token), begin_time, end_time)
         results.append((text, token, token_int, timestamp, enc_len_batch_total, lfr_factor))

         # assert check_return_type(results)
         return results
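Callers unpack these entries positionally; a short usage sketch for a `results` list returned by the __call__ above:

# Reading back one hypothesis per entry, following the tuple layout above.
for text, token, token_int, timestamp, enc_len_batch_total, lfr_factor in results:
    print(text)       # postprocessed transcript; may be None, per the else branch above
    print(timestamp)  # per-token times from time_stamp_lfr6_pl, aligned with `token`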


 class Speech2VadSegment:
     """Speech2VadSegment class

         self.device = device
         self.dtype = dtype
         self.frontend = frontend
         self.batch_size = batch_size

     @torch.no_grad()
     def __call__(

             feats_len = feats_len.int()
         else:
             raise Exception("Need to extract feats first, please configure frontend configuration")
-        batch = {"feats": feats, "feats_lengths": feats_len, "waveform": speech}

+        # b. Forward Encoder streaming
+        t_offset = 0
+        step = min(feats_len, 6000)
+        segments = [[]] * self.batch_size
         for t_offset in range(0, feats_len, min(step, feats_len - t_offset)):
             if t_offset + step >= feats_len - 1:
                 step = feats_len - t_offset
                 is_final_send = True
             else:
                 is_final_send = False
             batch = {
                 "feats": feats[:, t_offset:t_offset + step, :],
                 "waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)],
                 "is_final_send": is_final_send
             }
             # a. To device
             batch = to_device(batch, device=self.device)

             # b. Forward Encoder
-            segments = self.vad_model(**batch)
+            segments_part = self.vad_model(**batch)
+            if segments_part:
+                for batch_num in range(0, self.batch_size):
+                    segments[batch_num] += segments_part[batch_num]

         return fbanks, segments
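The waveform slice in the loop above follows standard 16 kHz fbank framing, where a 10 ms hop is 160 samples and a 25 ms window is 400 samples, so recomputing frames [t_offset, t_offset + step) needs samples t_offset * 160 through (t_offset + step - 1) * 160 + 400, clamped to the signal length. A standalone sketch of that mapping (the constants are the conventional ones, assumed rather than read from a config here):

# Frame-to-sample mapping used by the chunked VAD loop above (16 kHz assumed):
# one fbank frame = 25 ms window (400 samples) advanced by a 10 ms hop (160 samples).
HOP = 160   # 10 ms at 16 kHz
WIN = 400   # 25 ms at 16 kHz

def chunk_sample_range(t_offset: int, step: int, n_samples: int):
    """Samples needed to recompute frames [t_offset, t_offset + step)."""
    begin = t_offset * HOP
    end = min(n_samples, (t_offset + step - 1) * HOP + WIN)
    return begin, end

# 70 s of audio processed in 6000-frame (60 s) chunks, as in the loop above:
print(chunk_sample_range(0, 6000, 16000 * 70))     # (0, 960240)
print(chunk_sample_range(6000, 1000, 16000 * 70))  # (960000, 1120000), clamped by min()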


 def inference(

         punc_model_file: Optional[str] = None,
         **kwargs,
 ):

     inference_pipeline = inference_modelscope(
         maxlenratio=maxlenratio,
         minlenratio=minlenratio,

         **kwargs,
     )
     return inference_pipeline(data_path_and_name_and_type, raw_inputs)
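Note the two-step design: inference_modelscope() loads the model once and returns the _forward closure defined below, while inference() builds and calls it in one shot. Callers that decode repeatedly should keep the closure alive instead of paying model loading on every call; schematically (argument values are placeholders):

# Build once, call many times: the closure keeps the loaded model in memory.
forward = inference_modelscope(maxlenratio=0.0, minlenratio=0.0)  # plus the remaining args
results_a = forward(data_path_and_name_and_type, raw_inputs=None)
results_b = forward(another_data_path_and_name_and_type, raw_inputs=None)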


 def inference_modelscope(
         maxlenratio: float,

                 if j == 0:
                     result_segments = result_cur
                 else:
-                    result_segments = [[result_segments[0][i] + result_cur[0][i] for i in range(len(result_cur[0]))]]
+                    result_segments = [
+                        [result_segments[0][i] + result_cur[0][i] for i in range(len(result_cur[0]))]]

             key = keys[0]
             result = result_segments[0]
             text, token, token_int = result[0], result[1], result[2]
             time_stamp = None if len(result) < 4 else result[3]

             if use_timestamp and time_stamp is not None:
                 postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)

             logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc))
             return asr_result_list

     return _forward


 def get_parser():
     parser = config_argparse.ArgumentParser(
         description="ASR Decoding",

         self.device = device
         self.dtype = dtype
         self.frontend = frontend
         self.batch_size = batch_size

     @torch.no_grad()
     def __call__(

             feats_len = feats_len.int()
         else:
             raise Exception("Need to extract feats first, please configure frontend configuration")
         # batch = {"feats": feats, "waveform": speech, "is_final_send": True}
         # segments = self.vad_model(**batch)

-        # b. Forward Encoder sreaming
-        segments = []
-        step = 6000
+        # b. Forward Encoder streaming
+        t_offset = 0
+        step = min(feats_len, 6000)
+        segments = [[]] * self.batch_size
         for t_offset in range(0, feats_len, min(step, feats_len - t_offset)):
             if t_offset + step >= feats_len - 1:
                 step = feats_len - t_offset

             batch = to_device(batch, device=self.device)
             segments_part = self.vad_model(**batch)
             if segments_part:
-                segments += segments_part
-                #print(segments)
+                for batch_num in range(0, self.batch_size):
+                    segments[batch_num] += segments_part[batch_num]
         return segments
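One caveat about the new accumulator shared by both loops: `[[]] * self.batch_size` creates batch_size references to a single list, so `segments[i] += ...` would write into every entry once batch_size > 1. That is harmless today only because, as a comment further down notes, only batch_size = 1 is supported; a minimal demonstration of the pitfall:

# Pitfall demo: list multiplication copies references, not lists.
segments = [[]] * 2              # two names for the SAME list object
segments[0] += [[70, 2340]]
print(segments)                  # [[[70, 2340]], [[70, 2340]]], both rows changed
# Independent rows require a fresh list per entry:
segments = [[] for _ in range(2)]
segments[0] += [[70, 2340]]
print(segments)                  # [[[70, 2340]], []]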


         assert all(isinstance(s, str) for s in keys), keys
         _bs = len(next(iter(batch.values())))
         assert len(keys) == _bs, f"{len(keys)} != {_bs}"
         # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}

         # do vad segment
         results = speech2vadsegment(**batch)


 class E2EVadModel(nn.Module):
-    def __init__(self, encoder: FSMN, vad_post_args: Dict[str, Any]):
+    def __init__(self, encoder: FSMN, vad_post_args: Dict[str, Any], streaming=False):
         super(E2EVadModel, self).__init__()
         self.vad_opts = VADXOptions(**vad_post_args)
         self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,

         self.data_buf = None
         self.data_buf_all = None
         self.waveform = None
+        self.streaming = streaming
         self.ResetDetection()

     def AllResetDetection(self):

         if not is_final_send:
             self.DetectCommonFrames()
         else:
             if self.streaming:
                 self.DetectLastFrames()
             else:
                 self.AllResetDetection()
                 self.DetectAllFrames()  # offline decode and is_final_send == True
         segments = []
         for batch_num in range(0, feats.shape[0]):  # only support batch_size = 1 now
             segment_batch = []

                 self.output_data_buf_offset += 1  # need update this parameter
             if segment_batch:
                 segments.append(segment_batch)

         if is_final_send:
             self.AllResetDetection()
         return segments
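The forward contract implied above: callers stream consecutive chunks with is_final_send=False and mark only the last one, after which the model resets its own state. A schematic driver (the chunk construction and the vad_model handle reuse names from the code above and are placeholders here):

# Schematic driver for the is_final_send contract (illustrative only).
all_segments = [[] for _ in range(batch_size)]           # batch_size = 1 supported
for i, (feats_chunk, wave_chunk) in enumerate(chunks):   # chunks as in Speech2VadSegment
    batch = {
        "feats": feats_chunk,
        "waveform": wave_chunk,
        "is_final_send": i == len(chunks) - 1,           # True only on the last chunk
    }
    segments_part = vad_model(**batch)                   # E2EVadModel.forward
    if segments_part:
        for b in range(batch_size):
            all_segments[b] += segments_part[b]
# After the final send the model has already called AllResetDetection(),
# so it can be reused on the next utterance without a manual reset.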

     def DetectCommonFrames(self) -> int:

         else:
             self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)

         return 0

     def DetectAllFrames(self) -> int:
         if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
             return 0
         if self.vad_opts.nn_eval_block_size != self.vad_opts.dcd_block_size:
             frame_state = FrameState.kFrameStateInvalid
             for t in range(0, self.frm_cnt):
                 frame_state = self.GetFrameState(t)
                 self.DetectOneFrame(frame_state, t, t == self.frm_cnt - 1)
         else:
             pass
         return 0

     def DetectOneFrame(self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool) -> None:

         model_class = model_choices.get_class(args.model)
     except AttributeError:
         model_class = model_choices.get_class("e2evad")
-    model = model_class(encoder=encoder, vad_post_args=args.vad_post_conf)
+    model = model_class(encoder=encoder, vad_post_args=args.vad_post_conf,
+                        streaming=args.encoder_conf.get('streaming', False))

     return model
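The try/except above is a registry lookup with a default: any model name that model_choices cannot resolve falls back to the "e2evad" class. A minimal standalone sketch of the pattern (this registry is illustrative, not FunASR's actual model_choices):

# Minimal class registry with a fallback, mirroring the lookup above.
class ModelChoices:
    def __init__(self, registry):
        self._registry = registry

    def get_class(self, name):
        if name not in self._registry:
            raise AttributeError(name)      # matches the except clause above
        return self._registry[name]

model_choices = ModelChoices({"e2evad": dict})  # dict stands in for E2EVadModel

try:
    model_class = model_choices.get_class("not_registered")
except AttributeError:
    model_class = model_choices.get_class("e2evad")  # default choice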