| | |
| | | encoder_class = tables.encoder_classes.get(encoder) |
| | | encoder = encoder_class(**encoder_conf) |
| | | self.encoder = encoder |
| | | self.encoder_conf = encoder_conf |
| | | |
def ResetDetection(self, cache: dict = {}):
    # Reset per-stream detection state held in cache["stats"] (mutated in place).
    # The `{}` default is a sentinel only; callers always pass the shared cache dict.
    # NOTE(review): only the silence-frame counter is visible in this chunk;
    # upstream variants reset additional stats fields here — confirm nothing
    # else needs clearing before relying on this as a full reset.
    cache["stats"].continous_silence_frame_count = 0
| | |
| | | |
| | | return frame_state |
| | | |
def forward(self, feats: torch.Tensor,
            waveform: torch.tensor,
            cache: dict = {},
            is_final: bool = False,
            **kwargs,
            ):
    """Run one streaming VAD step and emit any decided speech segments.

    Fixes a merge/diff artifact in this chunk: the method had two interleaved
    signatures and both the old and new copies of the non-streaming segment
    filter / offset increment. This body keeps the newer variant only.

    Args:
        feats: encoder features for this chunk.
        waveform: raw audio for this chunk; stored on cache["stats"] so the
            per-frame decibel can be computed from it.
        cache: per-stream state dict; cache["stats"] is mutated in place.
            The `{}` default is a sentinel — callers pass the real cache.
        is_final: True on the last chunk of the stream.
        **kwargs: may carry "is_streaming_input" (default True).

    Returns:
        A list with at most one per-batch list of [start_ms, end_ms] pairs
        (only batch_size == 1 is supported by the caller). In streaming mode
        a segment may be open-ended: [beg, -1] or [-1, end].
    """
    cache["stats"].waveform = waveform  # decibel is computed per frame from raw audio
    is_streaming_input = kwargs.get("is_streaming_input", True)
    self.ComputeDecibel(cache=cache)
    self.ComputeScores(feats, cache=cache)
    if not is_final:
        self.DetectCommonFrames(cache=cache)
    else:
        # NOTE(review): reconstructed from the upstream implementation — the
        # diff view dropped these unchanged lines; confirm DetectLastFrames
        # is the final-chunk counterpart of DetectCommonFrames in this file.
        self.DetectLastFrames(cache=cache)

    segments = []
    # NOTE(review): loop reconstructed; only batch_size == 1 is supported.
    for _ in range(feats.shape[0]):
        segment_batch = []
        out_buf = cache["stats"].output_data_buf
        for i in range(cache["stats"].output_data_buf_offset, len(out_buf)):
            item = out_buf[i]
            if is_streaming_input:
                # Streaming: may return [beg, -1], [-1, end], [beg, end], or skip.
                if not item.contain_seg_start_point:
                    continue
                if not cache["stats"].next_seg and not item.contain_seg_end_point:
                    continue
                start_ms = item.start_ms if cache["stats"].next_seg else -1
                if item.contain_seg_end_point:
                    end_ms = item.end_ms
                    cache["stats"].next_seg = True
                    cache["stats"].output_data_buf_offset += 1
                else:
                    end_ms = -1
                    cache["stats"].next_seg = False
                segment = [start_ms, end_ms]
            else:
                # Non-streaming: only emit fully closed [beg, end] segments
                # (unless this is the final chunk, which flushes everything).
                if not is_final and (not item.contain_seg_start_point
                                     or not item.contain_seg_end_point):
                    continue
                segment = [item.start_ms, item.end_ms]
                cache["stats"].output_data_buf_offset += 1  # entry consumed
            segment_batch.append(segment)
        if segment_batch:
            segments.append(segment_batch)
    # if is_final:
    #     # reset state for the next stream
    #     self.AllResetDetection()
    return segments
| | | chunk_stride_samples = int(chunk_size * frontend.fs / 1000) |
| | | |
| | | time1 = time.perf_counter() |
| | | cfg = {"is_final": kwargs.get("is_final", False)} |
| | | is_streaming_input = kwargs.get("is_streaming_input", False) if chunk_size >= 15000 else kwargs.get("is_streaming_input", True) |
| | | is_final = kwargs.get("is_final", False) if is_streaming_input else kwargs.get("is_final", True) |
| | | cfg = {"is_final": is_final, "is_streaming_input": is_streaming_input} |
| | | audio_sample_list = load_audio_text_image_video(data_in, |
| | | fs=frontend.fs, |
| | | audio_fs=kwargs.get("fs", 16000), |
| | |
| | | cache=cfg, |
| | | ) |
| | | _is_final = cfg["is_final"] # if data_in is a file or url, set is_final=True |
| | | |
| | | is_streaming_input = cfg["is_streaming_input"] |
| | | time2 = time.perf_counter() |
| | | meta_data["load_data"] = f"{time2 - time1:0.3f}" |
| | | assert len(audio_sample_list) == 1, "batch_size must be set 1" |
| | |
| | | "feats": speech, |
| | | "waveform": cache["frontend"]["waveforms"], |
| | | "is_final": kwargs["is_final"], |
| | | "cache": cache |
| | | "cache": cache, |
| | | "is_streaming_input": is_streaming_input |
| | | } |
| | | segments_i = self.forward(**batch) |
| | | if len(segments_i) > 0: |
| | |
| | | self.init_cache(cache) |
| | | |
| | | ibest_writer = None |
| | | if ibest_writer is None and kwargs.get("output_dir") is not None: |
| | | writer = DatadirWriter(kwargs.get("output_dir")) |
| | | ibest_writer = writer[f"{1}best_recog"] |
| | | if kwargs.get("output_dir") is not None: |
| | | if not hasattr(self, "writer"): |
| | | self.writer = DatadirWriter(kwargs.get("output_dir")) |
| | | ibest_writer = self.writer[f"{1}best_recog"] |
| | | |
| | | results = [] |
| | | result_i = {"key": key[0], "value": segments} |
| | |
| | | |
| | | return results, meta_data |
| | | |
def export(self, **kwargs):
    """Swap the encoder for its export wrapper and route forward() to it.

    Args:
        **kwargs: must contain "encoder" (registry key of the current encoder);
            "type" selects the backend, defaulting to "onnx".

    Returns:
        self, so the call can be chained by the export driver.
    """
    is_onnx = kwargs.get("type", "onnx") == "onnx"
    encoder_class = tables.encoder_classes.get(kwargs["encoder"] + "Export")
    self.encoder = encoder_class(self.encoder, onnx=is_onnx)
    # Bug fix: the export-time forward defined below is `export_forward`;
    # `self._export_forward` does not exist and raised AttributeError here.
    self.forward = self.export_forward

    return self
| | | |
def export_forward(self, feats: torch.Tensor, *args, **kwargs):
    """Forward used after export(): delegate directly to the wrapped encoder.

    Args:
        feats: input feature tensor.
        *args: the FSMN cache tensors, passed through to the encoder.
        **kwargs: accepted for interface compatibility; unused here.

    Returns:
        (logits, updated cache tensors) exactly as produced by the encoder.
    """
    logits, cache_outputs = self.encoder(feats, *args)
    return logits, cache_outputs
| | | |
def export_dummy_inputs(self, data_in=None, frame=30):
    """Build dummy (speech, in_cache0..3) inputs for export tracing.

    Args:
        data_in: if None, random features are synthesized; otherwise the
            real-data path is an upstream placeholder and yields None.
        frame: number of feature frames in the synthesized speech tensor.

    Returns:
        5-tuple of (speech, in_cache0, in_cache1, in_cache2, in_cache3).
    """
    conf = self.encoder_conf
    if data_in is None:
        speech = torch.randn(1, frame, conf.get("input_dim"))
    else:
        speech = None  # Undo — upstream placeholder; real-data path not implemented

    # FSMN left/right context length shared by all four cache tensors.
    cache_frames = conf.get("lorder") + conf.get("rorder") - 1
    caches = tuple(
        torch.randn(1, conf.get("proj_dim"), cache_frames, 1) for _ in range(4)
    )

    return (speech, *caches)
| | | |
def export_input_names(self):
    """Graph input names for export: the feature tensor plus four FSMN caches."""
    return ['speech'] + [f'in_cache{idx}' for idx in range(4)]
| | | |
def export_output_names(self):
    """Graph output names for export: logits plus the four updated caches."""
    return ['logits'] + [f'out_cache{idx}' for idx in range(4)]
| | | |
def export_dynamic_axes(self):
    """Declare dynamic axes for export: only dim 1 (frames) of 'speech' varies."""
    return {'speech': {1: 'feats_length'}}
| | | |
def export_name(self, ):
    """File name under which the exported model is saved."""
    return "model.onnx"
| | | |
| | | def DetectCommonFrames(self, cache: dict = {}) -> int: |
| | | if cache["stats"].vad_state_machine == VadStateMachine.kVadInStateEndPointDetected: |
| | | return 0 |