| | |
| | | cache["frontend"] = {} |
| | | cache["prev_samples"] = torch.empty(0) |
| | | cache["encoder"] = {} |
| | | |
| | | if kwargs.get("max_end_silence_time") is not None: |
| | | # update the max_end_silence_time |
| | | self.vad_opts.max_end_silence_time = kwargs.get("max_end_silence_time") |
| | | |
| | | windows_detector = WindowDetector(self.vad_opts.window_size_ms, |
| | | self.vad_opts.sil_to_speech_time_thres, |
| | | self.vad_opts.speech_to_sil_time_thres, |
| | |
| | | |
| | | results = [] |
| | | result_i = {"key": key[0], "value": segments} |
| | | if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas": |
| | | result_i = json.dumps(result_i) |
| | | # if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas": |
| | | # result_i = json.dumps(result_i) |
| | | |
| | | results.append(result_i) |
| | | |
| | |
| | | return results, meta_data |
| | | |
def export(self, **kwargs):
    """Prepare this model for export by swapping in the export encoder.

    Keyword Args:
        type: Export backend name. The encoder wrapper receives
            ``onnx=True`` when this is ``"onnx"`` (the default when the
            key is absent) and ``onnx=False`` otherwise.
        encoder: Encoder name; ``"Export"`` is appended to it and the
            result is looked up in ``tables.encoder_classes``.

    Returns:
        This same instance, with ``self.encoder`` replaced by the export
        wrapper and ``self.forward`` rebound to the export forward pass.

    Raises:
        KeyError: If ``encoder`` is not supplied in ``kwargs``.
    """
    is_onnx = kwargs.get("type", "onnx") == "onnx"
    encoder_class = tables.encoder_classes.get(kwargs["encoder"] + "Export")
    # The wrapper is built around the current encoder and then replaces it.
    self.encoder = encoder_class(self.encoder, onnx=is_onnx)
    # The export-time forward pass is defined as ``export_forward`` (no
    # leading underscore); binding ``self._export_forward`` would fail
    # with AttributeError when no such attribute exists.
    self.forward = self.export_forward

    return self
| | | |
def export_forward(self, feats: torch.Tensor, *args, **kwargs):
    """Forward pass used for export: delegate directly to the encoder.

    Args:
        feats: Passed to ``self.encoder`` as its first positional argument.
        *args: Passed to ``self.encoder`` after ``feats`` (presumably the
            input caches -- confirm against the encoder wrapper).
        **kwargs: Accepted for signature compatibility; not forwarded.

    Returns:
        A ``(scores, out_caches)`` tuple unpacked from the encoder's
        two-item result.
    """
    encoder_result = self.encoder(feats, *args)
    # Unpacking enforces that the encoder yields exactly two items.
    scores, out_caches = encoder_result
    return scores, out_caches
| | | |
def export_dummy_inputs(self, data_in=None, frame=30):
    """Build random placeholder inputs for tracing the export graph.

    Args:
        data_in: When ``None`` a random ``speech`` tensor is generated.
            Any other value is currently ignored and ``speech`` is
            returned as ``None`` (marked "Undo" in the original code).
        frame: Size of axis 1 of the generated ``speech`` tensor.

    Returns:
        A 5-tuple ``(speech, in_cache0, in_cache1, in_cache2, in_cache3)``.
        ``speech`` has shape ``(1, frame, input_dim)``; each cache has
        shape ``(1, proj_dim, lorder + rorder - 1, 1)``, with all sizes
        read from ``self.encoder_conf``.
    """
    conf = self.encoder_conf

    # Draw ``speech`` first so the random-number stream is consumed in the
    # same order as the returned tuple.
    speech = None  # Undo
    if data_in is None:
        speech = torch.randn(1, frame, conf.get("input_dim"))

    cache_frames = conf.get("lorder") + conf.get("rorder") - 1
    cache_shape = (1, conf.get("proj_dim"), cache_frames, 1)
    in_caches = tuple(torch.randn(*cache_shape) for _ in range(4))

    return (speech, *in_caches)
| | | |
def export_input_names(self):
    """Return the input names for export, in positional order.

    The list is ``speech`` followed by ``in_cache0`` .. ``in_cache3``,
    matching the order of the tuple built by ``export_dummy_inputs``.
    """
    cache_names = [f'in_cache{idx}' for idx in range(4)]
    return ['speech'] + cache_names
| | | |
def export_output_names(self):
    """Return the output names for export, in positional order.

    The list is ``logits`` followed by ``out_cache0`` .. ``out_cache3``.
    """
    cache_names = [f'out_cache{idx}' for idx in range(4)]
    return ['logits'] + cache_names
| | | |
def export_dynamic_axes(self):
    """Return the mapping of inputs to their variable-length axes.

    Only ``speech`` has an entry: its axis 1 is labelled
    ``feats_length``. Axis 1 is the one sized by ``frame`` in
    ``export_dummy_inputs``.
    """
    speech_axes = {1: 'feats_length'}
    return {'speech': speech_axes}
| | | |
def export_name(self):
    """Return the fixed name ``model.onnx`` (presumably the export file name)."""
    onnx_file_name = "model.onnx"
    return onnx_file_name
| | | |
| | | |
| | | from .export_meta import export_rebuild_model |
| | | models = export_rebuild_model(model=self, **kwargs) |
| | | return models |
| | | |
| | | def DetectCommonFrames(self, cache: dict = {}) -> int: |
| | | if cache["stats"].vad_state_machine == VadStateMachine.kVadInStateEndPointDetected: |
| | | return 0 |