| | |
| | | |
| | | super().__init__(*args, **kwargs) |
| | | |
| | | # import pdb; |
| | | # pdb.set_trace() |
| | | self.sampling_ratio = kwargs.get("sampling_ratio", 0.2) |
| | | |
| | | self.scama_mask = None |
| | |
| | | text: (Batch, Length) |
| | | text_lengths: (Batch,) |
| | | """ |
| | | # import pdb; |
| | | # pdb.set_trace() |
| | | decoding_ind = kwargs.get("decoding_ind") |
| | | if len(text_lengths.size()) > 1: |
| | | text_lengths = text_lengths[:, 0] |
| | |
| | | mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor( |
| | | None, device=encoder_out.device, batch_size=encoder_out.size(0) |
| | | ) |
| | | mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk( |
| | | mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk( |
| | | None, device=encoder_out.device, batch_size=encoder_out.size(0) |
| | | ) |
| | | encoder_out = encoder_out * mask_shfit_chunk |
| | | encoder_out = encoder_out * mask_shift_chunk |
| | | pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor( |
| | | encoder_out, |
| | | ys_pad, |
| | |
| | | mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor( |
| | | None, device=encoder_out.device, batch_size=encoder_out.size(0) |
| | | ) |
| | | mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk( |
| | | mask_shift_chunk = self.encoder.overlap_chunk_cls.get_mask_shift_chunk( |
| | | None, device=encoder_out.device, batch_size=encoder_out.size(0) |
| | | ) |
| | | encoder_out = encoder_out * mask_shfit_chunk |
| | | encoder_out = encoder_out * mask_shift_chunk |
| | | pre_acoustic_embeds, pre_token_length, pre_alphas, pre_peak_index = self.predictor( |
| | | encoder_out, |
| | | None, |