| | |
| | | # 0. sampler |
| | | decoder_out_1st = None |
| | | if self.sampling_ratio > 0.0: |
| | | if self.step_cur < 2: |
| | | logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio)) |
| | | |
| | | sematic_embeds, decoder_out_1st = self.sampler(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, |
| | | pre_acoustic_embeds, contextual_info) |
| | | else: |
| | | if self.step_cur < 2: |
| | | logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio)) |
| | | sematic_embeds = pre_acoustic_embeds |
| | | |
| | | # 1. Forward decoder |
| | |
| | | self.predictor_bias = predictor_bias |
| | | self.sampling_ratio = sampling_ratio |
| | | self.criterion_pre = mae_loss(normalize_length=length_normalized_loss) |
| | | # self.step_cur = 0 |
| | | # |
| | | |
| | | |
| | | self.share_embedding = share_embedding |
| | | if self.share_embedding: |
| | | self.decoder.embed = None |
| | |
| | | decoder_out_1st = None |
| | | pre_loss_att = None |
| | | if self.sampling_ratio > 0.0: |
| | | if self.step_cur < 2: |
| | | logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio)) |
| | | |
| | | if self.use_1st_decoder_loss: |
| | | sematic_embeds, decoder_out_1st, pre_loss_att = \ |
| | | self.sampler_with_grad(encoder_out, encoder_out_lens, ys_pad, |
| | |
| | | self.sampler(encoder_out, encoder_out_lens, ys_pad, |
| | | ys_pad_lens, pre_acoustic_embeds, scama_mask) |
| | | else: |
| | | if self.step_cur < 2: |
| | | logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio)) |
| | | sematic_embeds = pre_acoustic_embeds |
| | | |
| | | # 1. Forward decoder |
| | |
| | | dha_pad = kwargs.get("dha_pad") |
| | | |
| | | batch_size = speech.shape[0] |
| | | self.step_cur += 1 |
| | | # for data-parallel |
| | | text = text[:, : text_lengths.max()] |
| | | speech = speech[:, :speech_lengths.max()] |