| | |
| | | from funasr.models.transformer.utils.nets_utils import make_pad_mask, pad_list |
| | | from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank |
| | | |
| | | |
| | | import pdb |
| | | if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): |
| | | from torch.cuda.amp import autocast |
| | | else: |
| | |
| | | hotword_pad = kwargs.get("hotword_pad") |
| | | hotword_lengths = kwargs.get("hotword_lengths") |
| | | dha_pad = kwargs.get("dha_pad") |
| | | |
| | | |
| | | batch_size = speech.shape[0] |
| | | # for data-parallel |
| | | text = text[:, : text_lengths.max()] |
| | |
| | | nfilter=50, |
| | | seaco_weight=1.0): |
| | | # decoder forward |
| | | |
| | | decoder_out, decoder_hidden, _ = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, return_hidden=True, return_both=True) |
| | | |
| | | decoder_pred = torch.log_softmax(decoder_out, dim=-1) |
| | | if hw_list is not None: |
| | | hw_lengths = [len(i) for i in hw_list] |
| | | hw_list_ = [torch.Tensor(i).long() for i in hw_list] |
| | | hw_list_pad = pad_list(hw_list_, 0).to(encoder_out.device) |
| | | selected = self._hotword_representation(hw_list_pad, torch.Tensor(hw_lengths).int().to(encoder_out.device)) |
| | | |
| | | contextual_info = selected.squeeze(0).repeat(encoder_out.shape[0], 1, 1).to(encoder_out.device) |
| | | num_hot_word = contextual_info.shape[1] |
| | | _contextual_length = torch.Tensor([num_hot_word]).int().repeat(encoder_out.shape[0]).to(encoder_out.device) |
| | | |
| | | |
| | | # ASF Core |
| | | if nfilter > 0 and nfilter < num_hot_word: |
| | | hotword_scores = self.seaco_decoder.forward_asf6(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) |
| | |
| | | cif_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens) |
| | | dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) |
| | | merged = self._merge(cif_attended, dec_attended) |
| | | |
| | | |
| | | dha_output = self.hotword_output_layer(merged) # remove the last token in loss calculation |
| | | dha_pred = torch.log_softmax(dha_output, dim=-1) |
| | | def _merge_res(dec_output, dha_output): |
| | |
| | | # logits = dec_output * dha_mask + dha_output[:,:,:-1] * (1-dha_mask) |
| | | logits = dec_output * dha_mask + dha_output[:,:,:] * (1-dha_mask) |
| | | return logits |
| | | |
| | | merged_pred = _merge_res(decoder_pred, dha_pred) |
| | | # import pdb; pdb.set_trace() |
| | | return merged_pred |
| | | else: |
| | | return decoder_pred |
| | |
| | | logging.info("enable beam_search") |
| | | self.init_beam_search(**kwargs) |
| | | self.nbest = kwargs.get("nbest", 1) |
| | | |
| | | meta_data = {} |
| | | |
| | | # extract fbank feats |
| | |
| | | if isinstance(encoder_out, tuple): |
| | | encoder_out = encoder_out[0] |
| | | |
| | | |
| | | # predictor |
| | | predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens) |
| | | pre_acoustic_embeds, pre_token_length, _, _ = predictor_outs[0], predictor_outs[1], \ |
| | |
| | | if torch.max(pre_token_length) < 1: |
| | | return [] |
| | | |
| | | |
| | | decoder_out = self._seaco_decode_with_ASF(encoder_out, encoder_out_lens, |
| | | pre_acoustic_embeds, |
| | | pre_token_length, |
| | | hw_list=self.hotword_list) |
| | | |
| | | # decoder_out, _ = decoder_outs[0], decoder_outs[1] |
| | | _, _, us_alphas, us_peaks = self.calc_predictor_timestamp(encoder_out, encoder_out_lens, |
| | | pre_token_length) |
| | | |
| | | results = [] |
| | | b, n, d = decoder_out.size() |
| | | for i in range(b): |