| | |
| | | from funasr.models.transformer.utils.nets_utils import make_pad_mask, pad_list |
| | | from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank |
| | | |
| | | |
| | | import pdb |
| | | if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"): |
| | | from torch.cuda.amp import autocast |
| | | else: |
| | |
| | | |
| | | # bias encoder |
| | | if self.bias_encoder_type == 'lstm': |
| | | logging.warning("enable bias encoder sampling and contextual training") |
| | | self.bias_encoder = torch.nn.LSTM(self.inner_dim, |
| | | self.inner_dim, |
| | | 2, |
| | |
| | | self.lstm_proj = None |
| | | self.bias_embed = torch.nn.Embedding(self.vocab_size, self.inner_dim) |
| | | elif self.bias_encoder_type == 'mean': |
| | | logging.warning("enable bias encoder sampling and contextual training") |
| | | self.bias_embed = torch.nn.Embedding(self.vocab_size, self.inner_dim) |
| | | else: |
| | | logging.error("Unsupport bias encoder type: {}".format(self.bias_encoder_type)) |
| | |
| | | hotword_pad = kwargs.get("hotword_pad") |
| | | hotword_lengths = kwargs.get("hotword_lengths") |
| | | dha_pad = kwargs.get("dha_pad") |
| | | |
| | | |
| | | batch_size = speech.shape[0] |
| | | self.step_cur += 1 |
| | | # for data-parallel |
| | |
| | | nfilter=50, |
| | | seaco_weight=1.0): |
| | | # decoder forward |
| | | |
| | | decoder_out, decoder_hidden, _ = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, return_hidden=True, return_both=True) |
| | | |
| | | decoder_pred = torch.log_softmax(decoder_out, dim=-1) |
| | | if hw_list is not None: |
| | | hw_lengths = [len(i) for i in hw_list] |
| | | hw_list_ = [torch.Tensor(i).long() for i in hw_list] |
| | | hw_list_pad = pad_list(hw_list_, 0).to(encoder_out.device) |
| | | selected = self._hotword_representation(hw_list_pad, torch.Tensor(hw_lengths).int().to(encoder_out.device)) |
| | | |
| | | contextual_info = selected.squeeze(0).repeat(encoder_out.shape[0], 1, 1).to(encoder_out.device) |
| | | num_hot_word = contextual_info.shape[1] |
| | | _contextual_length = torch.Tensor([num_hot_word]).int().repeat(encoder_out.shape[0]).to(encoder_out.device) |
| | | |
| | | |
| | | # ASF Core |
| | | if nfilter > 0 and nfilter < num_hot_word: |
| | | for dec in self.seaco_decoder.decoders: |
| | | dec.reserve_attn = True |
| | | |
| | | # cif_attended, _ = self.decoder2(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens) |
| | | dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) |
| | | # cif_filter = torch.topk(self.decoder2.decoders[-1].attn_mat[0][0].sum(0).sum(0)[:-1], min(nfilter, num_hot_word-1))[1].tolist() |
| | | |
| | | hotword_scores = self.seaco_decoder.decoders[-1].attn_mat[0][0].sum(0).sum(0)[:-1] |
| | | # hotword_scores /= torch.sqrt(torch.tensor(hw_lengths)[:-1].float()).to(hotword_scores.device) |
| | | dec_filter = torch.topk(hotword_scores, min(nfilter, num_hot_word-1))[1].tolist() |
| | |
| | | for dec in self.seaco_decoder.decoders: |
| | | dec.attn_mat = [] |
| | | dec.reserve_attn = False |
| | | |
| | | # SeACo Core |
| | | cif_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens) |
| | | dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens) |
| | | merged = self._merge(cif_attended, dec_attended) |
| | | |
| | | |
| | | dha_output = self.hotword_output_layer(merged) # remove the last token in loss calculation |
| | | dha_pred = torch.log_softmax(dha_output, dim=-1) |
| | | def _merge_res(dec_output, dha_output): |
| | |
| | | # logits = dec_output * dha_mask + dha_output[:,:,:-1] * (1-dha_mask) |
| | | logits = dec_output * dha_mask + dha_output[:,:,:] * (1-dha_mask) |
| | | return logits |
| | | |
| | | merged_pred = _merge_res(decoder_pred, dha_pred) |
| | | # import pdb; pdb.set_trace() |
| | | return merged_pred |
| | | else: |
| | | return decoder_pred |
| | |
| | | logging.info("enable beam_search") |
| | | self.init_beam_search(**kwargs) |
| | | self.nbest = kwargs.get("nbest", 1) |
| | | |
| | | meta_data = {} |
| | | |
| | | # extract fbank feats |
| | |
| | | |
| | | speech = speech.to(device=kwargs["device"]) |
| | | speech_lengths = speech_lengths.to(device=kwargs["device"]) |
| | | |
| | | |
| | | # hotword |
| | | self.hotword_list = self.generate_hotwords_list(kwargs.get("hotword", None), tokenizer=tokenizer, frontend=frontend) |
| | | |
| | |
| | | if isinstance(encoder_out, tuple): |
| | | encoder_out = encoder_out[0] |
| | | |
| | | |
| | | # predictor |
| | | predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens) |
| | | pre_acoustic_embeds, pre_token_length, _, _ = predictor_outs[0], predictor_outs[1], \ |
| | |
| | | if torch.max(pre_token_length) < 1: |
| | | return [] |
| | | |
| | | |
| | | decoder_out = self._seaco_decode_with_ASF(encoder_out, encoder_out_lens, |
| | | pre_acoustic_embeds, |
| | | pre_token_length, |
| | | hw_list=self.hotword_list) |
| | | |
| | | # decoder_out, _ = decoder_outs[0], decoder_outs[1] |
| | | _, _, us_alphas, us_peaks = self.calc_predictor_timestamp(encoder_out, encoder_out_lens, |
| | | pre_token_length) |
| | | |
| | | results = [] |
| | | b, n, d = decoder_out.size() |
| | | for i in range(b): |
| | |
| | | nbest_hyps = [Hypothesis(yseq=yseq, score=score)] |
| | | for nbest_idx, hyp in enumerate(nbest_hyps): |
| | | ibest_writer = None |
| | | if ibest_writer is None and kwargs.get("output_dir") is not None: |
| | | writer = DatadirWriter(kwargs.get("output_dir")) |
| | | ibest_writer = writer[f"{nbest_idx + 1}best_recog"] |
| | | if kwargs.get("output_dir") is not None: |
| | | if not hasattr(self, "writer"): |
| | | self.writer = DatadirWriter(kwargs.get("output_dir")) |
| | | ibest_writer = self.writer[f"{nbest_idx + 1}best_recog"] |
| | | |
| | | # remove sos/eos and get results |
| | | last_pos = -1 |
| | | if isinstance(hyp.yseq, list): |
| | |
| | | token, timestamp) |
| | | |
| | | result_i = {"key": key[i], "text": text_postprocessed, |
| | | "timestamp": time_stamp_postprocessed, |
| | | "timestamp": time_stamp_postprocessed |
| | | } |
| | | |
| | | if ibest_writer is not None: |
| | | ibest_writer["token"][key[i]] = " ".join(token) |
| | | # ibest_writer["text"][key[i]] = text |
| | | ibest_writer["timestamp"][key[i]] = time_stamp_postprocessed |
| | | ibest_writer["text"][key[i]] = text_postprocessed |
| | | else: |