import time
from typing import Tuple

import torch
import torch.nn as nn

from funasr.models.ctc.ctc import CTC
from funasr.models.transformer.utils.add_sos_eos import add_sos_eos
from funasr.metrics.compute_acc import th_accuracy

# from funasr.models.e2e_asr_common import ErrorCalculator
from funasr.train_utils.device_funcs import force_gatherable
from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank

from funasr.register import tables


@tables.register("model_classes", "LCBNet")
class LCBNet(nn.Module):
| | | """ |
    # ...
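        # Component classes are resolved by name from FunASR's registry (the same
        # mechanism as @tables.register above): a string key from the config picks
        # the class, and the matching *_conf dict supplies its constructor kwargs.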
        bias_predictor_class = tables.encoder_classes.get(bias_predictor)
        bias_predictor = bias_predictor_class(**bias_predictor_conf)

        if decoder is not None:
            decoder_class = tables.decoder_classes.get(decoder)
            decoder = decoder_class(
                # ...
        if ctc_conf is None:
            ctc_conf = {}

        ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf)
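        # The CTC head projects encoder frames to vocab-sized posteriors; the same
        # module is reused for interCTC conditioning in encode() and as a prefix
        # scorer in init_beam_search() below.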

        self.blank_id = blank_id
        self.sos = vocab_size - 1
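        # sos takes the last vocabulary index; ESPnet-style models conventionally
        # assign eos the same id (the eos line is elided here).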
    # ...
            loss_interctc = loss_interctc / len(intermediate_outs)

            # calculate whole encoder loss
            loss_ctc = (1 - self.interctc_weight) * loss_ctc + self.interctc_weight * loss_interctc
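            # i.e. loss_ctc = (1 - w) * final_layer_ctc + w * mean(intermediate_ctc),
            # with w = self.interctc_weight; the mean comes from the division above.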

        # decoder: Attention decoder branch
        loss_att, acc_att, cer_att, wer_att = self._calc_att_loss(
            # ...
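        # force_gatherable moves loss/stats/weight onto loss.device so that
        # DataParallel-style training can gather them across replicas.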
        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
        return loss, stats, weight

    def encode(
        self,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Frontend + Encoder. Note that this method is used by asr_inference.py.
        Args:
            # ...
        # feats: (Batch, Length, Dim)
        # -> encoder_out: (Batch, Length2, Dim2)
        if self.encoder.interctc_use_conditioning:
            encoder_out, encoder_out_lens, _ = self.encoder(speech, speech_lengths, ctc=self.ctc)
        else:
            encoder_out, encoder_out_lens, _ = self.encoder(speech, speech_lengths)
        intermediate_outs = None
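        # Passing ctc into the encoder enables self-conditioned CTC: intermediate
        # layer posteriors are fed back to condition the remaining encoder layers.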
    # ...
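        # ys_in_pad/ys_out_pad come from add_sos_eos: sos is prepended to the
        # decoder input, hence input lengths are the label lengths + 1.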
        ys_in_lens = ys_pad_lens + 1

        # 1. Forward decoder
        decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, ys_in_lens)

        # 2. Compute attention loss
        loss_att = self.criterion_att(decoder_out, ys_out_pad)
    # ...
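        # With is_ctc=True the error calculator decodes the greedy CTC output
        # (ys_hat) and scores CER against the padded references.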
        cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
        return loss_ctc, cer_ctc

    def init_beam_search(
        self,
        **kwargs,
    ):
        from funasr.models.transformer.search import BeamSearch

        # ...

        if self.ctc is not None:
            ctc = CTCPrefixScorer(ctc=self.ctc, eos=self.eos)
            scorers.update(ctc=ctc)
        token_list = kwargs.get("token_list")
        scorers.update(
            decoder=self.decoder,
            length_bonus=LengthBonus(len(token_list)),
        )
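        # BeamSearch interpolates these scorers per partial hypothesis: attention
        # decoder vs. CTC prefix scores, plus a length bonus that counters the
        # bias toward short outputs.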

        # 3. Build ngram model
        # ngram is not supported yet
        # ...

        self.beam_search = beam_search

    def inference(
        self,
        data_in,
        data_lengths=None,
        key: list = None,
        # ...
        self.nbest = kwargs.get("nbest", 1)

        meta_data = {}
        if (
            isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank"
        ):  # fbank
            speech, speech_lengths = data_in, data_lengths
            if len(speech.shape) < 3:
                speech = speech[None, :, :]
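            # an unbatched (Length, Dim) fbank tensor is promoted to (1, Length, Dim)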
            # ...
        else:
            # extract fbank feats
            time1 = time.perf_counter()
            sample_list = load_audio_text_image_video(
                data_in,
                fs=frontend.fs,
                audio_fs=kwargs.get("fs", 16000),
                data_type=kwargs.get("data_type", "sound"),
                tokenizer=tokenizer,
            )
            time2 = time.perf_counter()
            meta_data["load_data"] = f"{time2 - time1:0.3f}"
            audio_sample_list = sample_list[0]
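            # sample_list packs the loaded modalities: index 0 is the audio,
            # index 1 (when present) is the tokenized OCR/biasing text.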
            # ...
                ocr_sample_list = sample_list[1]
            else:
                ocr_sample_list = [[294, 0]]  # fallback ids when no OCR text is supplied
            speech, speech_lengths = extract_fbank(
                audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend
            )
            time3 = time.perf_counter()
            meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
            frame_shift = 10  # fbank frame shift in ms
        # ...

        ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list]
        ocr = torch.tensor(ocr_list_new).to(device=kwargs["device"])
        ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)).to(
            device=kwargs["device"]
        )
        ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths)
        fusion_out, _, _, _ = self.fusion_encoder(encoder_out, None, ocr, None)
        encoder_out = encoder_out + fusion_out
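        # Long-context biasing: non-zero OCR ids are shifted by +1 (id 0 is left
        # as-is), the shifted ids are encoded by the text encoder, fused with the
        # acoustic encoder output in fusion_encoder, and added back residually.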
        # c. Pass the encoder output to the beam search
        nbest_hyps = self.beam_search(
            x=encoder_out[0],
            maxlenratio=kwargs.get("maxlenratio", 0.0),
            minlenratio=kwargs.get("minlenratio", 0.0),
        )
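        # ESPnet convention: maxlenratio=0.0 caps hypotheses at the number of
        # encoder frames, and minlenratio=0.0 imposes no minimum length.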

        nbest_hyps = nbest_hyps[: self.nbest]
        # ...
            token_int = hyp.yseq[1:last_pos].tolist()

            # filter out sos, eos, and blank ids (blank is assumed to be 0)
            token_int = list(
                filter(lambda x: x != self.eos and x != self.sos and x != self.blank_id, token_int)
            )

            # Change integer ids to tokens
            token = tokenizer.ids2tokens(token_int)
            # ...
                ibest_writer["text"][key[i]] = text_postprocessed

        return results, meta_data
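

# Usage sketch (assumptions: FunASR's AutoModel wrapper and a trained LCBNet
# checkpoint; the path below is a placeholder, not a published model id):
#
#     from funasr import AutoModel
#
#     model = AutoModel(model="/path/to/lcbnet_checkpoint")
#     res = model.generate(input="utt1.wav")  # returns a list of result dicts
#     print(res[0]["text"])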