zhifu gao
2024-04-24 861147c7308b91068ffa02724fdf74ee623a909e
funasr/models/lcbnet/model.py
@@ -15,6 +15,7 @@
from funasr.models.ctc.ctc import CTC
from funasr.models.transformer.utils.add_sos_eos import add_sos_eos
from funasr.metrics.compute_acc import th_accuracy
# from funasr.models.e2e_asr_common import ErrorCalculator
from funasr.train_utils.device_funcs import force_gatherable
from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
@@ -23,6 +24,8 @@
from funasr.register import tables
@tables.register("model_classes", "LCBNet")
class LCBNet(nn.Module):
    """
@@ -93,7 +96,6 @@
        bias_predictor_class = tables.encoder_classes.get(bias_predictor)
        bias_predictor = bias_predictor_class(**bias_predictor_conf)
        if decoder is not None:
            decoder_class = tables.decoder_classes.get(decoder)
            decoder = decoder_class(
@@ -106,9 +108,7 @@
            if ctc_conf is None:
                ctc_conf = {}
            
            ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf)
    
        self.blank_id = blank_id
        self.sos = vocab_size - 1
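        # NOTE: by the usual FunASR/ESPnet convention the last vocabulary entry is
        # reused for both <sos> and <eos> (self.eos is presumably set the same way),
        # while the CTC blank id defaults to 0.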
@@ -230,9 +230,7 @@
            loss_interctc = loss_interctc / len(intermediate_outs)
            
            # calculate whole encoder loss
            loss_ctc = (1 - self.interctc_weight) * loss_ctc + self.interctc_weight * loss_interctc
        
        # decoder: Attention decoder branch
        loss_att, acc_att, cer_att, wer_att = self._calc_att_loss(
@@ -262,9 +260,11 @@
        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
        return loss, stats, weight
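        # force_gatherable casts/moves (loss, stats, batch_size) onto loss.device so
        # that (Distributed)DataParallel can gather them across replicas.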
    
    def encode(
        self,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Frontend + Encoder. Note that this method is used by asr_inference.py
        Args:
@@ -283,9 +283,7 @@
        # feats: (Batch, Length, Dim)
        # -> encoder_out: (Batch, Length2, Dim2)
        if self.encoder.interctc_use_conditioning:
            encoder_out, encoder_out_lens, _ = self.encoder(speech, speech_lengths, ctc=self.ctc)
        else:
            encoder_out, encoder_out_lens, _ = self.encoder(speech, speech_lengths)
        intermediate_outs = None
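        # With self-conditioned CTC (interctc_use_conditioning=True), the encoder is
        # handed the CTC module so intermediate-layer posteriors can be fed back into
        # later encoder blocks; otherwise a plain forward pass is used.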
@@ -308,9 +306,7 @@
        ys_in_lens = ys_pad_lens + 1
        
        # 1. Forward decoder
        decoder_out, _ = self.decoder(encoder_out, encoder_out_lens, ys_in_pad, ys_in_lens)
        
        # 2. Compute attention loss
        loss_att = self.criterion_att(decoder_out, ys_out_pad)
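        # criterion_att is presumably a label-smoothing cross-entropy between the
        # decoder logits and the <eos>-appended targets from add_sos_eos; th_accuracy
        # reports token-level accuracy while ignoring padded positions.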
@@ -346,7 +342,8 @@
            cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
        return loss_ctc, cer_ctc
    
    def init_beam_search(
        self,
        **kwargs,
    ):
        from funasr.models.transformer.search import BeamSearch
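        # Beam search interpolates several scorers (attention decoder, CTC prefix
        # scorer, length bonus); their weights are supplied at decoding time via
        # kwargs rather than being fixed here.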
@@ -358,15 +355,12 @@
        
        if self.ctc is not None:
            ctc = CTCPrefixScorer(ctc=self.ctc, eos=self.eos)
            scorers.update(ctc=ctc)
        token_list = kwargs.get("token_list")
        scorers.update(
            decoder=self.decoder,
            length_bonus=LengthBonus(len(token_list)),
        )
        
        # 3. Build ngram model
        # ngram is not supported now
@@ -393,7 +387,8 @@
        self.beam_search = beam_search
        
    def inference(
        self,
        data_in,
        data_lengths=None,
        key: list = None,
@@ -412,7 +407,9 @@
            self.nbest = kwargs.get("nbest", 1)
        meta_data = {}
        if (
            isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank"
        ):  # fbank
            speech, speech_lengths = data_in, data_lengths
            if len(speech.shape) < 3:
                speech = speech[None, :, :]
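            # fbank input is expected as (batch, frames, feat_dim); a 2-D tensor is
            # treated as a single utterance and given a leading batch axis above.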
@@ -421,9 +418,13 @@
        else:
            # extract fbank feats
            time1 = time.perf_counter()
            sample_list = load_audio_text_image_video(
                data_in,
                fs=frontend.fs,
                audio_fs=kwargs.get("fs", 16000),
                data_type=kwargs.get("data_type", "sound"),
                tokenizer=tokenizer,
            )
            time2 = time.perf_counter()
            meta_data["load_data"] = f"{time2 - time1:0.3f}"
            audio_sample_list = sample_list[0]
@@ -431,8 +432,9 @@
                ocr_sample_list = sample_list[1]
            else:
                ocr_sample_list = [[294, 0]]
            speech, speech_lengths = extract_fbank(
                audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend
            )
            time3 = time.perf_counter()
            meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
            frame_shift = 10
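            # 10 ms is the standard fbank frame shift; presumably used below to turn
            # frame counts into audio duration (e.g. for RTF accounting in meta_data).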
@@ -447,13 +449,17 @@
        ocr_list_new = [[x + 1 if x != 0 else x for x in sublist] for sublist in ocr_sample_list]
        ocr = torch.tensor(ocr_list_new).to(device=kwargs["device"])
        ocr_lengths = ocr.new_full([1], dtype=torch.long, fill_value=ocr.size(1)).to(
            device=kwargs["device"]
        )
        ocr, ocr_lens, _ = self.text_encoder(ocr, ocr_lengths)
        fusion_out, _, _, _ = self.fusion_encoder(encoder_out, None, ocr, None)
        encoder_out = encoder_out + fusion_out
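        # Core LCBNet biasing: OCR/keyword ids are encoded by text_encoder, the
        # fusion_encoder cross-attends the acoustic and text streams, and the result
        # is added back onto the acoustic encoder output as a residual bias.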
        # c. Pass the encoder output to the beam search
        nbest_hyps = self.beam_search(
            x=encoder_out[0],
            maxlenratio=kwargs.get("maxlenratio", 0.0),
            minlenratio=kwargs.get("minlenratio", 0.0),
        )
        
        nbest_hyps = nbest_hyps[: self.nbest]
@@ -477,7 +483,11 @@
                    token_int = hyp.yseq[1:last_pos].tolist()
                    
                # remove blank symbol id, which is assumed to be 0
                token_int = list(
                    filter(
                        lambda x: x != self.eos and x != self.sos and x != self.blank_id, token_int
                    )
                )
                
                # Change integer-ids to tokens
                token = tokenizer.ids2tokens(token_int)
@@ -492,4 +502,3 @@
                    ibest_writer["text"][key[i]] = text_postprocessed
        
        return results, meta_data
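
# A minimal usage sketch (hypothetical model id and paths; check the LCBNet recipe
# for the exact ModelScope name and expected inputs before relying on this):
#
#   from funasr import AutoModel
#
#   model = AutoModel(model="iic/LCB-NET")  # hypothetical model id
#   res = model.generate(input="audio.wav", data_type="sound")
#   print(res[0]["text"])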