Yabin Li
2023-06-19 a3c18ddd8ee1a28a2623ca01f0a26bab2ad6ee13
funasr/bin/asr_infer.py
@@ -305,6 +305,7 @@
            nbest: int = 1,
            frontend_conf: dict = None,
            hotword_list_or_file: str = None,
            decoding_ind: int = 0,
            **kwargs,
    ):
        assert check_argument_types()
@@ -415,6 +416,7 @@
        self.nbest = nbest
        self.frontend = frontend
        self.encoder_downsampling_factor = 1
        self.decoding_ind = decoding_ind
        if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d":
            self.encoder_downsampling_factor = 4
@@ -452,7 +454,7 @@
        batch = to_device(batch, device=self.device)
        # b. Forward Encoder
        enc, enc_len = self.asr_model.encode(**batch)
        enc, enc_len = self.asr_model.encode(**batch, ind=self.decoding_ind)
        if isinstance(enc, tuple):
            enc = enc[0]
        # assert len(enc) == 1, len(enc)
@@ -491,9 +493,9 @@
            else:
                if pre_token_length[i] == 0:
                    yseq = torch.tensor(
                        [self.asr_model.sos] + [self.asr_model.eos], device=yseq.device
                        [self.asr_model.sos] + [self.asr_model.eos], device=pre_acoustic_embeds.device
                    )
                    score = torch.tensor(0.0, device=yseq.device)
                    score = torch.tensor(0.0, device=pre_acoustic_embeds.device)
                else:
                    yseq = am_scores.argmax(dim=-1)
                    score = am_scores.max(dim=-1)[0]
@@ -1510,8 +1512,13 @@
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)
        
        feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
        feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
        if self.frontend is not None:
            speech = torch.unsqueeze(speech, axis=0)
            speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
            feats, feats_lengths = self.frontend(speech, speech_lengths)
        else:
            feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
            feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
        
        if self.asr_model.normalize is not None:
            feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths)
@@ -1536,14 +1543,19 @@
        
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)
        feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
        feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
        if self.frontend is not None:
            speech = torch.unsqueeze(speech, axis=0)
            speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
            feats, feats_lengths = self.frontend(speech, speech_lengths)
        else:
            feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
            feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
        
        feats = to_device(feats, device=self.device)
        feats_lengths = to_device(feats_lengths, device=self.device)
        
        enc_out, _ = self.asr_model.encoder(feats, feats_lengths)
        enc_out, _, _ = self.asr_model.encoder(feats, feats_lengths)
        
        nbest_hyps = self.beam_search(enc_out[0])
        
@@ -1639,15 +1651,17 @@
        assert check_argument_types()
        
        # 1. Build ASR model
        from funasr.tasks.sa_asr import ASRTask
        from funasr.tasks.asr import ASRTaskSAASR
        scorers = {}
        asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_model, asr_train_args = ASRTaskSAASR.build_model_from_file(
            asr_train_config, asr_model_file, cmvn_file, device
        )
        frontend = None
        if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
            if asr_train_args.frontend == 'wav_frontend':
                frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
            from funasr.tasks.sa_asr import frontend_choices
            if asr_train_args.frontend == 'wav_frontend' or asr_train_args.frontend == "multichannelfrontend":
                frontend_class = frontend_choices.get_class(asr_train_args.frontend)
                frontend = frontend_class(cmvn_file=cmvn_file, **asr_train_args.frontend_conf).eval()
            else:
                frontend_class = frontend_choices.get_class(asr_train_args.frontend)
                frontend = frontend_class(**asr_train_args.frontend_conf).eval()