jmwang66
2022-12-30 5b6bd201412636b6d7fa85afaf24e42cd54e52cd
funasr/bin/asr_inference_paraformer.py
@@ -92,8 +92,8 @@
        if asr_model.frontend is None and frontend_conf is not None:
            frontend = WavFrontend(**frontend_conf)
            asr_model.frontend = frontend
        logging.info("asr_model: {}".format(asr_model))
        logging.info("asr_train_args: {}".format(asr_train_args))
        # logging.info("asr_model: {}".format(asr_model))
        # logging.info("asr_train_args: {}".format(asr_train_args))
        asr_model.to(dtype=getattr(torch, dtype)).eval()
        ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
@@ -141,8 +141,8 @@
        for scorer in scorers.values():
            if isinstance(scorer, torch.nn.Module):
                scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
        logging.info(f"Beam_search: {beam_search}")
        logging.info(f"Decoding device={device}, dtype={dtype}")
        # logging.info(f"Beam_search: {beam_search}")
        # logging.info(f"Decoding device={device}, dtype={dtype}")
        # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
        if token_type is None:
@@ -160,7 +160,7 @@
        else:
            tokenizer = build_tokenizer(token_type=token_type)
        converter = TokenIDConverter(token_list=token_list)
        logging.info(f"Text tokenizer: {tokenizer}")
        # logging.info(f"Text tokenizer: {tokenizer}")
        self.asr_model = asr_model
        self.asr_train_args = asr_train_args
@@ -197,9 +197,9 @@
        # data: (Nsamples,) -> (1, Nsamples)
        # lengths: (1,)
        if len(speech.size()) < 3:
            speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
            speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
        # if len(speech.size()) < 3:
        #     speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
        #     speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
        lfr_factor = max(1, (speech.size()[-1]//80)-1)
        
        batch = {"speech": speech, "speech_lengths": speech_lengths}
@@ -426,7 +426,7 @@
        assert len(keys) == _bs, f"{len(keys)} != {_bs}"
        # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
        logging.info("decoding, utt_id: {}".format(keys))
        # logging.info("decoding, utt_id: {}".format(keys))
        # N-best list of (text, token, token_int, hyp_object)
        time_beg = time.time()