From 7fe37e0352ca6f8b5937bcda7263a26529723715 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: 星期三, 10 五月 2023 19:17:04 +0800
Subject: [PATCH] Merge pull request #491 from alibaba-damo-academy/main

---
 funasr/bin/asr_inference_paraformer_vad_punc.py | 1231 +++++++++++++++++++++++-----------------------------------
 1 files changed, 488 insertions(+), 743 deletions(-)

diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index 265e054..09b6a0a 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -1,8 +1,14 @@
 #!/usr/bin/env python3
+
+import json
 import argparse
 import logging
 import sys
 import time
+import os
+import codecs
+import tempfile
+import requests
 from pathlib import Path
 from typing import Optional
 from typing import Sequence
@@ -12,6 +18,7 @@
 from typing import Any
 from typing import List
 import math
+import copy
 import numpy as np
 import torch
 from typeguard import check_argument_types
@@ -36,568 +43,366 @@
 from funasr.utils import asr_utils, wav_utils, postprocess_utils
 from funasr.models.frontend.wav_frontend import WavFrontend
 from funasr.tasks.vad import VADTask
-from funasr.utils.timestamp_tools import time_stamp_lfr6
-from funasr.tasks.punctuation import PunctuationTask
-from funasr.torch_utils.forward_adaptor import ForwardAdaptor
-from funasr.datasets.preprocessor import CommonPreprocessor
-from funasr.punctuation.text_preprocessor import split_words, split_to_mini_sentence
-
-header_colors = '\033[95m'
-end_colors = '\033[0m'
-
-global_asr_language: str = 'zh-cn'
-global_sample_rate: Union[int, Dict[Any, int]] = {
-    'audio_fs': 16000,
-    'model_fs': 16000
-}
-
-class Speech2Text:
-    """Speech2Text class
-
-    Examples:
-            >>> import soundfile
-            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
-            >>> audio, rate = soundfile.read("speech.wav")
-            >>> speech2text(audio)
-            [(text, token, token_int, hypothesis object), ...]
-
-    """
-
-    def __init__(
-            self,
-            asr_train_config: Union[Path, str] = None,
-            asr_model_file: Union[Path, str] = None,
-            cmvn_file: Union[Path, str] = None,
-            lm_train_config: Union[Path, str] = None,
-            lm_file: Union[Path, str] = None,
-            token_type: str = None,
-            bpemodel: str = None,
-            device: str = "cpu",
-            maxlenratio: float = 0.0,
-            minlenratio: float = 0.0,
-            dtype: str = "float32",
-            beam_size: int = 20,
-            ctc_weight: float = 0.5,
-            lm_weight: float = 1.0,
-            ngram_weight: float = 0.9,
-            penalty: float = 0.0,
-            nbest: int = 1,
-            frontend_conf: dict = None,
-            **kwargs,
-    ):
-        assert check_argument_types()
-
-        # 1. Build ASR model
-        scorers = {}
-        asr_model, asr_train_args = ASRTask.build_model_from_file(
-            asr_train_config, asr_model_file, cmvn_file=cmvn_file, device=device
-        )
-        frontend = None
-        if asr_model.frontend is not None and asr_train_args.frontend_conf is not None:
-            frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
-
-        # logging.info("asr_model: {}".format(asr_model))
-        # logging.info("asr_train_args: {}".format(asr_train_args))
-        asr_model.to(dtype=getattr(torch, dtype)).eval()
-
-        ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
-        token_list = asr_model.token_list
-        scorers.update(
-            ctc=ctc,
-            length_bonus=LengthBonus(len(token_list)),
-        )
-
-        # 2. Build Language model
-        if lm_train_config is not None:
-            lm, lm_train_args = LMTask.build_model_from_file(
-                lm_train_config, lm_file, device
-            )
-            scorers["lm"] = lm.lm
-
-        # 3. Build ngram model
-        # ngram is not supported now
-        ngram = None
-        scorers["ngram"] = ngram
-
-        # 4. Build BeamSearch object
-        # transducer is not supported now
-        beam_search_transducer = None
-
-        weights = dict(
-            decoder=1.0 - ctc_weight,
-            ctc=ctc_weight,
-            lm=lm_weight,
-            ngram=ngram_weight,
-            length_bonus=penalty,
-        )
-        beam_search = BeamSearch(
-            beam_size=beam_size,
-            weights=weights,
-            scorers=scorers,
-            sos=asr_model.sos,
-            eos=asr_model.eos,
-            vocab_size=len(token_list),
-            token_list=token_list,
-            pre_beam_score_key=None if ctc_weight == 1.0 else "full",
-        )
-
-        beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
-        for scorer in scorers.values():
-            if isinstance(scorer, torch.nn.Module):
-                scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
-        
-        logging.info(f"Decoding device={device}, dtype={dtype}")
-
-        # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
-        if token_type is None:
-            token_type = asr_train_args.token_type
-        if bpemodel is None:
-            bpemodel = asr_train_args.bpemodel
-
-        if token_type is None:
-            tokenizer = None
-        elif token_type == "bpe":
-            if bpemodel is not None:
-                tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
-            else:
-                tokenizer = None
-        else:
-            tokenizer = build_tokenizer(token_type=token_type)
-        converter = TokenIDConverter(token_list=token_list)
-        logging.info(f"Text tokenizer: {tokenizer}")
-
-        self.asr_model = asr_model
-        self.asr_train_args = asr_train_args
-        self.converter = converter
-        self.tokenizer = tokenizer
-        is_use_lm = lm_weight != 0.0 and lm_file is not None
-        if ctc_weight == 0.0 and not is_use_lm:
-            beam_search = None
-        self.beam_search = beam_search
-        logging.info(f"Beam_search: {self.beam_search}")
-        self.beam_search_transducer = beam_search_transducer
-        self.maxlenratio = maxlenratio
-        self.minlenratio = minlenratio
-        self.device = device
-        self.dtype = dtype
-        self.nbest = nbest
-        self.frontend = frontend
-        self.encoder_downsampling_factor = 1
-        if asr_train_args.encoder_conf["input_layer"] == "conv2d":
-            self.encoder_downsampling_factor = 4
-        
-            
-
-    @torch.no_grad()
-    def __call__(
-            self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, begin_time: int = 0, end_time: int = None, 
-    ):
-        """Inference
-
-        Args:
-                speech: Input speech data
-        Returns:
-                text, token, token_int, hyp
-
-        """
-        assert check_argument_types()
-
-        # Input as audio signal
-        if isinstance(speech, np.ndarray):
-            speech = torch.tensor(speech)
-
-        if self.frontend is not None:
-            # feats, feats_len = self.frontend.forward(speech, speech_lengths)
-            # fbanks, fbanks_len = self.frontend.forward_fbank(speech, speech_lengths)
-            feats, feats_len = self.frontend.forward_lfr_cmvn(speech, speech_lengths)
-            feats = to_device(feats, device=self.device)
-            feats_len = feats_len.int()
-            self.asr_model.frontend = None
-        else:
-            feats = speech
-            feats_len = speech_lengths
-        lfr_factor = max(1, (feats.size()[-1]//80)-1)
-        batch = {"speech": feats, "speech_lengths": feats_len}
-
-        # a. To device
-        batch = to_device(batch, device=self.device)
-
-        # b. Forward Encoder
-        enc, enc_len = self.asr_model.encode(**batch)
-        if isinstance(enc, tuple):
-            enc = enc[0]
-        # assert len(enc) == 1, len(enc)
-        enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor
-
-        predictor_outs = self.asr_model.calc_predictor(enc, enc_len)
-        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], predictor_outs[2], predictor_outs[3]
-        pre_token_length = pre_token_length.round().long()
-        decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
-        decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
-
-        results = []
-        b, n, d = decoder_out.size()
-        for i in range(b):
-            x = enc[i, :enc_len[i], :]
-            am_scores = decoder_out[i, :pre_token_length[i], :]
-            if self.beam_search is not None:
-                nbest_hyps = self.beam_search(
-                    x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
-                )
-    
-                nbest_hyps = nbest_hyps[: self.nbest]
-            else:
-                yseq = am_scores.argmax(dim=-1)
-                score = am_scores.max(dim=-1)[0]
-                score = torch.sum(score, dim=-1)
-                # pad with mask tokens to ensure compatibility with sos/eos tokens
-                yseq = torch.tensor(
-                    [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
-                )
-                nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
-                
-            for hyp in nbest_hyps:
-                assert isinstance(hyp, (Hypothesis)), type(hyp)
-    
-                # remove sos/eos and get results
-                last_pos = -1
-                if isinstance(hyp.yseq, list):
-                    token_int = hyp.yseq[1:last_pos]
-                else:
-                    token_int = hyp.yseq[1:last_pos].tolist()
-    
-                # remove blank symbol id, which is assumed to be 0
-                token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
-    
-                # Change integer-ids to tokens
-                token = self.converter.ids2tokens(token_int)
-    
-                if self.tokenizer is not None:
-                    text = self.tokenizer.tokens2text(token)
-                else:
-                    text = None
-
-                time_stamp = time_stamp_lfr6(alphas[i:i+1,], enc_len[i:i+1,], token, begin_time, end_time)
-    
-                results.append((text, token, token_int, time_stamp, enc_len_batch_total, lfr_factor))
-
-        # assert check_return_type(results)
-        return results
-
-class Speech2VadSegment:
-    """Speech2VadSegment class
-
-    Examples:
-        >>> import soundfile
-        >>> speech2segment = Speech2VadSegment("vad_config.yml", "vad.pt")
-        >>> audio, rate = soundfile.read("speech.wav")
-        >>> speech2segment(audio)
-        [[10, 230], [245, 450], ...]
-
-    """
-
-    def __init__(
-            self,
-            vad_infer_config: Union[Path, str] = None,
-            vad_model_file: Union[Path, str] = None,
-            vad_cmvn_file: Union[Path, str] = None,
-            device: str = "cpu",
-            batch_size: int = 1,
-            dtype: str = "float32",
-            **kwargs,
-    ):
-        assert check_argument_types()
-
-        # 1. Build vad model
-        vad_model, vad_infer_args = VADTask.build_model_from_file(
-            vad_infer_config, vad_model_file, device
-        )
-        frontend = None
-        if vad_infer_args.frontend is not None:
-            frontend = WavFrontend(cmvn_file=vad_cmvn_file, **vad_infer_args.frontend_conf)
-
-        # logging.info("vad_model: {}".format(vad_model))
-        # logging.info("vad_infer_args: {}".format(vad_infer_args))
-        vad_model.to(dtype=getattr(torch, dtype)).eval()
-
-        self.vad_model = vad_model
-        self.vad_infer_args = vad_infer_args
-        self.device = device
-        self.dtype = dtype
-        self.frontend = frontend
-
-    @torch.no_grad()
-    def __call__(
-            self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
-    ) -> List[List[int]]:
-        """Inference
-
-        Args:
-            speech: Input speech data
-        Returns:
-            text, token, token_int, hyp
-
-        """
-        assert check_argument_types()
-
-        # Input as audio signal
-        if isinstance(speech, np.ndarray):
-            speech = torch.tensor(speech)
-
-        if self.frontend is not None:
-            self.frontend.filter_length_max = math.inf
-            fbanks, fbanks_len = self.frontend.forward_fbank(speech, speech_lengths)
-            feats, feats_len = self.frontend.forward_lfr_cmvn(fbanks, fbanks_len)
-            fbanks = to_device(fbanks, device=self.device)
-            feats = to_device(feats, device=self.device)
-            feats_len = feats_len.int()
-        else:
-            raise Exception("Need to extract feats first, please configure frontend configuration")
-        batch = {"feats": feats, "feats_lengths": feats_len, "waveform": speech}
-
-        # a. To device
-        batch = to_device(batch, device=self.device)
-
-        # b. Forward Encoder
-        segments = self.vad_model(**batch)
-
-        return fbanks, segments
-
-
-# def inference(
-#         maxlenratio: float,
-#         minlenratio: float,
-#         batch_size: int,
-#         beam_size: int,
-#         ngpu: int,
-#         ctc_weight: float,
-#         lm_weight: float,
-#         penalty: float,
-#         log_level: Union[int, str],
-#         data_path_and_name_and_type,
-#         asr_train_config: Optional[str],
-#         asr_model_file: Optional[str],
-#         cmvn_file: Optional[str] = None,
-#         raw_inputs: Union[np.ndarray, torch.Tensor] = None,
-#         lm_train_config: Optional[str] = None,
-#         lm_file: Optional[str] = None,
-#         token_type: Optional[str] = None,
-#         key_file: Optional[str] = None,
-#         word_lm_train_config: Optional[str] = None,
-#         bpemodel: Optional[str] = None,
-#         allow_variable_data_keys: bool = False,
-#         streaming: bool = False,
-#         output_dir: Optional[str] = None,
-#         dtype: str = "float32",
-#         seed: int = 0,
-#         ngram_weight: float = 0.9,
-#         nbest: int = 1,
-#         num_workers: int = 1,
-#         vad_infer_config: Optional[str] = None,
-#         vad_model_file: Optional[str] = None,
-#         vad_cmvn_file: Optional[str] = None,
-#         time_stamp_writer: bool = False,
-#         punc_infer_config: Optional[str] = None,
-#         punc_model_file: Optional[str] = None,
-#         **kwargs,
-# ):
-#     assert check_argument_types()
+from funasr.bin.vad_inference import Speech2VadSegment
+from funasr.utils.timestamp_tools import time_stamp_sentence, ts_prediction_lfr6_standard
+from funasr.bin.punctuation_infer import Text2Punc
+from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
+from funasr.utils.vad_utils import slice_padding_fbank
+from funasr.bin.asr_inference_paraformer import Speech2Text
+# class Speech2Text:
+#     """Speech2Text class
 #
-#     if word_lm_train_config is not None:
-#         raise NotImplementedError("Word LM is not implemented")
-#     if ngpu > 1:
-#         raise NotImplementedError("only single GPU decoding is supported")
+#     Examples:
+#             >>> import soundfile
+#             >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
+#             >>> audio, rate = soundfile.read("speech.wav")
+#             >>> speech2text(audio)
+#             [(text, token, token_int, hypothesis object), ...]
 #
-#     logging.basicConfig(
-#         level=log_level,
-#         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-#     )
+#     """
 #
-#     if ngpu >= 1 and torch.cuda.is_available():
-#         device = "cuda"
-#     else:
-#         device = "cpu"
+#     def __init__(
+#             self,
+#             asr_train_config: Union[Path, str] = None,
+#             asr_model_file: Union[Path, str] = None,
+#             cmvn_file: Union[Path, str] = None,
+#             lm_train_config: Union[Path, str] = None,
+#             lm_file: Union[Path, str] = None,
+#             token_type: str = None,
+#             bpemodel: str = None,
+#             device: str = "cpu",
+#             maxlenratio: float = 0.0,
+#             minlenratio: float = 0.0,
+#             dtype: str = "float32",
+#             beam_size: int = 20,
+#             ctc_weight: float = 0.5,
+#             lm_weight: float = 1.0,
+#             ngram_weight: float = 0.9,
+#             penalty: float = 0.0,
+#             nbest: int = 1,
+#             frontend_conf: dict = None,
+#             hotword_list_or_file: str = None,
+#             **kwargs,
+#     ):
+#         assert check_argument_types()
 #
-#     # 1. Set random-seed
-#     set_all_random_seed(seed)
+#         # 1. Build ASR model
+#         scorers = {}
+#         asr_model, asr_train_args = ASRTask.build_model_from_file(
+#             asr_train_config, asr_model_file, cmvn_file=cmvn_file, device=device
+#         )
+#         frontend = None
+#         if asr_model.frontend is not None and asr_train_args.frontend_conf is not None:
+#             frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
 #
-#     # 2. Build speech2vadsegment
-#     speech2vadsegment_kwargs = dict(
-#         vad_infer_config=vad_infer_config,
-#         vad_model_file=vad_model_file,
-#         vad_cmvn_file=vad_cmvn_file,
-#         device=device,
-#         dtype=dtype,
-#     )
-#     # logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
-#     speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs)
+#         # logging.info("asr_model: {}".format(asr_model))
+#         # logging.info("asr_train_args: {}".format(asr_train_args))
+#         asr_model.to(dtype=getattr(torch, dtype)).eval()
 #
-#     # 3. Build speech2text
-#     speech2text_kwargs = dict(
-#         asr_train_config=asr_train_config,
-#         asr_model_file=asr_model_file,
-#         cmvn_file=cmvn_file,
-#         lm_train_config=lm_train_config,
-#         lm_file=lm_file,
-#         token_type=token_type,
-#         bpemodel=bpemodel,
-#         device=device,
-#         maxlenratio=maxlenratio,
-#         minlenratio=minlenratio,
-#         dtype=dtype,
-#         beam_size=beam_size,
-#         ctc_weight=ctc_weight,
-#         lm_weight=lm_weight,
-#         ngram_weight=ngram_weight,
-#         penalty=penalty,
-#         nbest=nbest,
-#         frontend_conf=frontend_conf,
-#     )
-#     speech2text = Speech2Text(**speech2text_kwargs)
+#         if asr_model.ctc != None:
+#             ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+#             scorers.update(
+#                 ctc=ctc
+#             )
+#         token_list = asr_model.token_list
+#         scorers.update(
+#             length_bonus=LengthBonus(len(token_list)),
+#         )
 #
-#     text2punc = Text2Punc(punc_infer_config, punc_model_file, device=device, dtype=dtype)
+#         # 2. Build Language model
+#         if lm_train_config is not None:
+#             lm, lm_train_args = LMTask.build_model_from_file(
+#                 lm_train_config, lm_file, device
+#             )
+#             scorers["lm"] = lm.lm
 #
-#     # 3. Build data-iterator
-#     loader = ASRTask.build_streaming_iterator(
-#         data_path_and_name_and_type,
-#         dtype=dtype,
-#         batch_size=1,
-#         key_file=key_file,
-#         num_workers=num_workers,
-#         preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
-#         collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
-#         allow_variable_data_keys=allow_variable_data_keys,
-#         inference=True,
-#     )
+#         # 3. Build ngram model
+#         # ngram is not supported now
+#         ngram = None
+#         scorers["ngram"] = ngram
 #
-#     forward_time_total = 0.0
-#     length_total = 0.0
-#     finish_count = 0
-#     file_count = 1
-#     # 7 .Start for-loop
-#     asr_result_list = []
-#     if output_dir is not None:
-#         writer = DatadirWriter(output_dir)
-#     else:
-#         writer = None
+#         # 4. Build BeamSearch object
+#         # transducer is not supported now
+#         beam_search_transducer = None
 #
-#     for keys, batch in loader:
-#         assert isinstance(batch, dict), type(batch)
-#         assert all(isinstance(s, str) for s in keys), keys
-#         _bs = len(next(iter(batch.values())))
-#         assert len(keys) == _bs, f"{len(keys)} != {_bs}"
-#         # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
+#         weights = dict(
+#             decoder=1.0 - ctc_weight,
+#             ctc=ctc_weight,
+#             lm=lm_weight,
+#             ngram=ngram_weight,
+#             length_bonus=penalty,
+#         )
+#         beam_search = BeamSearch(
+#             beam_size=beam_size,
+#             weights=weights,
+#             scorers=scorers,
+#             sos=asr_model.sos,
+#             eos=asr_model.eos,
+#             vocab_size=len(token_list),
+#             token_list=token_list,
+#             pre_beam_score_key=None if ctc_weight == 1.0 else "full",
+#         )
 #
-#         logging.info("decoding, utt_id: {}".format(keys))
-#         # N-best list of (text, token, token_int, hyp_object)
-#         time_beg = time.time()
-#         vad_results = speech2vadsegment(**batch)
-#         time_end = time.time()
-#         fbanks, vadsegments = vad_results[0], vad_results[1]
-#         for i, segments in enumerate(vadsegments):
-#             result_segments = [["", [], [], ]]
-#             for j, segment_idx in enumerate(segments):
-#                 bed_idx, end_idx = int(segment_idx[0]/10), int(segment_idx[1]/10)
-#                 segment = fbanks[:, bed_idx:end_idx, :].to(device)
-#                 speech_lengths = torch.Tensor([end_idx-bed_idx]).int().to(device)
-#                 batch = {"speech": segment, "speech_lengths": speech_lengths, "begin_time": vadsegments[i][j][0], "end_time": vadsegments[i][j][1]}
-#                 results = speech2text(**batch)
-#                 if len(results) < 1:
-#                     hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-#                     results = [[" ", ["<space>"], [2], 10, 6]] * nbest
-#                 time_end = time.time()
-#                 forward_time = time_end - time_beg
-#                 lfr_factor = results[0][-1]
-#                 length = results[0][-2]
-#                 forward_time_total += forward_time
-#                 length_total += length
-#                 logging.info(
-#                     "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".
-#                         format(length, forward_time, 100 * forward_time / (length*lfr_factor)))
-#                 result_cur = [results[0][:-2]]
-#                 if j == 0:
-#                     result_segments = result_cur
+#         beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
+#         for scorer in scorers.values():
+#             if isinstance(scorer, torch.nn.Module):
+#                 scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
+#
+#         logging.info(f"Decoding device={device}, dtype={dtype}")
+#
+#         # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
+#         if token_type is None:
+#             token_type = asr_train_args.token_type
+#         if bpemodel is None:
+#             bpemodel = asr_train_args.bpemodel
+#
+#         if token_type is None:
+#             tokenizer = None
+#         elif token_type == "bpe":
+#             if bpemodel is not None:
+#                 tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
+#             else:
+#                 tokenizer = None
+#         else:
+#             tokenizer = build_tokenizer(token_type=token_type)
+#         converter = TokenIDConverter(token_list=token_list)
+#         logging.info(f"Text tokenizer: {tokenizer}")
+#
+#         self.asr_model = asr_model
+#         self.asr_train_args = asr_train_args
+#         self.converter = converter
+#         self.tokenizer = tokenizer
+#
+#         # 6. [Optional] Build hotword list from str, local file or url
+#         self.hotword_list = None
+#         self.hotword_list = self.generate_hotwords_list(hotword_list_or_file)
+#
+#         is_use_lm = lm_weight != 0.0 and lm_file is not None
+#         if (ctc_weight == 0.0 or asr_model.ctc == None) and not is_use_lm:
+#             beam_search = None
+#         self.beam_search = beam_search
+#         logging.info(f"Beam_search: {self.beam_search}")
+#         self.beam_search_transducer = beam_search_transducer
+#         self.maxlenratio = maxlenratio
+#         self.minlenratio = minlenratio
+#         self.device = device
+#         self.dtype = dtype
+#         self.nbest = nbest
+#         self.frontend = frontend
+#         self.encoder_downsampling_factor = 1
+#         if asr_train_args.encoder_conf["input_layer"] == "conv2d":
+#             self.encoder_downsampling_factor = 4
+#
+#     @torch.no_grad()
+#     def __call__(
+#             self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+#             begin_time: int = 0, end_time: int = None,
+#     ):
+#         """Inference
+#
+#         Args:
+#                 speech: Input speech data
+#         Returns:
+#                 text, token, token_int, hyp
+#
+#         """
+#         assert check_argument_types()
+#
+#         # Input as audio signal
+#         if isinstance(speech, np.ndarray):
+#             speech = torch.tensor(speech)
+#
+#         if self.frontend is not None:
+#             feats, feats_len = self.frontend.forward(speech, speech_lengths)
+#             # fbanks, fbanks_len = self.frontend.forward_fbank(speech, speech_lengths)
+#             # feats, feats_len = self.frontend.forward_lfr_cmvn(speech, speech_lengths)
+#             feats = to_device(feats, device=self.device)
+#             feats_len = feats_len.int()
+#             self.asr_model.frontend = None
+#         else:
+#             feats = speech
+#             feats_len = speech_lengths
+#         lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+#         batch = {"speech": feats, "speech_lengths": feats_len}
+#
+#         # a. To device
+#         batch = to_device(batch, device=self.device)
+#
+#         # b. Forward Encoder
+#         enc, enc_len = self.asr_model.encode(**batch)
+#         if isinstance(enc, tuple):
+#             enc = enc[0]
+#         # assert len(enc) == 1, len(enc)
+#         enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor
+#
+#         predictor_outs = self.asr_model.calc_predictor(enc, enc_len)
+#         pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
+#                                                                         predictor_outs[2], predictor_outs[3]
+#         pre_token_length = pre_token_length.round().long()
+#         if torch.max(pre_token_length) < 1:
+#             return []
+#
+#         if not isinstance(self.asr_model, ContextualParaformer):
+#             if self.hotword_list:
+#                 logging.warning("Hotword is given but asr model is not a ContextualParaformer.")
+#             decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
+#             decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
+#         else:
+#             decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list)
+#             decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
+#
+#         if isinstance(self.asr_model, BiCifParaformer):
+#             _, _, us_alphas, us_peaks = self.asr_model.calc_predictor_timestamp(enc, enc_len,
+#                                                                                    pre_token_length)  # test no bias cif2
+#
+#         results = []
+#         b, n, d = decoder_out.size()
+#         for i in range(b):
+#             x = enc[i, :enc_len[i], :]
+#             am_scores = decoder_out[i, :pre_token_length[i], :]
+#             if self.beam_search is not None:
+#                 nbest_hyps = self.beam_search(
+#                     x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
+#                 )
+#
+#                 nbest_hyps = nbest_hyps[: self.nbest]
+#             else:
+#                 yseq = am_scores.argmax(dim=-1)
+#                 score = am_scores.max(dim=-1)[0]
+#                 score = torch.sum(score, dim=-1)
+#                 # pad with mask tokens to ensure compatibility with sos/eos tokens
+#                 yseq = torch.tensor(
+#                     [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
+#                 )
+#                 nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
+#
+#             for hyp in nbest_hyps:
+#                 assert isinstance(hyp, (Hypothesis)), type(hyp)
+#
+#                 # remove sos/eos and get results
+#                 last_pos = -1
+#                 if isinstance(hyp.yseq, list):
+#                     token_int = hyp.yseq[1:last_pos]
 #                 else:
-#                     result_segments = [[result_segments[0][i] + result_cur[0][i] for i in range(len(result_cur[0]))]]
+#                     token_int = hyp.yseq[1:last_pos].tolist()
 #
-#             key = keys[0]
-#             result = result_segments[0]
-#             text, token, token_int, time_stamp = result
+#                 # remove blank symbol id, which is assumed to be 0
+#                 token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
+#                 if len(token_int) == 0:
+#                     continue
 #
-#             # Create a directory: outdir/{n}best_recog
-#             if writer is not None:
-#                 ibest_writer = writer[f"1best_recog"]
+#                 # Change integer-ids to tokens
+#                 token = self.converter.ids2tokens(token_int)
 #
-#                 # Write the result to each file
-#                 ibest_writer["token"][key] = " ".join(token)
-#                 ibest_writer["token_int"][key] = " ".join(map(str, token_int))
-#
-#             if text is not None:
-#                 postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
-#                 if len(postprocessed_result) == 3:
-#                     text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1], postprocessed_result[2]
-#                     text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
-#                     text_postprocessed_punc_time_stamp = "predictions: {}  time_stamp: {}".format(text_postprocessed_punc, time_stamp_postprocessed)
+#                 if self.tokenizer is not None:
+#                     text = self.tokenizer.tokens2text(token)
 #                 else:
-#                     text_postprocessed = postprocessed_result
-#                     time_stamp_postprocessed = None
-#                     word_lists = None
-#                     text_postprocessed_punc_time_stamp = None
-#                     punc_id_list = None
+#                     text = None
 #
-#                 item = {'key': key, 'value': text_postprocessed_punc_time_stamp, 'text': text_postprocessed, 'time_stamp': time_stamp_postprocessed, 'punc': punc_id_list}
-#                 asr_result_list.append(item)
-#                 finish_count += 1
-#                 # asr_utils.print_progress(finish_count / file_count)
-#                 if writer is not None:
-#                     ibest_writer["text"][key] = text_postprocessed
-#                     if time_stamp_writer and time_stamp_postprocessed is not None:
-#                         ibest_writer["time_stamp"][key] = " ".join(["-".join(map(str, ts)) for ts in time_stamp_postprocessed])
+#                 if isinstance(self.asr_model, BiCifParaformer):
+#                     _, timestamp = ts_prediction_lfr6_standard(us_alphas[i],
+#                                                             us_peaks[i],
+#                                                             copy.copy(token),
+#                                                             vad_offset=begin_time)
+#                     results.append((text, token, token_int, timestamp, enc_len_batch_total, lfr_factor))
+#                 else:
+#                     results.append((text, token, token_int, enc_len_batch_total, lfr_factor))
 #
-#             logging.info("decoding, utt: {}, predictions: {}, time_stamp: {}".format(key, text_postprocessed_punc, time_stamp_postprocessed))
+#         # assert check_return_type(results)
+#         return results
 #
-#     logging.info("decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".
-#                  format(length_total, forward_time_total, 100 * forward_time_total / (length_total*lfr_factor)))
-#     return asr_result_list
+#     def generate_hotwords_list(self, hotword_list_or_file):
+#         # for None
+#         if hotword_list_or_file is None:
+#             hotword_list = None
+#         # for local txt inputs
+#         elif os.path.exists(hotword_list_or_file) and hotword_list_or_file.endswith('.txt'):
+#             logging.info("Attempting to parse hotwords from local txt...")
+#             hotword_list = []
+#             hotword_str_list = []
+#             with codecs.open(hotword_list_or_file, 'r') as fin:
+#                 for line in fin.readlines():
+#                     hw = line.strip()
+#                     hotword_str_list.append(hw)
+#                     hotword_list.append(self.converter.tokens2ids([i for i in hw]))
+#                 hotword_list.append([self.asr_model.sos])
+#                 hotword_str_list.append('<s>')
+#             logging.info("Initialized hotword list from file: {}, hotword list: {}."
+#                          .format(hotword_list_or_file, hotword_str_list))
+#         # for url, download and generate txt
+#         elif hotword_list_or_file.startswith('http'):
+#             logging.info("Attempting to parse hotwords from url...")
+#             work_dir = tempfile.TemporaryDirectory().name
+#             if not os.path.exists(work_dir):
+#                 os.makedirs(work_dir)
+#             text_file_path = os.path.join(work_dir, os.path.basename(hotword_list_or_file))
+#             local_file = requests.get(hotword_list_or_file)
+#             open(text_file_path, "wb").write(local_file.content)
+#             hotword_list_or_file = text_file_path
+#             hotword_list = []
+#             hotword_str_list = []
+#             with codecs.open(hotword_list_or_file, 'r') as fin:
+#                 for line in fin.readlines():
+#                     hw = line.strip()
+#                     hotword_str_list.append(hw)
+#                     hotword_list.append(self.converter.tokens2ids([i for i in hw]))
+#                 hotword_list.append([self.asr_model.sos])
+#                 hotword_str_list.append('<s>')
+#             logging.info("Initialized hotword list from file: {}, hotword list: {}."
+#                          .format(hotword_list_or_file, hotword_str_list))
+#         # for text str input
+#         elif not hotword_list_or_file.endswith('.txt'):
+#             logging.info("Attempting to parse hotwords as str...")
+#             hotword_list = []
+#             hotword_str_list = []
+#             for hw in hotword_list_or_file.strip().split():
+#                 hotword_str_list.append(hw)
+#                 hotword_list.append(self.converter.tokens2ids([i for i in hw]))
+#             hotword_list.append([self.asr_model.sos])
+#             hotword_str_list.append('<s>')
+#             logging.info("Hotword list: {}.".format(hotword_str_list))
+#         else:
+#             hotword_list = None
+#         return hotword_list
+
 
 def inference(
-    maxlenratio: float,
-    minlenratio: float,
-    batch_size: int,
-    beam_size: int,
-    ngpu: int,
-    ctc_weight: float,
-    lm_weight: float,
-    penalty: float,
-    log_level: Union[int, str],
-    data_path_and_name_and_type,
-    asr_train_config: Optional[str],
-    asr_model_file: Optional[str],
-    cmvn_file: Optional[str] = None,
-    raw_inputs: Union[np.ndarray, torch.Tensor] = None,
-    lm_train_config: Optional[str] = None,
-    lm_file: Optional[str] = None,
-    token_type: Optional[str] = None,
-    key_file: Optional[str] = None,
-    word_lm_train_config: Optional[str] = None,
-    bpemodel: Optional[str] = None,
-    allow_variable_data_keys: bool = False,
-    streaming: bool = False,
-    output_dir: Optional[str] = None,
-    dtype: str = "float32",
-    seed: int = 0,
-    ngram_weight: float = 0.9,
-    nbest: int = 1,
-    num_workers: int = 1,
-    vad_infer_config: Optional[str] = None,
-    vad_model_file: Optional[str] = None,
-    vad_cmvn_file: Optional[str] = None,
-    time_stamp_writer: bool = False,
-    punc_infer_config: Optional[str] = None,
-    punc_model_file: Optional[str] = None,
-    **kwargs,
+        maxlenratio: float,
+        minlenratio: float,
+        batch_size: int,
+        beam_size: int,
+        ngpu: int,
+        ctc_weight: float,
+        lm_weight: float,
+        penalty: float,
+        log_level: Union[int, str],
+        data_path_and_name_and_type,
+        asr_train_config: Optional[str],
+        asr_model_file: Optional[str],
+        cmvn_file: Optional[str] = None,
+        raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+        lm_train_config: Optional[str] = None,
+        lm_file: Optional[str] = None,
+        token_type: Optional[str] = None,
+        key_file: Optional[str] = None,
+        word_lm_train_config: Optional[str] = None,
+        bpemodel: Optional[str] = None,
+        allow_variable_data_keys: bool = False,
+        streaming: bool = False,
+        output_dir: Optional[str] = None,
+        dtype: str = "float32",
+        seed: int = 0,
+        ngram_weight: float = 0.9,
+        nbest: int = 1,
+        num_workers: int = 1,
+        vad_infer_config: Optional[str] = None,
+        vad_model_file: Optional[str] = None,
+        vad_cmvn_file: Optional[str] = None,
+        time_stamp_writer: bool = False,
+        punc_infer_config: Optional[str] = None,
+        punc_model_file: Optional[str] = None,
+        **kwargs,
 ):
-
     inference_pipeline = inference_modelscope(
         maxlenratio=maxlenratio,
         minlenratio=minlenratio,
@@ -636,61 +441,71 @@
     )
     return inference_pipeline(data_path_and_name_and_type, raw_inputs)
 
+
 def inference_modelscope(
-    maxlenratio: float,
-    minlenratio: float,
-    batch_size: int,
-    beam_size: int,
-    ngpu: int,
-    ctc_weight: float,
-    lm_weight: float,
-    penalty: float,
-    log_level: Union[int, str],
-    # data_path_and_name_and_type,
-    asr_train_config: Optional[str],
-    asr_model_file: Optional[str],
-    cmvn_file: Optional[str] = None,
-    lm_train_config: Optional[str] = None,
-    lm_file: Optional[str] = None,
-    token_type: Optional[str] = None,
-    key_file: Optional[str] = None,
-    word_lm_train_config: Optional[str] = None,
-    bpemodel: Optional[str] = None,
-    allow_variable_data_keys: bool = False,
-    output_dir: Optional[str] = None,
-    dtype: str = "float32",
-    seed: int = 0,
-    ngram_weight: float = 0.9,
-    nbest: int = 1,
-    num_workers: int = 1,
-    vad_infer_config: Optional[str] = None,
-    vad_model_file: Optional[str] = None,
-    vad_cmvn_file: Optional[str] = None,
-    time_stamp_writer: bool = False,
-    punc_infer_config: Optional[str] = None,
-    punc_model_file: Optional[str] = None,
-    **kwargs,
+        maxlenratio: float,
+        minlenratio: float,
+        batch_size: int,
+        beam_size: int,
+        ngpu: int,
+        ctc_weight: float,
+        lm_weight: float,
+        penalty: float,
+        log_level: Union[int, str],
+        # data_path_and_name_and_type,
+        asr_train_config: Optional[str],
+        asr_model_file: Optional[str],
+        cmvn_file: Optional[str] = None,
+        lm_train_config: Optional[str] = None,
+        lm_file: Optional[str] = None,
+        token_type: Optional[str] = None,
+        key_file: Optional[str] = None,
+        word_lm_train_config: Optional[str] = None,
+        bpemodel: Optional[str] = None,
+        allow_variable_data_keys: bool = False,
+        output_dir: Optional[str] = None,
+        dtype: str = "float32",
+        seed: int = 0,
+        ngram_weight: float = 0.9,
+        nbest: int = 1,
+        num_workers: int = 1,
+        vad_infer_config: Optional[str] = None,
+        vad_model_file: Optional[str] = None,
+        vad_cmvn_file: Optional[str] = None,
+        time_stamp_writer: bool = True,
+        punc_infer_config: Optional[str] = None,
+        punc_model_file: Optional[str] = None,
+        outputs_dict: Optional[bool] = True,
+        param_dict: dict = None,
+        **kwargs,
 ):
     assert check_argument_types()
-    
+    ncpu = kwargs.get("ncpu", 1)
+    torch.set_num_threads(ncpu)
+
     if word_lm_train_config is not None:
         raise NotImplementedError("Word LM is not implemented")
     if ngpu > 1:
         raise NotImplementedError("only single GPU decoding is supported")
-    
+
     logging.basicConfig(
         level=log_level,
         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
     )
-    
+
+    if param_dict is not None:
+        hotword_list_or_file = param_dict.get('hotword')
+    else:
+        hotword_list_or_file = None
+
     if ngpu >= 1 and torch.cuda.is_available():
         device = "cuda"
     else:
         device = "cpu"
-    
+
     # 1. Set random-seed
     set_all_random_seed(seed)
-    
+
     # 2. Build speech2vadsegment
     speech2vadsegment_kwargs = dict(
         vad_infer_config=vad_infer_config,
@@ -701,7 +516,7 @@
     )
     # logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
     speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs)
-    
+
     # 3. Build speech2text
     speech2text_kwargs = dict(
         asr_train_config=asr_train_config,
@@ -721,15 +536,36 @@
         ngram_weight=ngram_weight,
         penalty=penalty,
         nbest=nbest,
+        hotword_list_or_file=hotword_list_or_file,
     )
     speech2text = Speech2Text(**speech2text_kwargs)
-    
-    text2punc = Text2Punc(punc_infer_config, punc_model_file, device=device, dtype=dtype)
-    
+    text2punc = None
+    if punc_model_file is not None:
+        text2punc = Text2Punc(punc_infer_config, punc_model_file, device=device, dtype=dtype)
+
+    if output_dir is not None:
+        writer = DatadirWriter(output_dir)
+        ibest_writer = writer[f"1best_recog"]
+        ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list)
+
     def _forward(data_path_and_name_and_type,
                  raw_inputs: Union[np.ndarray, torch.Tensor] = None,
                  output_dir_v2: Optional[str] = None,
+                 fs: dict = None,
+                 param_dict: dict = None,
+                 **kwargs,
                  ):
+
+        hotword_list_or_file = None
+        if param_dict is not None:
+            hotword_list_or_file = param_dict.get('hotword')
+
+        if 'hotword' in kwargs:
+            hotword_list_or_file = kwargs['hotword']
+
+        if speech2text.hotword_list is None:
+            speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)
+
         # 3. Build data-iterator
         if data_path_and_name_and_type is None and raw_inputs is not None:
             if isinstance(raw_inputs, torch.Tensor):
@@ -738,6 +574,7 @@
         loader = ASRTask.build_streaming_iterator(
             data_path_and_name_and_type,
             dtype=dtype,
+            fs=fs,
             batch_size=1,
             key_file=key_file,
             num_workers=num_workers,
@@ -746,195 +583,103 @@
             allow_variable_data_keys=allow_variable_data_keys,
             inference=True,
         )
-        
-        forward_time_total = 0.0
-        length_total = 0.0
+
+        if param_dict is not None:
+            use_timestamp = param_dict.get('use_timestamp', True)
+        else:
+            use_timestamp = True
+
         finish_count = 0
         file_count = 1
+        lfr_factor = 6
         # 7 .Start for-loop
         asr_result_list = []
         output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+        writer = None
         if output_path is not None:
             writer = DatadirWriter(output_path)
-        else:
-            writer = None
-        
+            ibest_writer = writer[f"1best_recog"]
+
         for keys, batch in loader:
             assert isinstance(batch, dict), type(batch)
             assert all(isinstance(s, str) for s in keys), keys
             _bs = len(next(iter(batch.values())))
             assert len(keys) == _bs, f"{len(keys)} != {_bs}"
-            # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
-            
-            logging.info("decoding, utt_id: {}".format(keys))
-            # N-best list of (text, token, token_int, hyp_object)
-            time_beg = time.time()
+
             vad_results = speech2vadsegment(**batch)
-            time_end = time.time()
-            fbanks, vadsegments = vad_results[0], vad_results[1]
+            _, vadsegments = vad_results[0], vad_results[1]
+            speech, speech_lengths = batch["speech"],  batch["speech_lengths"]
             for i, segments in enumerate(vadsegments):
-                result_segments = [["", [], [], ]]
-                for j, segment_idx in enumerate(segments):
-                    bed_idx, end_idx = int(segment_idx[0] / 10), int(segment_idx[1] / 10)
-                    segment = fbanks[:, bed_idx:end_idx, :].to(device)
-                    speech_lengths = torch.Tensor([end_idx - bed_idx]).int().to(device)
-                    batch = {"speech": segment, "speech_lengths": speech_lengths, "begin_time": vadsegments[i][j][0],
-                             "end_time": vadsegments[i][j][1]}
+                result_segments = [["", [], [], []]]
+                # for j, segment_idx in enumerate(segments):
+                for j, beg_idx in enumerate(range(0, len(segments), batch_size)):
+                    end_idx = min(len(segments), beg_idx + batch_size)
+                    speech_j, speech_lengths_j = slice_padding_fbank(speech, speech_lengths, segments[beg_idx:end_idx])
+
+                    batch = {"speech": speech_j, "speech_lengths": speech_lengths_j}
+                    batch = to_device(batch, device=device)
                     results = speech2text(**batch)
                     if len(results) < 1:
-                        hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                        results = [[" ", ["<space>"], [2], 10, 6]] * nbest
-                    time_end = time.time()
-                    forward_time = time_end - time_beg
-                    lfr_factor = results[0][-1]
-                    length = results[0][-2]
-                    forward_time_total += forward_time
-                    length_total += length
-                    logging.info(
-                        "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".
-                        format(length, forward_time, 100 * forward_time / (length * lfr_factor)))
+                        continue
+
                     result_cur = [results[0][:-2]]
                     if j == 0:
                         result_segments = result_cur
                     else:
-                        result_segments = [[result_segments[0][i] + result_cur[0][i] for i in range(len(result_cur[0]))]]
-                
+                        result_segments = [
+                            [result_segments[0][i] + result_cur[0][i] for i in range(len(result_cur[0]))]]
+
                 key = keys[0]
                 result = result_segments[0]
-                text, token, token_int, time_stamp = result
-                
-                # Create a directory: outdir/{n}best_recog
+                text, token, token_int, hyp = result[0], result[1], result[2], result[3]
+                time_stamp = None if len(result) < 5 else result[4]
+
+
+                if use_timestamp and time_stamp is not None: 
+                    postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
+                else:
+                    postprocessed_result = postprocess_utils.sentence_postprocess(token)
+                text_postprocessed = ""
+                time_stamp_postprocessed = ""
+                text_postprocessed_punc = postprocessed_result
+                if len(postprocessed_result) == 3:
+                    text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
+                                                                               postprocessed_result[1], \
+                                                                               postprocessed_result[2]
+                else:
+                    text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
+
+                text_postprocessed_punc = text_postprocessed
+                punc_id_list = []
+                if len(word_lists) > 0 and text2punc is not None:
+                    text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
+
+                item = {'key': key, 'value': text_postprocessed_punc}
+                if text_postprocessed != "":
+                    item['text_postprocessed'] = text_postprocessed
+                if time_stamp_postprocessed != "":
+                    item['time_stamp'] = time_stamp_postprocessed
+
+                item['sentences'] = time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocessed)
+
+                asr_result_list.append(item)
+                finish_count += 1
+                # asr_utils.print_progress(finish_count / file_count)
                 if writer is not None:
-                    ibest_writer = writer[f"1best_recog"]
-                    
                     # Write the result to each file
                     ibest_writer["token"][key] = " ".join(token)
-                    # ibest_writer["token_int"][key] = " ".join(map(str, token_int))
-                
-                if text is not None:
-                    postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
-                    if len(postprocessed_result) == 3:
-                        text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
-                                                                                   postprocessed_result[1], \
-                                                                                   postprocessed_result[2]
-                        text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
-                        text_postprocessed_punc_time_stamp = "predictions: {}  time_stamp: {}".format(
-                            text_postprocessed_punc, time_stamp_postprocessed)
-                    else:
-                        text_postprocessed = postprocessed_result
-                        time_stamp_postprocessed = None
-                        word_lists = None
-                        text_postprocessed_punc_time_stamp = None
-                        punc_id_list = None
-                    
-                    item = {'key': key, 'value': text_postprocessed_punc_time_stamp, 'text': text_postprocessed,
-                            'time_stamp': time_stamp_postprocessed, 'punc': punc_id_list}
-                    asr_result_list.append(item)
-                    finish_count += 1
-                    # asr_utils.print_progress(finish_count / file_count)
-                    if writer is not None:
-                        ibest_writer["text"][key] = text_postprocessed
-                        if time_stamp_writer and time_stamp_postprocessed is not None:
-                            ibest_writer["time_stamp"][key] = " ".join(
-                                ["-".join(map(str, ts)) for ts in time_stamp_postprocessed])
-                
-                logging.info("decoding, utt: {}, predictions: {}, time_stamp: {}".format(key, text_postprocessed_punc,
-                                                                                         time_stamp_postprocessed))
-        
-        logging.info("decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".
-                     format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor)))
+                    ibest_writer["token_int"][key] = " ".join(map(str, token_int))
+                    ibest_writer["vad"][key] = "{}".format(vadsegments)
+                    ibest_writer["text"][key] = " ".join(word_lists)
+                    ibest_writer["text_with_punc"][key] = text_postprocessed_punc
+                    if time_stamp_postprocessed is not None:
+                        ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
+
+                logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc))
         return asr_result_list
+
     return _forward
 
-def Text2Punc(
-    train_config: Optional[str],
-    model_file: Optional[str],
-    device: str = "cpu",
-    dtype: str = "float32",
-):
-   
-    # 2. Build Model
-    model, train_args = PunctuationTask.build_model_from_file(
-        train_config, model_file, device)
-    # Wrape model to make model.nll() data-parallel
-    wrapped_model = ForwardAdaptor(model, "inference")
-    wrapped_model.to(dtype=getattr(torch, dtype)).to(device=device).eval()
-    # logging.info(f"Model:\n{model}")
-    punc_list = train_args.punc_list
-    period = 0
-    for i in range(len(punc_list)):
-        if punc_list[i] == ",":
-            punc_list[i] = "锛�"
-        elif punc_list[i] == "?":
-            punc_list[i] = "锛�"
-        elif punc_list[i] == "銆�":
-            period = i
-    
-    preprocessor = CommonPreprocessor(
-        train=False,
-        token_type="word",
-        token_list=train_args.token_list,
-        bpemodel=train_args.bpemodel,
-        text_cleaner=train_args.cleaner,
-        g2p_type=train_args.g2p,
-        text_name="text",
-        non_linguistic_symbols=train_args.non_linguistic_symbols,
-    )
-
-    print("start decoding!!!")
-    
-    def _forward(words, split_size = 20):
-        cache_sent = []
-        mini_sentences = split_to_mini_sentence(words, split_size)
-        new_mini_sentence = ""
-        new_mini_sentence_punc = ""
-        for mini_sentence_i in range(len(mini_sentences)):
-            mini_sentence = mini_sentences[mini_sentence_i]
-            mini_sentence = cache_sent + mini_sentence
-            data = {"text": " ".join(mini_sentence)}
-            batch = preprocessor(data=data, uid="12938712838719")
-            batch["text_lengths"] = torch.from_numpy(np.array([len(batch["text"])], dtype='int32'))
-            batch["text"] = torch.from_numpy(batch["text"])
-            # Extend one dimension to fake a batch dim.
-            batch["text"] = torch.unsqueeze(batch["text"], 0)
-            batch = to_device(batch, device)
-            y, _ = wrapped_model(**batch)
-            _, indices = y.view(-1, y.shape[-1]).topk(1, dim=1)
-            punctuations = indices
-            if indices.size()[0] != 1:
-                punctuations = torch.squeeze(indices)
-            assert punctuations.size()[0] == len(mini_sentence)
-            
-            # Search for the last Period/QuestionMark as cache
-            if mini_sentence_i < len(mini_sentences) - 1:
-                sentenceEnd = -1
-                for i in range(len(punctuations) - 2, 1, -1):
-                    if punc_list[punctuations[i]] == "銆�" or punc_list[punctuations[i]] == "锛�":
-                        sentenceEnd = i
-                        break
-                
-                cache_sent = mini_sentence[sentenceEnd + 1:]
-                mini_sentence = mini_sentence[0:sentenceEnd + 1]
-                punctuations = punctuations[0:sentenceEnd + 1]
-    
-            # if len(punctuations) == 0:
-            #    continue
-            
-            punctuations_np = punctuations.cpu().numpy()
-            new_mini_sentence_punc += "".join([str(x) for x in punctuations_np])
-            words_with_punc = []
-            for i in range(len(mini_sentence)):
-                if i > 0:
-                    if len(mini_sentence[i][0].encode()) == 1 and len(mini_sentence[i - 1][0].encode()) == 1:
-                        mini_sentence[i] = " " + mini_sentence[i]
-                words_with_punc.append(mini_sentence[i])
-                if punc_list[punctuations[i]] != "_":
-                    words_with_punc.append(punc_list[punctuations[i]])
-            new_mini_sentence += "".join(words_with_punc)
-            
-        return new_mini_sentence, new_mini_sentence_punc
-            
-    return _forward
 
 def get_parser():
     parser = config_argparse.ArgumentParser(

--
Gitblit v1.9.1