python/FunASR-XL.git

parent: 6745487e | 补丁 | 提交 | ignore whitespace

funasr2 paraformer bicifparaformer contextualparaformer

游雁

2023-12-13 806a03609df033d61f824f1ab8527eb88fe837ad

funasr2 paraformer bicifparaformer contextualparaformer

9个文件已修改

19个文件已删除

5个文件已添加

1 文件已复制

1 文件已重命名

	.gitignore	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	examples/industrial_data_pretraining/paraformer-large/infer.sh	15 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	examples/industrial_data_pretraining/paraformer-large/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/__init__.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/asr_inference_launch.py	31 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/asr_train.py	67 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/inference.py	170 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/train.py	595 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_args.py	122 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_asr_model.py	559 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_dataloader.py	28 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_diar_model.py	326 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_distributed.py	38 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_lm_model.py	62 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_model.py	31 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_model_from_file.py	193 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_optimizer.py	28 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_pretrain_model.py	112 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_punc_model.py	68 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_scheduler.py	44 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_ss_model.py	15 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_streaming_iterator.py	65 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_sv_model.py	256 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_trainer.py	812 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/build_utils/build_vad_model.py	81 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/cli/train_cli.py	8 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/dataset_jsonl.py	33 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/fun_datasets/__init__.py	补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/fun_datasets/load_audio_extract_fbank.py	75 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/frontend/wav_frontend.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/paraformer/__init__.py	补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/paraformer/model.py	1760 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/paraformer/search.py	453 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/tokenizer/abs_tokenizer.py	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/utils/download_from_hub.py	24 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 .gitignore

@@ -21,3 +21,4 @@
modelscope
samples
.ipynb_checkpoints
outputs*

 examples/industrial_data_pretraining/paraformer-large/infer.sh

New file
@@ -0,0 +1,15 @@

# Example: run single-file inference through funasr/bin/inference.py.
# The entry point is decorated with @hydra.main, so every `+key=value`
# argument below is passed on the command line as a configuration override
# (presumably Hydra's "append a new key" syntax - confirm against the Hydra docs).
# NOTE(review): all paths are absolute paths on the author's machine and must
# be replaced with local model / audio / output locations before running.
cmd="funasr/bin/inference.py"

python $cmd \
+model="/Users/zhifu/modelscope_models/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" \
+input="/Users/zhifu/Downloads/asr_example.wav" \
+output_dir="/Users/zhifu/Downloads/ckpt/funasr2/exp2" \
+device="cpu" \
+"hotword='达魔院 魔搭'"

#+input="/Users/zhifu/funasr_github/test_local/asr_example.wav" \
#+input="/Users/zhifu/funasr_github/test_local/aishell2_dev_ios/asr_task_debug_len.jsonl" \
#+model="/Users/zhifu/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \

#+model="/Users/zhifu/modelscope_models/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \

 examples/industrial_data_pretraining/paraformer-large/run.sh

@@ -2,7 +2,7 @@
cmd="funasr/cli/train_cli.py"

python $cmd \
+model_pretrain="/Users/zhifu/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+model="/Users/zhifu/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+token_list="/Users/zhifu/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/tokens.txt" \
+train_data_set_list="/Users/zhifu/funasr_github/test_local/aishell2_dev_ios/asr_task_debug_len.jsonl" \
+output_dir="/Users/zhifu/Downloads/ckpt/funasr2/exp2" \

 funasr/__init__.py

@@ -7,4 +7,4 @@
with open(version_file, "r") as f:
    __version__ = f.read().strip()

from funasr.bin.inference_cli import infer
from funasr.bin.inference import infer

 funasr/bin/asr_inference_launch.py

@@ -1254,37 +1254,6 @@

        return cache

    #def _prepare_cache(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
    #    if len(cache) > 0:
    #        return cache
    #    config = _read_yaml(asr_train_config)
    #    enc_output_size = config["encoder_conf"]["output_size"]
    #    feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
    #    cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
    #                "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
    #                "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
    #    cache["encoder"] = cache_en

    #    cache_de = {"decode_fsmn": None}
    #    cache["decoder"] = cache_de

    #    return cache

    #def _cache_reset(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
    #    if len(cache) > 0:
    #        config = _read_yaml(asr_train_config)
    #        enc_output_size = config["encoder_conf"]["output_size"]
    #        feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
    #        cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
    #                    "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
    #                    "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)),
    #                    "tail_chunk": False}
    #        cache["encoder"] = cache_en

    #        cache_de = {"decode_fsmn": None}
    #        cache["decoder"] = cache_de

    #    return cache

    def _forward(
            data_path_and_name_and_type,

 funasr/bin/asr_train.py

File was deleted

 funasr/bin/inference.py

New file
@@ -0,0 +1,170 @@
import os.path

import torch
import numpy as np
import hydra
import json
from omegaconf import DictConfig, OmegaConf
from funasr.utils.dynamic_import import dynamic_import
import logging
from funasr.utils.download_from_hub import download_model
from funasr.torch_utils.set_all_random_seed import set_all_random_seed
from funasr.tokenizer.funtoken import build_tokenizer
from funasr.datasets.fun_datasets.load_audio_extract_fbank import load_bytes
from funasr.torch_utils.device_funcs import to_device
from tqdm import tqdm
from funasr.torch_utils.load_pretrained_model import load_pretrained_model
import time
import random
import string

@hydra.main(config_name=None, version_base=None)
def main_hydra(kwargs: DictConfig):
    """Command-line entry point: build the inference pipeline and run it once.

    Args:
        kwargs: Configuration assembled by Hydra from the command line. It must
            contain ``model`` (used to build the pipeline) and ``input`` (the
            data handed to the pipeline).

    The pipeline result is printed to stdout.
    """
    assert "model" in kwargs

    run_pipeline = infer(**kwargs)
    print(run_pipeline(input=kwargs["input"]))
	
def infer(**kwargs):
    """Build a model and return a callable that runs inference with it.

    Args:
        **kwargs: Inference configuration. Keys read here include ``model``,
            ``model_conf``, ``seed``, ``device``, ``ngpu``, ``batch_size``,
            ``init_param`` and the tokenizer options (``token_type``,
            ``token_list``, ...). The whole mapping is also forwarded to the
            model constructor and, merged with per-call overrides, to
            ``model.generate``.

    Returns:
        A function ``_forward(input, input_len=None, **cfg)`` that returns the
        list of per-batch results produced by ``model.generate``.
    """
    if ":" not in kwargs["model"]:
        logging.info("download models from model hub: {}".format(kwargs.get("model_hub", "ms")))
        kwargs = download_model(**kwargs)

    set_all_random_seed(kwargs.get("seed", 0))

    # Fall back to CPU only when CUDA is unavailable or the caller explicitly
    # requests zero GPUs. The batch size is always bound so that `_forward`
    # can rely on it regardless of the device that was selected.
    device = kwargs.get("device", "cuda")
    batch_size = kwargs.get("batch_size", 1)
    if not torch.cuda.is_available() or kwargs.get("ngpu", 1) == 0:
        device = "cpu"
        batch_size = 1
    kwargs["device"] = device

    # build_tokenizer
    tokenizer = build_tokenizer(
        token_type=kwargs.get("token_type", "char"),
        bpemodel=kwargs.get("bpemodel", None),
        delimiter=kwargs.get("delimiter", None),
        space_symbol=kwargs.get("space_symbol", "<space>"),
        non_linguistic_symbols=kwargs.get("non_linguistic_symbols", None),
        g2p_type=kwargs.get("g2p_type", None),
        token_list=kwargs.get("token_list", None),
        unk_symbol=kwargs.get("unk_symbol", "<unk>"),
    )

    # build model
    model_class = dynamic_import(kwargs.get("model"))
    model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=len(tokenizer.token_list))
    model.eval()
    model.to(device)
    frontend = model.frontend
    kwargs["token_list"] = tokenizer.token_list

    # init_param
    init_param = kwargs.get("init_param", None)
    if init_param is not None:
        logging.info(f"Loading pretrained params from {init_param}")
        load_pretrained_model(
            model=model,
            init_param=init_param,
            ignore_init_mismatch=kwargs.get("ignore_init_mismatch", False),
            oss_bucket=kwargs.get("oss_bucket", None),
        )

    def _forward(input, input_len=None, **cfg):
        """Run the model over ``input`` in batches and collect the results."""
        # Per-call options override the configuration captured at build time.
        cfg = OmegaConf.merge(kwargs, cfg)
        date_type = cfg.get("date_type", "sound")

        key_list, data_list = build_iter_for_infer(input, input_len=input_len, date_type=date_type, frontend=frontend)

        speed_stats = {}
        asr_result_list = []
        num_samples = len(data_list)
        pbar = tqdm(colour="blue", total=num_samples, dynamic_ncols=True)
        for beg_idx in range(0, num_samples, batch_size):

            end_idx = min(num_samples, beg_idx + batch_size)
            data_batch = data_list[beg_idx:end_idx]
            key_batch = key_list[beg_idx:end_idx]
            batch = {"data_in": data_batch, "key": key_batch}

            time1 = time.perf_counter()
            results, meta_data = model.generate(**batch, tokenizer=tokenizer, **cfg)
            time2 = time.perf_counter()

            asr_result_list.append(results)
            # The bar total counts samples, so advance by the batch's sample count.
            pbar.update(end_idx - beg_idx)

            batch_data_time = meta_data.get("batch_data_time", -1)
            speed_stats["load_data"] = meta_data["load_data"]
            speed_stats["extract_feat"] = meta_data["extract_feat"]
            speed_stats["forward"] = f"{time2 - time1:0.3f}"
            speed_stats["rtf"] = f"{(time2 - time1)/batch_data_time:0.3f}"
            description = (
                f"{speed_stats}, "
            )
            pbar.set_description(description)

        torch.cuda.empty_cache()
        return asr_result_list

    return _forward
	

def build_iter_for_infer(data_in, input_len=None, date_type="sound", frontend=None):
    """Normalise an inference input into parallel lists of keys and data items.

    Supported inputs:
      * path to a list file (``.scp``, ``.txt``, ``.json``, ``.jsonl``): one
        item per non-empty line. ``.jsonl`` lines are JSON objects with a
        ``source`` field and an optional ``key`` field; other list files hold
        ``key<whitespace>data`` or just ``data`` per line.
      * any other existing path, or any other object (raw text, samples,
        features): treated as a single item.
      * list / tuple: each element is one item.
      * bytes: decoded with ``load_bytes`` into a single item.

    Items without an explicit key get a random ``rand_key_<13 chars>`` key.

    :param data_in: the input described above
    :param input_len: unused here; accepted for interface compatibility
    :param date_type: unused here; accepted for interface compatibility
    :param frontend: unused here; accepted for interface compatibility
    :return: ``(key_list, data_list)``
    """
    filelist = (".scp", ".txt", ".json", ".jsonl")
    chars = string.ascii_letters + string.digits

    def _random_key():
        return "rand_key_" + ''.join(random.choice(chars) for _ in range(13))

    if isinstance(data_in, str) and os.path.exists(data_in):
        file_extension = os.path.splitext(data_in)[1].lower()
        if file_extension not in filelist:
            # A single file on disk (e.g. one wav file).
            return [_random_key()], [data_in]

        key_list, data_list = [], []
        is_jsonl = file_extension == ".jsonl"
        with open(data_in, encoding='utf-8') as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    # Blank lines carry no item; skipping them avoids a crash.
                    continue
                if is_jsonl:
                    record = json.loads(line)
                    data = record["source"]
                    # The key lives on the JSON record; a dict-valued source
                    # carrying its own "key" is still honoured. A plain
                    # string source must never be searched for "key".
                    if isinstance(data, dict) and "key" in data:
                        key = data["key"]
                    else:
                        key = record.get("key", _random_key())
                else:
                    # Split once so data containing spaces stays intact.
                    fields = line.split(maxsplit=1)
                    if len(fields) > 1:
                        key, data = fields
                    else:
                        key, data = _random_key(), fields[0]
                data_list.append(data)
                key_list.append(key)
        return key_list, data_list

    if isinstance(data_in, (list, tuple)):
        return [_random_key() for _ in range(len(data_in))], data_in

    if isinstance(data_in, bytes):
        data_in = load_bytes(data_in)
    return [_random_key()], [data_in]


if __name__ == '__main__':
    main_hydra()

 funasr/bin/train.py

File was deleted

 funasr/build_utils/build_args.py

File was deleted

 funasr/build_utils/build_asr_model.py

File was deleted

 funasr/build_utils/build_dataloader.py

File was deleted

 funasr/build_utils/build_diar_model.py

File was deleted

 funasr/build_utils/build_distributed.py

File was deleted

 funasr/build_utils/build_lm_model.py

File was deleted

 funasr/build_utils/build_model.py

File was deleted

 funasr/build_utils/build_model_from_file.py

File was deleted

 funasr/build_utils/build_optimizer.py

File was deleted

 funasr/build_utils/build_pretrain_model.py

File was deleted

 funasr/build_utils/build_punc_model.py

File was deleted

 funasr/build_utils/build_scheduler.py

File was deleted

 funasr/build_utils/build_ss_model.py

File was deleted

 funasr/build_utils/build_streaming_iterator.py

File was deleted

 funasr/build_utils/build_sv_model.py

File was deleted

 funasr/build_utils/build_trainer.py

File was deleted

 funasr/build_utils/build_vad_model.py

File was deleted

 funasr/cli/train_cli.py

@@ -35,8 +35,9 @@
@hydra.main(config_name=None, version_base=None)
def main_hydra(kwargs: DictConfig):
    import pdb; pdb.set_trace()
    if kwargs.get("model_pretrain"):
        kwargs = download_model(**kwargs)
    if ":" in kwargs["model"]:
        logging.info("download models from model hub: {}".format(kwargs.get("model_hub", "ms")))
        kwargs = download_model(is_training=kwargs.get("is_training", True), **kwargs)
    
    import pdb;
    pdb.set_trace()
@@ -84,8 +85,7 @@
    # init_param
    init_param = kwargs.get("init_param", None)
    if init_param is not None:
        init_param = init_param
        if isinstance(init_param, Sequence):
        if not isinstance(init_param, Sequence):
            init_param = (init_param,)
        logging.info("init_param is not None: %s", init_param)
        for p in init_param:

 funasr/datasets/dataset_jsonl.py

@@ -8,33 +8,7 @@
import time
import logging

def load_audio(audio_path: str, fs: int=16000):
    """Load audio from ``audio_path``.

    ``oss:`` and ``odps:`` paths are recognised but not handled, so ``None``
    is returned for them. Paths containing ``.ark:`` are read with
    ``kaldiio.load_mat``; anything else is read with ``torchaudio.load`` and
    only the first channel is returned. The ``fs`` argument does not affect
    loading.
    """
    if audio_path.startswith(("oss:", "odps:")):
        return None
    if ".ark:" in audio_path:
        return kaldiio.load_mat(audio_path)
    waveform, fs = torchaudio.load(audio_path)
    return waveform[0, :]

def extract_features(data, date_type: str="sound", frontend=None):
    """Turn one utterance into a feature tensor and its length tensor.

    For ``date_type == "sound"`` the data (ndarray or tensor) is passed through
    ``frontend`` as a batch of one and the batch axis is removed from the
    returned features. Otherwise ``data`` is taken to be a precomputed ndarray
    and is only converted to a float32 tensor, with its first dimension as the
    length.
    """
    if date_type != "sound":
        feat = torch.from_numpy(data).to(torch.float32)
        feats_lens = torch.tensor([data.shape[0]]).to(torch.int32)
        return feat, feats_lens

    if isinstance(data, np.ndarray):
        data = torch.from_numpy(data).to(torch.float32)
    num_samples = torch.tensor([data.shape[0]]).to(torch.int32)
    batched_feat, feats_lens = frontend(data[None, :], num_samples)
    return batched_feat[0, :, :], feats_lens
from funasr.datasets.fun_datasets.load_audio_extract_fbank import load_audio, extract_fbank
    
    

@@ -115,17 +89,16 @@
    
    def __getitem__(self, index):
        item = self.indexed_dataset[index]
        # return item

        source = item["source"]
        data_src = load_audio(source, fs=self.fs)
        speech, speech_lengths = extract_features(data_src, self.data_type, self.frontend)
        speech, speech_lengths = extract_fbank(data_src, self.data_type, self.frontend) # speech: [b, T, d]
        target = item["target"]
        ids = self.tokenizer.encode(target)
        ids_lengths = len(ids)
        text, text_lengths = torch.tensor(ids, dtype=torch.int64), torch.tensor([ids_lengths], dtype=torch.int32)

        return {"speech": speech,
        return {"speech": speech[0, :, :],
                "speech_lengths": speech_lengths,
                "text": text,
                "text_lengths": text_lengths,

 funasr/datasets/fun_datasets/__init__.py

copy from funasr/build_utils/__init__.py
copy to funasr/datasets/fun_datasets/__init__.py

 funasr/datasets/fun_datasets/load_audio_extract_fbank.py

New file
@@ -0,0 +1,75 @@
import os
import torch
import json
import torch.distributed as dist
import numpy as np
import kaldiio
import librosa
import torchaudio
import time
import logging
from torch.nn.utils.rnn import pad_sequence

def load_audio(audio_or_path_or_list, fs: int=16000, audio_fs: int=16000):
    """Load audio and resample it to ``fs`` when its rate differs.

    Args:
        audio_or_path_or_list: A path to an audio file, an ndarray or tensor
            of samples, or a list / tuple of those (processed element-wise).
        fs: Target sampling rate.
        audio_fs: Sampling rate of in-memory samples. For file paths it is
            replaced by the rate reported by ``torchaudio.load``.

    Returns:
        The first channel of the file as a tensor, the squeezed ndarray, or a
        list of such items. Array-like audio is resampled to ``fs`` when
        ``audio_fs != fs`` (ndarray input stays an ndarray). Any other input
        is returned unchanged.
    """
    if isinstance(audio_or_path_or_list, (list, tuple)):
        return [load_audio(audio, fs=fs, audio_fs=audio_fs) for audio in audio_or_path_or_list]

    audio = audio_or_path_or_list
    if isinstance(audio, str) and os.path.exists(audio):
        audio, audio_fs = torchaudio.load(audio)
        audio = audio[0, :]
    elif isinstance(audio, np.ndarray):  # audio sample points
        audio = np.squeeze(audio)  # [n_samples,]

    if audio_fs != fs and isinstance(audio, (np.ndarray, torch.Tensor)):
        # The resampler works on tensors, so ndarray input is converted for
        # the call and converted back afterwards to preserve the input type.
        is_numpy = isinstance(audio, np.ndarray)
        waveform = torch.from_numpy(audio).to(torch.float32) if is_numpy else audio
        resampler = torchaudio.transforms.Resample(audio_fs, fs)
        waveform = resampler(waveform[None, :])[0, :]
        audio = waveform.numpy() if is_numpy else waveform
    return audio
#
# def load_audio_from_list(audio_list, fs: int=16000, audio_fs: int=16000):
#     if isinstance(audio_list, (list, tuple)):
#         return [load_audio(audio_or_path, fs=fs, audio_fs=audio_fs) for audio_or_path in audio_list]


def load_bytes(input):
    """Decode raw 16-bit integer PCM bytes into a float32 sample array.

    The buffer is interpreted as int16 samples and each sample is divided by
    2 ** 15, so int16 full scale maps to the range [-1.0, 1.0).
    """
    pcm = np.asarray(np.frombuffer(input, dtype=np.int16))
    if pcm.dtype.kind not in 'iu':
        raise TypeError("'middle_data' must be an array of integers")
    target_dtype = np.dtype('float32')
    if target_dtype.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    int_info = np.iinfo(pcm.dtype)
    full_scale = 2 ** (int_info.bits - 1)
    zero_point = int_info.min + full_scale
    scaled = (pcm.astype(target_dtype) - zero_point) / full_scale
    return np.frombuffer(scaled, dtype=np.float32)

def extract_fbank(data, data_len = None, date_type: str="sound", frontend=None):
    """Batch the input and, for raw audio, run it through ``frontend``.

    Args:
        data: A single ndarray / tensor (1-D input gets a leading batch axis)
            or a list / tuple of per-item ndarrays / tensors, which are
            padded to a common length along their first axis.
        data_len: Optional lengths for array / tensor input; defaults to the
            size of the second axis after batching. Ignored for list input,
            where the lengths are taken from the items.
        date_type: When ``"sound"``, ``frontend(data, data_len)`` is called and
            its outputs replace ``data`` and ``data_len``.
        frontend: Callable used for ``"sound"`` input.

    Returns:
        ``(data, data_len)`` as float32 and int32 tensors. Lengths that are
        still a Python list at the end are wrapped as ``torch.tensor([lens])``.
    """
    if isinstance(data, (list, tuple)):
        # Check each element (not the container) when converting to tensors.
        tensors = [torch.from_numpy(item) if isinstance(item, np.ndarray) else item for item in data]
        data_len = [item.shape[0] for item in tensors]
        data = pad_sequence(tensors, batch_first=True)  # data: [batch, N]
    elif isinstance(data, (np.ndarray, torch.Tensor)):
        if isinstance(data, np.ndarray):
            data = torch.from_numpy(data)
        # Test the rank, not the number of samples, to decide on batching.
        if data.dim() < 2:
            data = data[None, :]  # data: [batch, N]
        if data_len is None:
            data_len = [data.shape[1]]

    if date_type == "sound":
        data, data_len = frontend(data, data_len)

    if isinstance(data_len, (list, tuple)):
        data_len = torch.tensor([data_len])
    return data.to(torch.float32), data_len.to(torch.int32)

 funasr/models/frontend/wav_frontend.py

@@ -116,7 +116,7 @@
    def forward(
            self,
            input: torch.Tensor,
            input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
            input_lengths) -> Tuple[torch.Tensor, torch.Tensor]:
        batch_size = input.size(0)
        feats = []
        feats_lens = []

 funasr/models/paraformer/__init__.py


 funasr/models/paraformer/model.py

New file
@@ -0,0 +1,1760 @@
import logging
from contextlib import contextmanager
from distutils.version import LooseVersion
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union
import tempfile
import codecs
import requests
import re
import copy
import torch
import torch.nn as nn
import random
import numpy as np
import time
# from funasr.layers.abs_normalize import AbsNormalize
from funasr.losses.label_smoothing_loss import (
    LabelSmoothingLoss,  # noqa: H301
)
# from funasr.models.ctc import CTC
# from funasr.models.decoder.abs_decoder import AbsDecoder
# from funasr.models.e2e_asr_common import ErrorCalculator
# from funasr.models.encoder.abs_encoder import AbsEncoder
# from funasr.models.frontend.abs_frontend import AbsFrontend
# from funasr.models.postencoder.abs_postencoder import AbsPostEncoder
from funasr.models.predictor.cif import mae_loss
# from funasr.models.preencoder.abs_preencoder import AbsPreEncoder
# from funasr.models.specaug.abs_specaug import AbsSpecAug
from funasr.modules.add_sos_eos import add_sos_eos
from funasr.modules.nets_utils import make_pad_mask, pad_list
from funasr.modules.nets_utils import th_accuracy
from funasr.torch_utils.device_funcs import force_gatherable
# from funasr.models.base_model import FunASRModel
# from funasr.models.predictor.cif import CifPredictorV3
from funasr.models.paraformer.search import Hypothesis

from funasr.cli.model_class_factory import *

if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
    from torch.cuda.amp import autocast
else:
    # Nothing to do if torch<1.6.0
    @contextmanager
    def autocast(enabled=True):
        yield
from funasr.datasets.fun_datasets.load_audio_extract_fbank import load_audio, extract_fbank
from funasr.utils import postprocess_utils
from funasr.fileio.datadir_writer import DatadirWriter
from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard

class Paraformer(nn.Module):
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    """
	
    def __init__(
        self,
        # token_list: Union[Tuple[str, ...], List[str]],
        frontend: Optional[str] = None,
        frontend_conf: Optional[Dict] = None,
        specaug: Optional[str] = None,
        specaug_conf: Optional[Dict] = None,
        normalize: str = None,
        normalize_conf: Optional[Dict] = None,
        encoder: str = None,
        encoder_conf: Optional[Dict] = None,
        decoder: str = None,
        decoder_conf: Optional[Dict] = None,
        ctc: str = None,
        ctc_conf: Optional[Dict] = None,
        predictor: str = None,
        predictor_conf: Optional[Dict] = None,
        ctc_weight: float = 0.5,
        input_size: int = 80,
        vocab_size: int = -1,
        ignore_id: int = -1,
        blank_id: int = 0,
        sos: int = 1,
        eos: int = 2,
        lsm_weight: float = 0.0,
        length_normalized_loss: bool = False,
        # report_cer: bool = True,
        # report_wer: bool = True,
        # sym_space: str = "<space>",
        # sym_blank: str = "<blank>",
        # extract_feats_in_collect_stats: bool = True,
        # predictor=None,
        predictor_weight: float = 0.0,
        predictor_bias: int = 0,
        sampling_ratio: float = 0.2,
        share_embedding: bool = False,
        # preencoder: Optional[AbsPreEncoder] = None,
        # postencoder: Optional[AbsPostEncoder] = None,
        use_1st_decoder_loss: bool = False,
        **kwargs,
    ):

        super().__init__()
		
        # import pdb;
        # pdb.set_trace()
		
        if frontend is not None:
            frontend_class = frontend_choices.get_class(frontend)
            frontend = frontend_class(**frontend_conf)
        if specaug is not None:
            specaug_class = specaug_choices.get_class(specaug)
            specaug = specaug_class(**specaug_conf)
        if normalize is not None:
            normalize_class = normalize_choices.get_class(normalize)
            normalize = normalize_class(**normalize_conf)
        encoder_class = encoder_choices.get_class(encoder)
        encoder = encoder_class(input_size=input_size, **encoder_conf)
        encoder_output_size = encoder.output_size()
        if decoder is not None:
            decoder_class = decoder_choices.get_class(decoder)
            decoder = decoder_class(
                vocab_size=vocab_size,
                encoder_output_size=encoder_output_size,
                **decoder_conf,
            )
        if ctc_weight > 0.0:
			
            if ctc_conf is None:
                ctc_conf = {}
			
            ctc = CTC(
                odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf
            )
        if predictor is not None:
            predictor_class = predictor_choices.get_class(predictor)
            predictor = predictor_class(**predictor_conf)
		
        # note that eos is the same as sos (equivalent ID)
        self.blank_id = blank_id
        self.sos = sos if sos is not None else vocab_size - 1
        self.eos = eos if eos is not None else vocab_size - 1
        self.vocab_size = vocab_size
        self.ignore_id = ignore_id
        self.ctc_weight = ctc_weight
        # self.token_list = token_list.copy()
        #
        self.frontend = frontend
        self.specaug = specaug
        self.normalize = normalize
        # self.preencoder = preencoder
        # self.postencoder = postencoder
        self.encoder = encoder
        #
        # if not hasattr(self.encoder, "interctc_use_conditioning"):
        #     self.encoder.interctc_use_conditioning = False
        # if self.encoder.interctc_use_conditioning:
        #     self.encoder.conditioning_layer = torch.nn.Linear(
        #         vocab_size, self.encoder.output_size()
        #     )
        #
        # self.error_calculator = None
        #
        if ctc_weight == 1.0:
            self.decoder = None
        else:
            self.decoder = decoder
		
        self.criterion_att = LabelSmoothingLoss(
            size=vocab_size,
            padding_idx=ignore_id,
            smoothing=lsm_weight,
            normalize_length=length_normalized_loss,
        )
        #
        # if report_cer or report_wer:
        #     self.error_calculator = ErrorCalculator(
        #         token_list, sym_space, sym_blank, report_cer, report_wer
        #     )
        #
        if ctc_weight == 0.0:
            self.ctc = None
        else:
            self.ctc = ctc
        #
        # self.extract_feats_in_collect_stats = extract_feats_in_collect_stats
        self.predictor = predictor
        self.predictor_weight = predictor_weight
        self.predictor_bias = predictor_bias
        self.sampling_ratio = sampling_ratio
        self.criterion_pre = mae_loss(normalize_length=length_normalized_loss)
        # self.step_cur = 0
        #
        self.share_embedding = share_embedding
        if self.share_embedding:
            self.decoder.embed = None
		
        self.use_1st_decoder_loss = use_1st_decoder_loss
        self.length_normalized_loss = length_normalized_loss
        self.beam_search = None
	
    def forward(
        self,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        text: torch.Tensor,
        text_lengths: torch.Tensor,
        **kwargs,
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
        """Encoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
            speech_lengths = speech_lengths[:, 0]
		
        batch_size = speech.shape[0]
		
		
        # Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)

		
        loss_ctc, cer_ctc = None, None
        loss_pre = None
        stats = dict()
		
        # decoder: CTC branch
        if self.ctc_weight != 0.0:
            loss_ctc, cer_ctc = self._calc_ctc_loss(
                encoder_out, encoder_out_lens, text, text_lengths
            )
			
            # Collect CTC branch stats
            stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
            stats["cer_ctc"] = cer_ctc
		

        # decoder: Attention decoder branch
        loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att = self._calc_att_loss(
            encoder_out, encoder_out_lens, text, text_lengths
        )
		
        # 3. CTC-Att loss definition
        if self.ctc_weight == 0.0:
            loss = loss_att + loss_pre * self.predictor_weight
        else:
            loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight
		
		
        # Collect Attn branch stats
        stats["loss_att"] = loss_att.detach() if loss_att is not None else None
        stats["pre_loss_att"] = pre_loss_att.detach() if pre_loss_att is not None else None
        stats["acc"] = acc_att
        stats["cer"] = cer_att
        stats["wer"] = wer_att
        stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
		
        stats["loss"] = torch.clone(loss.detach())
		
        # force_gatherable: to-device and to-tensor if scalar for DataParallel
        if self.length_normalized_loss:
            batch_size = (text_lengths + self.predictor_bias).sum()
        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
        return loss, stats, weight
	

    def encode(
        self, speech: torch.Tensor, speech_lengths: torch.Tensor, **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Frontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        """
        with autocast(False):

            # Data augmentation
            if self.specaug is not None and self.training:
                speech, speech_lengths = self.specaug(speech, speech_lengths)
			
            # Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
            if self.normalize is not None:
                speech, speech_lengths = self.normalize(speech, speech_lengths)
		

        # Forward encoder
        encoder_out, encoder_out_lens, _ = self.encoder(speech, speech_lengths)
        if isinstance(encoder_out, tuple):
            encoder_out = encoder_out[0]

        return encoder_out, encoder_out_lens
	
    def calc_predictor(self, encoder_out, encoder_out_lens):
		
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device)
        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor(encoder_out, None,
                                                                                       encoder_out_mask,
                                                                                       ignore_id=self.ignore_id)
        return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index
	
    def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens):
		
        decoder_outs = self.decoder(
            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens
        )
        decoder_out = decoder_outs[0]
        decoder_out = torch.log_softmax(decoder_out, dim=-1)
        return decoder_out, ys_pad_lens

    def _calc_att_loss(
        self,
        encoder_out: torch.Tensor,
        encoder_out_lens: torch.Tensor,
        ys_pad: torch.Tensor,
        ys_pad_lens: torch.Tensor,
    ):
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device)
        if self.predictor_bias == 1:
            _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
            ys_pad_lens = ys_pad_lens + self.predictor_bias
        pre_acoustic_embeds, pre_token_length, _, pre_peak_index = self.predictor(encoder_out, ys_pad, encoder_out_mask,
                                                                                  ignore_id=self.ignore_id)
		
        # 0. sampler
        decoder_out_1st = None
        pre_loss_att = None
        if self.sampling_ratio > 0.0:

            sematic_embeds, decoder_out_1st = self.sampler(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
                                                           pre_acoustic_embeds)
        else:
            sematic_embeds = pre_acoustic_embeds
		
        # 1. Forward decoder
        decoder_outs = self.decoder(
            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens
        )
        decoder_out, _ = decoder_outs[0], decoder_outs[1]
		
        if decoder_out_1st is None:
            decoder_out_1st = decoder_out
        # 2. Compute attention loss
        loss_att = self.criterion_att(decoder_out, ys_pad)
        acc_att = th_accuracy(
            decoder_out_1st.view(-1, self.vocab_size),
            ys_pad,
            ignore_label=self.ignore_id,
        )
        loss_pre = self.criterion_pre(ys_pad_lens.type_as(pre_token_length), pre_token_length)
		
        # Compute cer/wer using attention-decoder
        if self.training or self.error_calculator is None:
            cer_att, wer_att = None, None
        else:
            ys_hat = decoder_out_1st.argmax(dim=-1)
            cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
		
        return loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att
	
    def sampler(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds):
        """Mix target-token embeddings into the predictor's acoustic embeddings.

        A gradient-free decoder pass on ``pre_acoustic_embeds`` yields token
        predictions. For each utterance, a number of positions proportional to
        the count of wrongly predicted tokens (scaled by
        ``self.sampling_ratio``) is chosen at random and replaced by the
        embedding of the target token.

        Args:
            encoder_out: encoder output passed to the decoder.
            encoder_out_lens: encoder output lengths passed to the decoder.
            ys_pad: padded target token ids; padding is detected by comparing
                against ``self.ignore_id``.
            ys_pad_lens: target lengths.
            pre_acoustic_embeds: acoustic embeddings from the predictor.

        Returns:
            ``(sematic_embeds, decoder_out)``, both multiplied by the target
            mask; ``decoder_out`` comes from the gradient-free pass.
        """

        # True on valid target positions, with a trailing singleton dimension.
        tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
        # Zero the ids at padded positions before the embedding lookup.
        ys_pad_masked = ys_pad * tgt_mask[:, :, 0]
        if self.share_embedding:
            ys_pad_embed = self.decoder.output_layer.weight[ys_pad_masked]
        else:
            ys_pad_embed = self.decoder.embed(ys_pad_masked)
        with torch.no_grad():
            decoder_outs = self.decoder(
                encoder_out, encoder_out_lens, pre_acoustic_embeds, ys_pad_lens
            )
            decoder_out, _ = decoder_outs[0], decoder_outs[1]
            pred_tokens = decoder_out.argmax(-1)
            nonpad_positions = ys_pad.ne(self.ignore_id)
            seq_lens = (nonpad_positions).sum(1)
            # Number of correctly predicted non-pad tokens per utterance.
            same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1)
            # Starts all-True; True means "keep the acoustic embedding".
            input_mask = torch.ones_like(nonpad_positions)
            bsz, seq_len = ys_pad.size()
            for li in range(bsz):
                # Positions to replace = mismatched tokens * sampling_ratio.
                target_num = (((seq_lens[li] - same_num[li].sum()).float()) * self.sampling_ratio).long()
                if target_num > 0:
                    # Clear target_num random positions among the first seq_lens[li].
                    input_mask[li].scatter_(dim=0,
                                            index=torch.randperm(seq_lens[li])[:target_num].to(input_mask.device),
                                            value=0)
            input_mask = input_mask.eq(1)
            input_mask = input_mask.masked_fill(~nonpad_positions, False)
            input_mask_expand_dim = input_mask.unsqueeze(2).to(pre_acoustic_embeds.device)

        # Acoustic embedding where the mask is True, target embedding elsewhere.
        sematic_embeds = pre_acoustic_embeds.masked_fill(~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill(
            input_mask_expand_dim, 0)
        return sematic_embeds * tgt_mask, decoder_out * tgt_mask
		
    def _calc_ctc_loss(
        self,
        encoder_out: torch.Tensor,
        encoder_out_lens: torch.Tensor,
        ys_pad: torch.Tensor,
        ys_pad_lens: torch.Tensor,
    ):
        # Calc CTC loss
        loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)
		
        # Calc CER using CTC
        cer_ctc = None
        if not self.training and self.error_calculator is not None:
            ys_hat = self.ctc.argmax(encoder_out).data
            cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
        return loss_ctc, cer_ctc

	
    def init_beam_search(self,
                         **kwargs,
                         ):
        from funasr.models.paraformer.search import BeamSearchPara
        from funasr.modules.scorers.ctc import CTCPrefixScorer
        from funasr.modules.scorers.length_bonus import LengthBonus
	
        # 1. Build ASR model
        scorers = {}
		
        if self.ctc != None:
            ctc = CTCPrefixScorer(ctc=self.ctc, eos=self.eos)
            scorers.update(
                ctc=ctc
            )
        token_list = kwargs.get("token_list")
        scorers.update(
            length_bonus=LengthBonus(len(token_list)),
        )

		
        # 3. Build ngram model
        # ngram is not supported now
        ngram = None
        scorers["ngram"] = ngram
		
        weights = dict(
            decoder=1.0 - kwargs.get("decoding_ctc_weight"),
            ctc=kwargs.get("decoding_ctc_weight", 0.0),
            lm=kwargs.get("lm_weight", 0.0),
            ngram=kwargs.get("ngram_weight", 0.0),
            length_bonus=kwargs.get("penalty", 0.0),
        )
        beam_search = BeamSearchPara(
            beam_size=kwargs.get("beam_size", 2),
            weights=weights,
            scorers=scorers,
            sos=self.sos,
            eos=self.eos,
            vocab_size=len(token_list),
            token_list=token_list,
            pre_beam_score_key=None if self.ctc_weight == 1.0 else "full",
        )
        # beam_search.to(device=kwargs.get("device", "cpu"), dtype=getattr(torch, kwargs.get("dtype", "float32"))).eval()
        # for scorer in scorers.values():
        #     if isinstance(scorer, torch.nn.Module):
        #         scorer.to(device=kwargs.get("device", "cpu"), dtype=getattr(torch, kwargs.get("dtype", "float32"))).eval()
        self.beam_search = beam_search
		
    def generate(self,
             data_in: list,
             data_lengths: list=None,
             key: list=None,
             tokenizer=None,
             **kwargs,
             ):
		
        # init beamsearch
        is_use_ctc = kwargs.get("decoding_ctc_weight", 0.0) > 0.00001 and self.ctc != None
        is_use_lm = kwargs.get("lm_weight", 0.0) > 0.00001 and kwargs.get("lm_file", None) is not None
        if self.beam_search is None and (is_use_lm or is_use_ctc):
            logging.info("enable beam_search")
            self.init_beam_search(**kwargs)
            self.nbest = kwargs.get("nbest", 1)
		
        meta_data = {}
        # extract fbank feats
        time1 = time.perf_counter()
        audio_sample_list = load_audio(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
        time2 = time.perf_counter()
        meta_data["load_data"] = f"{time2 - time1:0.3f}"
        speech, speech_lengths = extract_fbank(audio_sample_list, date_type=kwargs.get("date_type", "sound"), frontend=self.frontend)
        time3 = time.perf_counter()
        meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
        meta_data["batch_data_time"] = speech_lengths.sum().item() * self.frontend.frame_shift * self.frontend.lfr_n / 1000
		
        speech.to(device=kwargs["device"]), speech_lengths.to(device=kwargs["device"])

        # Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
        if isinstance(encoder_out, tuple):
            encoder_out = encoder_out[0]
		
        # predictor
        predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens)
        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
                                                                        predictor_outs[2], predictor_outs[3]
        pre_token_length = pre_token_length.round().long()
        if torch.max(pre_token_length) < 1:
            return []
        decoder_outs = self.cal_decoder_with_predictor(encoder_out, encoder_out_lens, pre_acoustic_embeds,
                                                                 pre_token_length)
        decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]


        results = []
        b, n, d = decoder_out.size()
        for i in range(b):
            x = encoder_out[i, :encoder_out_lens[i], :]
            am_scores = decoder_out[i, :pre_token_length[i], :]
            if self.beam_search is not None:
                nbest_hyps = self.beam_search(
                    x=x, am_scores=am_scores, maxlenratio=kwargs.get("maxlenratio", 0.0), minlenratio=kwargs.get("minlenratio", 0.0)
                )
				
                nbest_hyps = nbest_hyps[: self.nbest]
            else:

                yseq = am_scores.argmax(dim=-1)
                score = am_scores.max(dim=-1)[0]
                score = torch.sum(score, dim=-1)
                # pad with mask tokens to ensure compatibility with sos/eos tokens
                yseq = torch.tensor(
                    [self.sos] + yseq.tolist() + [self.eos], device=yseq.device
                )
                nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
            for nbest_idx, hyp in enumerate(nbest_hyps):
                ibest_writer = None
                if ibest_writer is None and kwargs.get("output_dir") is not None:
                    writer = DatadirWriter(kwargs.get("output_dir"))
                    ibest_writer = writer[f"{nbest_idx+1}best_recog"]
                # remove sos/eos and get results
                last_pos = -1
                if isinstance(hyp.yseq, list):
                    token_int = hyp.yseq[1:last_pos]
                else:
                    token_int = hyp.yseq[1:last_pos].tolist()
					
                # remove blank symbol id, which is assumed to be 0
                token_int = list(filter(lambda x: x != self.eos and x != self.sos and x != self.blank_id, token_int))
				
                # Change integer-ids to tokens
                token = tokenizer.ids2tokens(token_int)
                text = tokenizer.tokens2text(token)
				
                text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
                result_i = {"key": key[i], "token": token, "text": text, "text_postprocessed": text_postprocessed}
                results.append(result_i)
				
                if ibest_writer is not None:
                    ibest_writer["token"][key[i]] = " ".join(token)
                    ibest_writer["text"][key[i]] = text
                    ibest_writer["text_postprocessed"][key[i]] = text_postprocessed
		
        return results, meta_data



class BiCifParaformer(Paraformer):
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317

    Paraformer subclass that requires a ``CifPredictorV3`` predictor. It adds
    a second token-length loss to training and produces per-token timestamps
    during ``generate``.
    """

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        """Initialize the parent model and check the predictor type."""
        super().__init__(*args, **kwargs)
        assert isinstance(self.predictor, CifPredictorV3), "BiCifParaformer should use CIFPredictorV3"

    def _calc_pre2_loss(
        self,
        encoder_out: torch.Tensor,
        encoder_out_lens: torch.Tensor,
        ys_pad: torch.Tensor,
        ys_pad_lens: torch.Tensor,
    ):
        """Compute the length loss on the predictor's fifth output.

        Args:
            encoder_out: encoder output.
            encoder_out_lens: valid lengths of ``encoder_out``.
            ys_pad: padded target token ids.
            ys_pad_lens: target lengths.

        Returns:
            ``loss_pre2``: ``criterion_pre`` between the (bias-adjusted)
            target lengths and the predictor's second length estimate.
        """
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device)
        if self.predictor_bias == 1:
            _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
            ys_pad_lens = ys_pad_lens + self.predictor_bias
        _, _, _, _, pre_token_length2 = self.predictor(encoder_out, ys_pad, encoder_out_mask, ignore_id=self.ignore_id)

        loss_pre2 = self.criterion_pre(ys_pad_lens.type_as(pre_token_length2), pre_token_length2)

        return loss_pre2

    def _calc_att_loss(
        self,
        encoder_out: torch.Tensor,
        encoder_out_lens: torch.Tensor,
        ys_pad: torch.Tensor,
        ys_pad_lens: torch.Tensor,
    ):
        """Compute the decoder loss, accuracy and first predictor length loss.

        Args:
            encoder_out: encoder output.
            encoder_out_lens: valid lengths of ``encoder_out``.
            ys_pad: padded target token ids.
            ys_pad_lens: target lengths.

        Returns:
            ``(loss_att, acc_att, cer_att, wer_att, loss_pre)``.
            ``cer_att``/``wer_att`` are None while training or when no error
            calculator is configured.
        """
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device)
        if self.predictor_bias == 1:
            _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
            ys_pad_lens = ys_pad_lens + self.predictor_bias
        # The predictor yields five values here; the last three are unused.
        pre_acoustic_embeds, pre_token_length, _, pre_peak_index, _ = self.predictor(encoder_out, ys_pad,
                                                                                     encoder_out_mask,
                                                                                     ignore_id=self.ignore_id)

        # 0. sampler
        decoder_out_1st = None
        if self.sampling_ratio > 0.0:
            sematic_embeds, decoder_out_1st = self.sampler(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
                                                           pre_acoustic_embeds)
        else:
            sematic_embeds = pre_acoustic_embeds

        # 1. Forward decoder
        decoder_outs = self.decoder(
            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens
        )
        decoder_out, _ = decoder_outs[0], decoder_outs[1]

        # Accuracy and CER/WER use the sampler's first-pass output when present.
        if decoder_out_1st is None:
            decoder_out_1st = decoder_out
        # 2. Compute attention loss
        loss_att = self.criterion_att(decoder_out, ys_pad)
        acc_att = th_accuracy(
            decoder_out_1st.view(-1, self.vocab_size),
            ys_pad,
            ignore_label=self.ignore_id,
        )
        loss_pre = self.criterion_pre(ys_pad_lens.type_as(pre_token_length), pre_token_length)

        # Compute cer/wer using attention-decoder
        if self.training or self.error_calculator is None:
            cer_att, wer_att = None, None
        else:
            ys_hat = decoder_out_1st.argmax(dim=-1)
            cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())

        return loss_att, acc_att, cer_att, wer_att, loss_pre

    def calc_predictor(self, encoder_out, encoder_out_lens):
        """Run the predictor without targets and return its first four outputs.

        Args:
            encoder_out: encoder output.
            encoder_out_lens: valid lengths of ``encoder_out``.

        Returns:
            ``(pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index)``;
            the predictor's fifth output is dropped.
        """
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device)
        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index, pre_token_length2 = self.predictor(encoder_out,
                                                                                                          None,
                                                                                                          encoder_out_mask,
                                                                                                          ignore_id=self.ignore_id)
        return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index

    def calc_predictor_timestamp(self, encoder_out, encoder_out_lens, token_num):
        """Query the predictor for the values used in timestamp prediction.

        Args:
            encoder_out: encoder output.
            encoder_out_lens: valid lengths of ``encoder_out``.
            token_num: token counts passed to ``get_upsample_timestamp``.

        Returns:
            ``(ds_alphas, ds_cif_peak, us_alphas, us_peaks)`` as produced by
            ``self.predictor.get_upsample_timestamp``.
        """
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device)
        ds_alphas, ds_cif_peak, us_alphas, us_peaks = self.predictor.get_upsample_timestamp(encoder_out,
                                                                                            encoder_out_mask,
                                                                                            token_num)
        return ds_alphas, ds_cif_peak, us_alphas, us_peaks

    def forward(
        self,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        text: torch.Tensor,
        text_lengths: torch.Tensor,
        **kwargs,
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
        """Frontend + Encoder + Decoder + Calc loss

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
            text: (Batch, Length)
            text_lengths: (Batch,)

        Returns:
            ``(loss, stats, weight)`` after ``force_gatherable``.
        """
        # Length tensors with an extra dimension are reduced to their first column.
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
            speech_lengths = speech_lengths[:, 0]

        batch_size = speech.shape[0]

        # Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)

        loss_ctc, cer_ctc = None, None
        loss_pre = None
        stats = dict()

        # decoder: CTC branch
        if self.ctc_weight != 0.0:
            loss_ctc, cer_ctc = self._calc_ctc_loss(
                encoder_out, encoder_out_lens, text, text_lengths
            )

            # Collect CTC branch stats
            stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
            stats["cer_ctc"] = cer_ctc

        # decoder: Attention decoder branch
        loss_att, acc_att, cer_att, wer_att, loss_pre = self._calc_att_loss(
            encoder_out, encoder_out_lens, text, text_lengths
        )

        loss_pre2 = self._calc_pre2_loss(
            encoder_out, encoder_out_lens, text, text_lengths
        )

        # 3. CTC-Att loss definition; the second length loss gets half the
        # predictor weight.
        if self.ctc_weight == 0.0:
            loss = loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
        else:
            loss = self.ctc_weight * loss_ctc + (
                1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5

        # Collect Attn branch stats
        stats["loss_att"] = loss_att.detach() if loss_att is not None else None
        stats["acc"] = acc_att
        stats["cer"] = cer_att
        stats["wer"] = wer_att
        stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
        stats["loss_pre2"] = loss_pre2.detach().cpu()

        stats["loss"] = torch.clone(loss.detach())

        # force_gatherable: to-device and to-tensor if scalar for DataParallel
        if self.length_normalized_loss:
            batch_size = int((text_lengths + self.predictor_bias).sum())

        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
        return loss, stats, weight

    def generate(self,
                 data_in: list,
                 data_lengths: list = None,
                 key: list = None,
                 tokenizer=None,
                 **kwargs,
                 ):
        """Decode a batch of audio inputs into tokens, text and timestamps.

        Args:
            data_in: audio inputs, handed to ``load_audio``.
            data_lengths: not used by this method.
            key: per-utterance identifiers; ``key[i]`` labels the i-th result.
            tokenizer: object providing ``ids2tokens`` and ``tokens2text``.
            **kwargs: decoding options. ``device`` is required. Optional:
                ``fs``, ``date_type``, ``decoding_ctc_weight``, ``lm_weight``,
                ``lm_file``, ``nbest``, ``maxlenratio``, ``minlenratio``,
                ``begin_time``, ``output_dir``.

        Returns:
            ``(results, meta_data)``: ``results`` is a list with one dict per
            hypothesis (keys ``key``, ``token``, ``text``,
            ``text_postprocessed``, ``time_stamp_postprocessed``,
            ``word_lists``); ``meta_data`` holds ``load_data``,
            ``extract_feat`` and ``batch_data_time``. When no utterance is
            predicted to contain at least one token, a bare ``[]`` is returned.
        """
        # Beam search is only set up when CTC or LM scores are to be fused in.
        is_use_ctc = kwargs.get("decoding_ctc_weight", 0.0) > 0.00001 and self.ctc is not None
        is_use_lm = kwargs.get("lm_weight", 0.0) > 0.00001 and kwargs.get("lm_file", None) is not None
        if self.beam_search is None and (is_use_lm or is_use_ctc):
            logging.info("enable beam_search")
            self.init_beam_search(**kwargs)
            self.nbest = kwargs.get("nbest", 1)

        meta_data = {}
        # extract fbank feats
        time1 = time.perf_counter()
        audio_sample_list = load_audio(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
        time2 = time.perf_counter()
        meta_data["load_data"] = f"{time2 - time1:0.3f}"
        speech, speech_lengths = extract_fbank(audio_sample_list, date_type=kwargs.get("date_type", "sound"),
                                               frontend=self.frontend)
        time3 = time.perf_counter()
        meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
        meta_data[
            "batch_data_time"] = speech_lengths.sum().item() * self.frontend.frame_shift * self.frontend.lfr_n / 1000

        # Tensor.to() returns a new tensor, so the results must be re-bound;
        # previously they were discarded and the inputs never left their device.
        device = kwargs["device"]
        speech = speech.to(device=device)
        speech_lengths = speech_lengths.to(device=device)

        # Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
        if isinstance(encoder_out, tuple):
            encoder_out = encoder_out[0]

        # predictor
        predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens)
        pre_acoustic_embeds, pre_token_length = predictor_outs[0], predictor_outs[1]
        pre_token_length = pre_token_length.round().long()
        if torch.max(pre_token_length) < 1:
            # NOTE(review): this returns a bare list while the normal path
            # returns (results, meta_data); kept as-is because callers are not
            # visible from here - confirm which shape they expect.
            return []
        decoder_outs = self.cal_decoder_with_predictor(encoder_out, encoder_out_lens, pre_acoustic_embeds,
                                                       pre_token_length)
        decoder_out = decoder_outs[0]

        # BiCifParaformer, test no bias cif2
        _, _, us_alphas, us_peaks = self.calc_predictor_timestamp(encoder_out, encoder_out_lens,
                                                                  pre_token_length)

        # One writer per call. It used to be rebuilt for every hypothesis
        # because the handle was reset to None inside the loop.
        # NOTE(review): presumably a fresh DatadirWriter reopens its output
        # files - confirm against its implementation.
        output_dir = kwargs.get("output_dir")
        writer = DatadirWriter(output_dir) if output_dir is not None else None

        results = []
        for i in range(decoder_out.size(0)):
            x = encoder_out[i, :encoder_out_lens[i], :]
            am_scores = decoder_out[i, :pre_token_length[i], :]
            if self.beam_search is not None:
                nbest_hyps = self.beam_search(
                    x=x, am_scores=am_scores, maxlenratio=kwargs.get("maxlenratio", 0.0),
                    minlenratio=kwargs.get("minlenratio", 0.0)
                )
                nbest_hyps = nbest_hyps[: self.nbest]
            else:
                # Greedy decoding: best token per position, score is the sum
                # of the per-position maxima.
                yseq = am_scores.argmax(dim=-1)
                score = am_scores.max(dim=-1)[0]
                score = torch.sum(score, dim=-1)
                # pad with mask tokens to ensure compatibility with sos/eos tokens
                yseq = torch.tensor(
                    [self.sos] + yseq.tolist() + [self.eos], device=yseq.device
                )
                nbest_hyps = [Hypothesis(yseq=yseq, score=score)]

            for nbest_idx, hyp in enumerate(nbest_hyps):
                ibest_writer = writer[f"{nbest_idx + 1}best_recog"] if writer is not None else None

                # remove sos/eos and get results
                last_pos = -1
                if isinstance(hyp.yseq, list):
                    token_int = hyp.yseq[1:last_pos]
                else:
                    token_int = hyp.yseq[1:last_pos].tolist()

                # Drop any remaining sos/eos/blank ids.
                special_ids = (self.eos, self.sos, self.blank_id)
                token_int = [tok for tok in token_int if tok not in special_ids]

                # Change integer-ids to tokens
                token = tokenizer.ids2tokens(token_int)
                text = tokenizer.tokens2text(token)

                # The upsampled sequences are cut to 3x the encoder length
                # (presumably the predictor's upsampling factor - TODO confirm).
                _, timestamp = ts_prediction_lfr6_standard(us_alphas[i][:encoder_out_lens[i] * 3],
                                                           us_peaks[i][:encoder_out_lens[i] * 3],
                                                           copy.copy(token),
                                                           vad_offset=kwargs.get("begin_time", 0))

                text_postprocessed, time_stamp_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token, timestamp)

                result_i = {"key": key[i], "token": token, "text": text, "text_postprocessed": text_postprocessed,
                            "time_stamp_postprocessed": time_stamp_postprocessed,
                            "word_lists": word_lists
                            }
                results.append(result_i)

                if ibest_writer is not None:
                    ibest_writer["token"][key[i]] = " ".join(token)
                    ibest_writer["text"][key[i]] = text
                    ibest_writer["text_postprocessed"][key[i]] = text_postprocessed

        return results, meta_data


class NeatContextualParaformer(Paraformer):
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    """
	
    def __init__(
        self,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
		
        self.target_buffer_length = kwargs.get("target_buffer_length", -1)
        inner_dim = kwargs.get("inner_dim", 256)
        bias_encoder_type = kwargs.get("bias_encoder_type", "lstm")
        use_decoder_embedding = kwargs.get("use_decoder_embedding", False)
        crit_attn_weight = kwargs.get("crit_attn_weight", 0.0)
        crit_attn_smooth = kwargs.get("crit_attn_smooth", 0.0)
        bias_encoder_dropout_rate = kwargs.get("bias_encoder_dropout_rate", 0.0)


        if bias_encoder_type == 'lstm':
            logging.warning("enable bias encoder sampling and contextual training")
            self.bias_encoder = torch.nn.LSTM(inner_dim, inner_dim, 1, batch_first=True, dropout=bias_encoder_dropout_rate)
            self.bias_embed = torch.nn.Embedding(self.vocab_size, inner_dim)
        elif bias_encoder_type == 'mean':
            logging.warning("enable bias encoder sampling and contextual training")
            self.bias_embed = torch.nn.Embedding(self.vocab_size, inner_dim)
        else:
            logging.error("Unsupport bias encoder type: {}".format(bias_encoder_type))
		
        if self.target_buffer_length > 0:
            self.hotword_buffer = None
            self.length_record = []
            self.current_buffer_length = 0
        self.use_decoder_embedding = use_decoder_embedding
        self.crit_attn_weight = crit_attn_weight
        if self.crit_attn_weight > 0:
            self.attn_loss = torch.nn.L1Loss()
        self.crit_attn_smooth = crit_attn_smooth


    def forward(
        self,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        text: torch.Tensor,
        text_lengths: torch.Tensor,
        hotword_pad: torch.Tensor,
        hotword_lengths: torch.Tensor,
        dha_pad: torch.Tensor,
        **kwargs,
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
        """Frontend + Encoder + Decoder + Calc loss
	
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
            speech_lengths = speech_lengths[:, 0]
		
        batch_size = speech.shape[0]

        # 1. Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)

		
        loss_ctc, cer_ctc = None, None
		
        stats = dict()
		
        # 1. CTC branch
        if self.ctc_weight != 0.0:
            loss_ctc, cer_ctc = self._calc_ctc_loss(
                encoder_out, encoder_out_lens, text, text_lengths
            )
			
            # Collect CTC branch stats
            stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
            stats["cer_ctc"] = cer_ctc
		

        # 2b. Attention decoder branch
        loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss(
            encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths
        )
		
        # 3. CTC-Att loss definition
        if self.ctc_weight == 0.0:
            loss = loss_att + loss_pre * self.predictor_weight
        else:
            loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight
		
        if loss_ideal is not None:
            loss = loss + loss_ideal * self.crit_attn_weight
            stats["loss_ideal"] = loss_ideal.detach().cpu()
		
        # Collect Attn branch stats
        stats["loss_att"] = loss_att.detach() if loss_att is not None else None
        stats["acc"] = acc_att
        stats["cer"] = cer_att
        stats["wer"] = wer_att
        stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
		
        stats["loss"] = torch.clone(loss.detach())
        # force_gatherable: to-device and to-tensor if scalar for DataParallel
        if self.length_normalized_loss:
            batch_size = int((text_lengths + self.predictor_bias).sum())
		
        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
        return loss, stats, weight
	
	
    def _calc_att_clas_loss(
        self,
        encoder_out: torch.Tensor,
        encoder_out_lens: torch.Tensor,
        ys_pad: torch.Tensor,
        ys_pad_lens: torch.Tensor,
        hotword_pad: torch.Tensor,
        hotword_lengths: torch.Tensor,
    ):
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device)
        if self.predictor_bias == 1:
            _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
            ys_pad_lens = ys_pad_lens + self.predictor_bias
        pre_acoustic_embeds, pre_token_length, _, _ = self.predictor(encoder_out, ys_pad, encoder_out_mask,
                                                                     ignore_id=self.ignore_id)
		
        # -1. bias encoder
        if self.use_decoder_embedding:
            hw_embed = self.decoder.embed(hotword_pad)
        else:
            hw_embed = self.bias_embed(hotword_pad)
        hw_embed, (_, _) = self.bias_encoder(hw_embed)
        _ind = np.arange(0, hotword_pad.shape[0]).tolist()
        selected = hw_embed[_ind, [i - 1 for i in hotword_lengths.detach().cpu().tolist()]]
        contextual_info = selected.squeeze(0).repeat(ys_pad.shape[0], 1, 1).to(ys_pad.device)
		
        # 0. sampler
        decoder_out_1st = None
        if self.sampling_ratio > 0.0:
            if self.step_cur < 2:
                logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
            sematic_embeds, decoder_out_1st = self.sampler(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
                                                           pre_acoustic_embeds, contextual_info)
        else:
            if self.step_cur < 2:
                logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
            sematic_embeds = pre_acoustic_embeds
		
        # 1. Forward decoder
        decoder_outs = self.decoder(
            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info
        )
        decoder_out, _ = decoder_outs[0], decoder_outs[1]
        '''
        if self.crit_attn_weight > 0 and attn.shape[-1] > 1:
            ideal_attn = ideal_attn + self.crit_attn_smooth / (self.crit_attn_smooth + 1.0)
            attn_non_blank = attn[:,:,:,:-1]
            ideal_attn_non_blank = ideal_attn[:,:,:-1]
            loss_ideal = self.attn_loss(attn_non_blank.max(1)[0], ideal_attn_non_blank.to(attn.device))
        else:
            loss_ideal = None
        '''
        loss_ideal = None
		
        if decoder_out_1st is None:
            decoder_out_1st = decoder_out
        # 2. Compute attention loss
        loss_att = self.criterion_att(decoder_out, ys_pad)
        acc_att = th_accuracy(
            decoder_out_1st.view(-1, self.vocab_size),
            ys_pad,
            ignore_label=self.ignore_id,
        )
        loss_pre = self.criterion_pre(ys_pad_lens.type_as(pre_token_length), pre_token_length)
		
        # Compute cer/wer using attention-decoder
        if self.training or self.error_calculator is None:
            cer_att, wer_att = None, None
        else:
            ys_hat = decoder_out_1st.argmax(dim=-1)
            cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
		
        return loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal
	
	
    def sampler(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds, contextual_info):
        tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
        ys_pad = ys_pad * tgt_mask[:, :, 0]
        if self.share_embedding:
            ys_pad_embed = self.decoder.output_layer.weight[ys_pad]
        else:
            ys_pad_embed = self.decoder.embed(ys_pad)
        with torch.no_grad():
            decoder_outs = self.decoder(
                encoder_out, encoder_out_lens, pre_acoustic_embeds, ys_pad_lens, contextual_info=contextual_info
            )
            decoder_out, _ = decoder_outs[0], decoder_outs[1]
            pred_tokens = decoder_out.argmax(-1)
            nonpad_positions = ys_pad.ne(self.ignore_id)
            seq_lens = (nonpad_positions).sum(1)
            same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1)
            input_mask = torch.ones_like(nonpad_positions)
            bsz, seq_len = ys_pad.size()
            for li in range(bsz):
                target_num = (((seq_lens[li] - same_num[li].sum()).float()) * self.sampling_ratio).long()
                if target_num > 0:
                    input_mask[li].scatter_(dim=0,
                                            index=torch.randperm(seq_lens[li])[:target_num].to(pre_acoustic_embeds.device),
                                            value=0)
            input_mask = input_mask.eq(1)
            input_mask = input_mask.masked_fill(~nonpad_positions, False)
            input_mask_expand_dim = input_mask.unsqueeze(2).to(pre_acoustic_embeds.device)
		
        sematic_embeds = pre_acoustic_embeds.masked_fill(~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill(
            input_mask_expand_dim, 0)
        return sematic_embeds * tgt_mask, decoder_out * tgt_mask
	
	
    def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, hw_list=None,
                                   clas_scale=1.0):
        if hw_list is None:
            hw_list = [torch.Tensor([1]).long().to(encoder_out.device)]  # empty hotword list
            hw_list_pad = pad_list(hw_list, 0)
            if self.use_decoder_embedding:
                hw_embed = self.decoder.embed(hw_list_pad)
            else:
                hw_embed = self.bias_embed(hw_list_pad)
            hw_embed, (h_n, _) = self.bias_encoder(hw_embed)
            hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1)
        else:
            hw_lengths = [len(i) for i in hw_list]
            hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)
            if self.use_decoder_embedding:
                hw_embed = self.decoder.embed(hw_list_pad)
            else:
                hw_embed = self.bias_embed(hw_list_pad)
            hw_embed = torch.nn.utils.rnn.pack_padded_sequence(hw_embed, hw_lengths, batch_first=True,
                                                               enforce_sorted=False)
            _, (h_n, _) = self.bias_encoder(hw_embed)
            hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1)
		
        decoder_outs = self.decoder(
            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=hw_embed, clas_scale=clas_scale
        )
        decoder_out = decoder_outs[0]
        decoder_out = torch.log_softmax(decoder_out, dim=-1)
        return decoder_out, ys_pad_lens
		
    def generate(self,
                 data_in: list,
                 data_lengths: list = None,
                 key: list = None,
                 tokenizer=None,
                 **kwargs,
                 ):
		
        # init beamsearch
        is_use_ctc = kwargs.get("decoding_ctc_weight", 0.0) > 0.00001 and self.ctc != None
        is_use_lm = kwargs.get("lm_weight", 0.0) > 0.00001 and kwargs.get("lm_file", None) is not None
        if self.beam_search is None and (is_use_lm or is_use_ctc):
            logging.info("enable beam_search")
            self.init_beam_search(**kwargs)
            self.nbest = kwargs.get("nbest", 1)
		
        meta_data = {}
		
        # extract fbank feats
        time1 = time.perf_counter()
        audio_sample_list = load_audio(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
        time2 = time.perf_counter()
        meta_data["load_data"] = f"{time2 - time1:0.3f}"
        speech, speech_lengths = extract_fbank(audio_sample_list, date_type=kwargs.get("date_type", "sound"),
                                               frontend=self.frontend)
        time3 = time.perf_counter()
        meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
        meta_data[
            "batch_data_time"] = speech_lengths.sum().item() * self.frontend.frame_shift * self.frontend.lfr_n / 1000
		
        speech.to(device=kwargs["device"]), speech_lengths.to(device=kwargs["device"])

        # hotword
        self.hotword_list = self.generate_hotwords_list(kwargs.get("hotword", None), tokenizer=tokenizer)
		
        # Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
        if isinstance(encoder_out, tuple):
            encoder_out = encoder_out[0]
		
        # predictor
        predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens)
        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
                                                                        predictor_outs[2], predictor_outs[3]
        pre_token_length = pre_token_length.round().long()
        if torch.max(pre_token_length) < 1:
            return []


        decoder_outs = self.cal_decoder_with_predictor(encoder_out, encoder_out_lens,
                                                                 pre_acoustic_embeds,
                                                                 pre_token_length,
                                                                 hw_list=self.hotword_list,
                                                                 clas_scale=kwargs.get("clas_scale", 1.0))
        decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
		
        results = []
        b, n, d = decoder_out.size()
        for i in range(b):
            x = encoder_out[i, :encoder_out_lens[i], :]
            am_scores = decoder_out[i, :pre_token_length[i], :]
            if self.beam_search is not None:
                nbest_hyps = self.beam_search(
                    x=x, am_scores=am_scores, maxlenratio=kwargs.get("maxlenratio", 0.0),
                    minlenratio=kwargs.get("minlenratio", 0.0)
                )
				
                nbest_hyps = nbest_hyps[: self.nbest]
            else:
				
                yseq = am_scores.argmax(dim=-1)
                score = am_scores.max(dim=-1)[0]
                score = torch.sum(score, dim=-1)
                # pad with mask tokens to ensure compatibility with sos/eos tokens
                yseq = torch.tensor(
                    [self.sos] + yseq.tolist() + [self.eos], device=yseq.device
                )
                nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
            for nbest_idx, hyp in enumerate(nbest_hyps):
                ibest_writer = None
                if ibest_writer is None and kwargs.get("output_dir") is not None:
                    writer = DatadirWriter(kwargs.get("output_dir"))
                    ibest_writer = writer[f"{nbest_idx + 1}best_recog"]
                # remove sos/eos and get results
                last_pos = -1
                if isinstance(hyp.yseq, list):
                    token_int = hyp.yseq[1:last_pos]
                else:
                    token_int = hyp.yseq[1:last_pos].tolist()
				
                # remove blank symbol id, which is assumed to be 0
                token_int = list(
                    filter(lambda x: x != self.eos and x != self.sos and x != self.blank_id, token_int))
				
                # Change integer-ids to tokens
                token = tokenizer.ids2tokens(token_int)
                text = tokenizer.tokens2text(token)
				
                text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
                result_i = {"key": key[i], "token": token, "text": text, "text_postprocessed": text_postprocessed}
                results.append(result_i)
				
                if ibest_writer is not None:
                    ibest_writer["token"][key[i]] = " ".join(token)
                    ibest_writer["text"][key[i]] = text
                    ibest_writer["text_postprocessed"][key[i]] = text_postprocessed
		
        return results, meta_data


    def generate_hotwords_list(self, hotword_list_or_file, tokenizer=None):
        def load_seg_dict(seg_dict_file):
            seg_dict = {}
            assert isinstance(seg_dict_file, str)
            with open(seg_dict_file, "r", encoding="utf8") as f:
                lines = f.readlines()
                for line in lines:
                    s = line.strip().split()
                    key = s[0]
                    value = s[1:]
                    seg_dict[key] = " ".join(value)
            return seg_dict
		
        def seg_tokenize(txt, seg_dict):
            pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$')
            out_txt = ""
            for word in txt:
                word = word.lower()
                if word in seg_dict:
                    out_txt += seg_dict[word] + " "
                else:
                    if pattern.match(word):
                        for char in word:
                            if char in seg_dict:
                                out_txt += seg_dict[char] + " "
                            else:
                                out_txt += "<unk>" + " "
                    else:
                        out_txt += "<unk>" + " "
            return out_txt.strip().split()
		
        seg_dict = None
        if self.frontend.cmvn_file is not None:
            model_dir = os.path.dirname(self.frontend.cmvn_file)
            seg_dict_file = os.path.join(model_dir, 'seg_dict')
            if os.path.exists(seg_dict_file):
                seg_dict = load_seg_dict(seg_dict_file)
            else:
                seg_dict = None
        # for None
        if hotword_list_or_file is None:
            hotword_list = None
        # for local txt inputs
        elif os.path.exists(hotword_list_or_file) and hotword_list_or_file.endswith('.txt'):
            logging.info("Attempting to parse hotwords from local txt...")
            hotword_list = []
            hotword_str_list = []
            with codecs.open(hotword_list_or_file, 'r') as fin:
                for line in fin.readlines():
                    hw = line.strip()
                    hw_list = hw.split()
                    if seg_dict is not None:
                        hw_list = seg_tokenize(hw_list, seg_dict)
                    hotword_str_list.append(hw)
                    hotword_list.append(tokenizer.tokens2ids(hw_list))
                hotword_list.append([self.sos])
                hotword_str_list.append('<s>')
            logging.info("Initialized hotword list from file: {}, hotword list: {}."
                         .format(hotword_list_or_file, hotword_str_list))
        # for url, download and generate txt
        elif hotword_list_or_file.startswith('http'):
            logging.info("Attempting to parse hotwords from url...")
            work_dir = tempfile.TemporaryDirectory().name
            if not os.path.exists(work_dir):
                os.makedirs(work_dir)
            text_file_path = os.path.join(work_dir, os.path.basename(hotword_list_or_file))
            local_file = requests.get(hotword_list_or_file)
            open(text_file_path, "wb").write(local_file.content)
            hotword_list_or_file = text_file_path
            hotword_list = []
            hotword_str_list = []
            with codecs.open(hotword_list_or_file, 'r') as fin:
                for line in fin.readlines():
                    hw = line.strip()
                    hw_list = hw.split()
                    if seg_dict is not None:
                        hw_list = seg_tokenize(hw_list, seg_dict)
                    hotword_str_list.append(hw)
                    hotword_list.append(tokenizer.tokens2ids(hw_list))
                hotword_list.append([self.sos])
                hotword_str_list.append('<s>')
            logging.info("Initialized hotword list from file: {}, hotword list: {}."
                         .format(hotword_list_or_file, hotword_str_list))
        # for text str input
        elif not hotword_list_or_file.endswith('.txt'):
            logging.info("Attempting to parse hotwords as str...")
            hotword_list = []
            hotword_str_list = []
            for hw in hotword_list_or_file.strip().split():
                hotword_str_list.append(hw)
                hw_list = hw.strip().split()
                if seg_dict is not None:
                    hw_list = seg_tokenize(hw_list, seg_dict)
                hotword_list.append(tokenizer.tokens2ids(hw_list))
            hotword_list.append([self.sos])
            hotword_str_list.append('<s>')
            logging.info("Hotword list: {}.".format(hotword_str_list))
        else:
            hotword_list = None
        return hotword_list


class ParaformerOnline(Paraformer):
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
    https://arxiv.org/abs/2206.08317
    """
	
    def __init__(
        self,
        *args,
        **kwargs,
    ):
		
        super().__init__(*args, **kwargs)
		
        # import pdb;
        # pdb.set_trace()
        self.sampling_ratio = kwargs.get("sampling_ratio", 0.2)


        self.scama_mask = None
        if hasattr(self.encoder, "overlap_chunk_cls") and self.encoder.overlap_chunk_cls is not None:
            from funasr.modules.streaming_utils.chunk_utilis import build_scama_mask_for_cross_attention_decoder
            self.build_scama_mask_for_cross_attention_decoder_fn = build_scama_mask_for_cross_attention_decoder
            self.decoder_attention_chunk_type = kwargs.get("decoder_attention_chunk_type", "chunk")


	
    def forward(
        self,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        text: torch.Tensor,
        text_lengths: torch.Tensor,
        **kwargs,
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
        """Encoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        """
        # import pdb;
        # pdb.set_trace()
        decoding_ind = kwargs.get("decoding_ind")
        if len(text_lengths.size()) > 1:
            text_lengths = text_lengths[:, 0]
        if len(speech_lengths.size()) > 1:
            speech_lengths = speech_lengths[:, 0]
		
        batch_size = speech.shape[0]
		
        # Encoder
        if hasattr(self.encoder, "overlap_chunk_cls"):
            ind = self.encoder.overlap_chunk_cls.random_choice(self.training, decoding_ind)
            encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind)
        else:
            encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
		
        loss_ctc, cer_ctc = None, None
        loss_pre = None
        stats = dict()
		
        # decoder: CTC branch

        if self.ctc_weight > 0.0:
            if hasattr(self.encoder, "overlap_chunk_cls"):
                encoder_out_ctc, encoder_out_lens_ctc = self.encoder.overlap_chunk_cls.remove_chunk(encoder_out,
                                                                                                    encoder_out_lens,
                                                                                                    chunk_outs=None)
            else:
                encoder_out_ctc, encoder_out_lens_ctc = encoder_out, encoder_out_lens
				
            loss_ctc, cer_ctc = self._calc_ctc_loss(
                encoder_out_ctc, encoder_out_lens_ctc, text, text_lengths
            )
            # Collect CTC branch stats
            stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
            stats["cer_ctc"] = cer_ctc
		
        # decoder: Attention decoder branch
        loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att = self._calc_att_predictor_loss(
            encoder_out, encoder_out_lens, text, text_lengths
        )
		
        # 3. CTC-Att loss definition
        if self.ctc_weight == 0.0:
            loss = loss_att + loss_pre * self.predictor_weight
        else:
            loss = self.ctc_weight * loss_ctc + (
                    1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight
		
        # Collect Attn branch stats
        stats["loss_att"] = loss_att.detach() if loss_att is not None else None
        stats["pre_loss_att"] = pre_loss_att.detach() if pre_loss_att is not None else None
        stats["acc"] = acc_att
        stats["cer"] = cer_att
        stats["wer"] = wer_att
        stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
		
        stats["loss"] = torch.clone(loss.detach())
		
        # force_gatherable: to-device and to-tensor if scalar for DataParallel
        if self.length_normalized_loss:
            batch_size = (text_lengths + self.predictor_bias).sum()
        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
        return loss, stats, weight
	
    def encode_chunk(
        self, speech: torch.Tensor, speech_lengths: torch.Tensor, cache: dict = None, **kwargs,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Frontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        """
        with autocast(False):
			
            # Data augmentation
            if self.specaug is not None and self.training:
                speech, speech_lengths = self.specaug(speech, speech_lengths)
			
            # Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
            if self.normalize is not None:
                speech, speech_lengths = self.normalize(speech, speech_lengths)
		
        # Forward encoder
        encoder_out, encoder_out_lens, _ = self.encoder.forward_chunk(speech, speech_lengths, cache=cache["encoder"])
        if isinstance(encoder_out, tuple):
            encoder_out = encoder_out[0]
		
        return encoder_out, torch.tensor([encoder_out.size(1)])
	
    def _calc_att_predictor_loss(
        self,
        encoder_out: torch.Tensor,
        encoder_out_lens: torch.Tensor,
        ys_pad: torch.Tensor,
        ys_pad_lens: torch.Tensor,
    ):
        """Compute the decoder (attention) loss and the predictor length loss.

        Steps: run the predictor on the encoder output, optionally build the SCAMA
        cross-attention mask for chunked encoders, optionally mix target embeddings
        into the predictor embeddings with the sampler, run the decoder and
        evaluate the criteria.

        Args:
            encoder_out: encoder output.
            encoder_out_lens: encoder output lengths.
            ys_pad: padded target token ids.
            ys_pad_lens: target lengths.

        Returns:
            Tuple ``(loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att)``.
            ``cer_att`` / ``wer_att`` are None during training or without an error
            calculator; ``pre_loss_att`` is None unless ``sampler_with_grad`` ran.
        """
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device)
        if self.predictor_bias == 1:
            # Only the second output of add_sos_eos is kept as the new target;
            # the target lengths grow by predictor_bias to match.
            _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
            ys_pad_lens = ys_pad_lens + self.predictor_bias
        mask_chunk_predictor = None
        # NOTE(review): overlap_chunk_cls is read without a hasattr() guard here,
        # unlike in forward() -- confirm every encoder used defines the attribute.
        if self.encoder.overlap_chunk_cls is not None:
            mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor(None,
                                                                                           device=encoder_out.device,
                                                                                           batch_size=encoder_out.size(
                                                                                               0))
            mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk(None, device=encoder_out.device,
                                                                                   batch_size=encoder_out.size(0))
            # encoder_out is rebound: everything below sees the masked tensor.
            encoder_out = encoder_out * mask_shfit_chunk
        pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor(encoder_out,
                                                                              ys_pad,
                                                                              encoder_out_mask,
                                                                              ignore_id=self.ignore_id,
                                                                              mask_chunk_predictor=mask_chunk_predictor,
                                                                              target_label_length=ys_pad_lens,
                                                                              )
        predictor_alignments, predictor_alignments_len = self.predictor.gen_frame_alignments(pre_alphas,
                                                                                             encoder_out_lens)

        scama_mask = None
        if self.encoder.overlap_chunk_cls is not None and self.decoder_attention_chunk_type == 'chunk':
            # Chunk-type decoder attention: build the SCAMA mask from the predictor
            # alignments and the current chunk configuration of the encoder.
            encoder_chunk_size = self.encoder.overlap_chunk_cls.chunk_size_pad_shift_cur
            attention_chunk_center_bias = 0
            attention_chunk_size = encoder_chunk_size
            decoder_att_look_back_factor = self.encoder.overlap_chunk_cls.decoder_att_look_back_factor_cur
            mask_shift_att_chunk_decoder = self.encoder.overlap_chunk_cls. \
                get_mask_shift_att_chunk_decoder(None,
                                                 device=encoder_out.device,
                                                 batch_size=encoder_out.size(0)
                                                 )
            scama_mask = self.build_scama_mask_for_cross_attention_decoder_fn(
                predictor_alignments=predictor_alignments,
                encoder_sequence_length=encoder_out_lens,
                chunk_size=1,
                encoder_chunk_size=encoder_chunk_size,
                attention_chunk_center_bias=attention_chunk_center_bias,
                attention_chunk_size=attention_chunk_size,
                attention_chunk_type=self.decoder_attention_chunk_type,
                step=None,
                predictor_mask_chunk_hopping=mask_chunk_predictor,
                decoder_att_look_back_factor=decoder_att_look_back_factor,
                mask_shift_att_chunk_decoder=mask_shift_att_chunk_decoder,
                target_length=ys_pad_lens,
                is_training=self.training,
            )
        elif self.encoder.overlap_chunk_cls is not None:
            # Any other attention chunk type: no mask, the decoder gets the
            # output of remove_chunk instead.
            encoder_out, encoder_out_lens = self.encoder.overlap_chunk_cls.remove_chunk(encoder_out,
                                                                                        encoder_out_lens,
                                                                                        chunk_outs=None)
        # 0. sampler
        decoder_out_1st = None
        pre_loss_att = None
        if self.sampling_ratio > 0.0:
            if self.step_cur < 2:
                logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
            # NOTE(review): use_1st_decoder_loss and sampler_with_grad are not
            # defined in the visible part of this class -- presumably inherited.
            if self.use_1st_decoder_loss:
                sematic_embeds, decoder_out_1st, pre_loss_att = \
                    self.sampler_with_grad(encoder_out, encoder_out_lens, ys_pad,
                                           ys_pad_lens, pre_acoustic_embeds, scama_mask)
            else:
                sematic_embeds, decoder_out_1st = \
                    self.sampler(encoder_out, encoder_out_lens, ys_pad,
                                 ys_pad_lens, pre_acoustic_embeds, scama_mask)
        else:
            if self.step_cur < 2:
                logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
            sematic_embeds = pre_acoustic_embeds

        # 1. Forward decoder
        decoder_outs = self.decoder(
            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, scama_mask
        )
        decoder_out, _ = decoder_outs[0], decoder_outs[1]

        # Without a sampler pass, accuracy and CER/WER use the final decoder output.
        if decoder_out_1st is None:
            decoder_out_1st = decoder_out
        # 2. Compute attention loss
        loss_att = self.criterion_att(decoder_out, ys_pad)
        acc_att = th_accuracy(
            decoder_out_1st.view(-1, self.vocab_size),
            ys_pad,
            ignore_label=self.ignore_id,
        )
        # Predictor loss: target lengths versus predicted token counts.
        loss_pre = self.criterion_pre(ys_pad_lens.type_as(pre_token_length), pre_token_length)

        # Compute cer/wer using attention-decoder
        if self.training or self.error_calculator is None:
            cer_att, wer_att = None, None
        else:
            ys_hat = decoder_out_1st.argmax(dim=-1)
            cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())

        return loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att
	
    def sampler(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds, chunk_mask=None):
		
        tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
        ys_pad_masked = ys_pad * tgt_mask[:, :, 0]
        if self.share_embedding:
            ys_pad_embed = self.decoder.output_layer.weight[ys_pad_masked]
        else:
            ys_pad_embed = self.decoder.embed(ys_pad_masked)
        with torch.no_grad():
            decoder_outs = self.decoder(
                encoder_out, encoder_out_lens, pre_acoustic_embeds, ys_pad_lens, chunk_mask
            )
            decoder_out, _ = decoder_outs[0], decoder_outs[1]
            pred_tokens = decoder_out.argmax(-1)
            nonpad_positions = ys_pad.ne(self.ignore_id)
            seq_lens = (nonpad_positions).sum(1)
            same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1)
            input_mask = torch.ones_like(nonpad_positions)
            bsz, seq_len = ys_pad.size()
            for li in range(bsz):
                target_num = (((seq_lens[li] - same_num[li].sum()).float()) * self.sampling_ratio).long()
                if target_num > 0:
                    input_mask[li].scatter_(dim=0, index=torch.randperm(seq_lens[li])[:target_num].cuda(), value=0)
            input_mask = input_mask.eq(1)
            input_mask = input_mask.masked_fill(~nonpad_positions, False)
            input_mask_expand_dim = input_mask.unsqueeze(2).to(pre_acoustic_embeds.device)
		
        sematic_embeds = pre_acoustic_embeds.masked_fill(~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill(
            input_mask_expand_dim, 0)
        return sematic_embeds * tgt_mask, decoder_out * tgt_mask
	

    def calc_predictor(self, encoder_out, encoder_out_lens):
		
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device)
        mask_chunk_predictor = None
        if self.encoder.overlap_chunk_cls is not None:
            mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor(None,
                                                                                           device=encoder_out.device,
                                                                                           batch_size=encoder_out.size(
                                                                                               0))
            mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk(None, device=encoder_out.device,
                                                                                   batch_size=encoder_out.size(0))
            encoder_out = encoder_out * mask_shfit_chunk
        pre_acoustic_embeds, pre_token_length, pre_alphas, pre_peak_index = self.predictor(encoder_out,
                                                                                           None,
                                                                                           encoder_out_mask,
                                                                                           ignore_id=self.ignore_id,
                                                                                           mask_chunk_predictor=mask_chunk_predictor,
                                                                                           target_label_length=None,
                                                                                           )
        predictor_alignments, predictor_alignments_len = self.predictor.gen_frame_alignments(pre_alphas,
                                                                                             encoder_out_lens + 1 if self.predictor.tail_threshold > 0.0 else encoder_out_lens)
		
        scama_mask = None
        if self.encoder.overlap_chunk_cls is not None and self.decoder_attention_chunk_type == 'chunk':
            encoder_chunk_size = self.encoder.overlap_chunk_cls.chunk_size_pad_shift_cur
            attention_chunk_center_bias = 0
            attention_chunk_size = encoder_chunk_size
            decoder_att_look_back_factor = self.encoder.overlap_chunk_cls.decoder_att_look_back_factor_cur
            mask_shift_att_chunk_decoder = self.encoder.overlap_chunk_cls. \
                get_mask_shift_att_chunk_decoder(None,
                                                 device=encoder_out.device,
                                                 batch_size=encoder_out.size(0)
                                                 )
            scama_mask = self.build_scama_mask_for_cross_attention_decoder_fn(
                predictor_alignments=predictor_alignments,
                encoder_sequence_length=encoder_out_lens,
                chunk_size=1,
                encoder_chunk_size=encoder_chunk_size,
                attention_chunk_center_bias=attention_chunk_center_bias,
                attention_chunk_size=attention_chunk_size,
                attention_chunk_type=self.decoder_attention_chunk_type,
                step=None,
                predictor_mask_chunk_hopping=mask_chunk_predictor,
                decoder_att_look_back_factor=decoder_att_look_back_factor,
                mask_shift_att_chunk_decoder=mask_shift_att_chunk_decoder,
                target_length=None,
                is_training=self.training,
            )
        self.scama_mask = scama_mask
		
        return pre_acoustic_embeds, pre_token_length, pre_alphas, pre_peak_index
	
    def calc_predictor_chunk(self, encoder_out, cache=None):
		
        pre_acoustic_embeds, pre_token_length = \
            self.predictor.forward_chunk(encoder_out, cache["encoder"])
        return pre_acoustic_embeds, pre_token_length
	
    def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens):
        decoder_outs = self.decoder(
            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, self.scama_mask
        )
        decoder_out = decoder_outs[0]
        decoder_out = torch.log_softmax(decoder_out, dim=-1)
        return decoder_out, ys_pad_lens
	
    def cal_decoder_with_predictor_chunk(self, encoder_out, sematic_embeds, cache=None):
        decoder_outs = self.decoder.forward_chunk(
            encoder_out, sematic_embeds, cache["decoder"]
        )
        decoder_out = decoder_outs
        decoder_out = torch.log_softmax(decoder_out, dim=-1)
        return decoder_out

    def generate(self,
                 speech: torch.Tensor,
                 speech_lengths: torch.Tensor,
                 tokenizer=None,
                 **kwargs,
                 ):
		
        is_use_ctc = kwargs.get("ctc_weight", 0.0) > 0.00001 and self.ctc != None
        print(is_use_ctc)
        is_use_lm = kwargs.get("lm_weight", 0.0) > 0.00001 and kwargs.get("lm_file", None) is not None
		
        if self.beam_search is None and (is_use_lm or is_use_ctc):
            logging.info("enable beam_search")
            self.init_beam_search(speech, speech_lengths, **kwargs)
            self.nbest = kwargs.get("nbest", 1)
		
        # Forward Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
        if isinstance(encoder_out, tuple):
            encoder_out = encoder_out[0]
		
        # predictor
        predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens)
        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
                                                                        predictor_outs[2], predictor_outs[3]
        pre_token_length = pre_token_length.round().long()
        if torch.max(pre_token_length) < 1:
            return []
        decoder_outs = self.cal_decoder_with_predictor(encoder_out, encoder_out_lens, pre_acoustic_embeds,
                                                       pre_token_length)
        decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
		
        results = []
        b, n, d = decoder_out.size()
        for i in range(b):
            x = encoder_out[i, :encoder_out_lens[i], :]
            am_scores = decoder_out[i, :pre_token_length[i], :]
            if self.beam_search is not None:
                nbest_hyps = self.beam_search(
                    x=x, am_scores=am_scores, maxlenratio=kwargs.get("maxlenratio", 0.0),
                    minlenratio=kwargs.get("minlenratio", 0.0)
                )
				
                nbest_hyps = nbest_hyps[: self.nbest]
            else:
				
                yseq = am_scores.argmax(dim=-1)
                score = am_scores.max(dim=-1)[0]
                score = torch.sum(score, dim=-1)
                # pad with mask tokens to ensure compatibility with sos/eos tokens
                yseq = torch.tensor(
                    [self.sos] + yseq.tolist() + [self.eos], device=yseq.device
                )
                nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
            for hyp in nbest_hyps:
                assert isinstance(hyp, (Hypothesis)), type(hyp)
				
                # remove sos/eos and get results
                last_pos = -1
                if isinstance(hyp.yseq, list):
                    token_int = hyp.yseq[1:last_pos]
                else:
                    token_int = hyp.yseq[1:last_pos].tolist()
				
                # remove blank symbol id, which is assumed to be 0
                token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
				
                # Change integer-ids to tokens
                token = tokenizer.ids2tokens(token_int)
                text = tokenizer.tokens2text(token)
				
                timestamp = []
				
                results.append((text, token, timestamp))
		
        return results


 funasr/models/paraformer/search.py

New file
@@ -0,0 +1,453 @@
from itertools import chain
import logging
from typing import Any
from typing import Dict
from typing import List
from typing import NamedTuple
from typing import Tuple
from typing import Union

import torch

from funasr.modules.e2e_asr_common import end_detect
from funasr.modules.scorers.scorer_interface import PartialScorerInterface
from funasr.modules.scorers.scorer_interface import ScorerInterface

class Hypothesis(NamedTuple):
    """One beam-search hypothesis: token prefix, total score and per-scorer data."""

    # Token ids of the hypothesis so far.
    yseq: torch.Tensor
    # Accumulated total score.
    score: Union[float, torch.Tensor] = 0
    # Accumulated score per scorer, keyed by scorer name.
    scores: Dict[str, Union[float, torch.Tensor]] = dict()
    # Decoding state per scorer, keyed by scorer name.
    states: Dict[str, Any] = dict()

    def asdict(self) -> dict:
        """Return the fields as a dict, with tensors turned into plain Python values."""
        token_ids = self.yseq.tolist()
        total = float(self.score)
        plain_scores = {}
        for name, value in self.scores.items():
            plain_scores[name] = float(value)
        converted = self._replace(yseq=token_ids, score=total, scores=plain_scores)
        return converted._asdict()


class BeamSearchPara(torch.nn.Module):
    """Beam search implementation."""

    def __init__(
        self,
        scorers: Dict[str, ScorerInterface],
        weights: Dict[str, float],
        beam_size: int,
        vocab_size: int,
        sos: int,
        eos: int,
        token_list: List[str] = None,
        pre_beam_ratio: float = 1.5,
        pre_beam_score_key: str = None,
    ):
        """Initialize beam search.

        Args:
            scorers (dict[str, ScorerInterface]): Dict of decoder modules
                e.g., Decoder, CTCPrefixScorer, LM
                The scorer will be ignored if it is `None`
            weights (dict[str, float]): Dict of weights for each scorers
                The scorer will be ignored if its weight is 0
            beam_size (int): The number of hypotheses kept during search
            vocab_size (int): The number of vocabulary
            sos (int): Start of sequence id
            eos (int): End of sequence id
            token_list (list[str]): List of tokens for debug log
            pre_beam_score_key (str): key of scores to perform pre-beam search
            pre_beam_ratio (float): beam size in the pre-beam search
                will be `int(pre_beam_ratio * beam_size)`

        """
        super().__init__()
        # set scorers
        self.weights = weights
        self.scorers = dict()
        self.full_scorers = dict()
        self.part_scorers = dict()
        # this module dict is required for recursive cast
        # `self.to(device, dtype)` in `recog.py`
        self.nn_dict = torch.nn.ModuleDict()
        for k, v in scorers.items():
            w = weights.get(k, 0)
            if w == 0 or v is None:
                continue
            assert isinstance(
                v, ScorerInterface
            ), f"{k} ({type(v)}) does not implement ScorerInterface"
            self.scorers[k] = v
            if isinstance(v, PartialScorerInterface):
                self.part_scorers[k] = v
            else:
                self.full_scorers[k] = v
            if isinstance(v, torch.nn.Module):
                self.nn_dict[k] = v

        # set configurations
        self.sos = sos
        self.eos = eos
        self.token_list = token_list
        self.pre_beam_size = int(pre_beam_ratio * beam_size)
        self.beam_size = beam_size
        self.n_vocab = vocab_size
        if (
            pre_beam_score_key is not None
            and pre_beam_score_key != "full"
            and pre_beam_score_key not in self.full_scorers
        ):
            raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}")
        self.pre_beam_score_key = pre_beam_score_key
        self.do_pre_beam = (
            self.pre_beam_score_key is not None
            and self.pre_beam_size < self.n_vocab
            and len(self.part_scorers) > 0
        )

    def init_hyp(self, x: torch.Tensor) -> List[Hypothesis]:
        """Get an initial hypothesis data.

        Args:
            x (torch.Tensor): The encoder output feature

        Returns:
            Hypothesis: The initial hypothesis.

        """
        init_states = dict()
        init_scores = dict()
        for k, d in self.scorers.items():
            init_states[k] = d.init_state(x)
            init_scores[k] = 0.0
        return [
            Hypothesis(
                score=0.0,
                scores=init_scores,
                states=init_states,
                yseq=torch.tensor([self.sos], device=x.device),
            )
        ]

    @staticmethod
    def append_token(xs: torch.Tensor, x: int) -> torch.Tensor:
        """Append new token to prefix tokens.

        Args:
            xs (torch.Tensor): The prefix token
            x (int): The new token to append

        Returns:
            torch.Tensor: New tensor contains: xs + [x] with xs.dtype and xs.device

        """
        x = torch.tensor([x], dtype=xs.dtype, device=xs.device)
        return torch.cat((xs, x))

    def score_full(
        self, hyp: Hypothesis, x: torch.Tensor
    ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.full_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            x (torch.Tensor): Corresponding input feature

        Returns:
            Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.full_scorers`
                and tensor score values of shape: `(self.n_vocab,)`,
                and state dict that has string keys
                and state values of `self.full_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.full_scorers.items():
            scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
        return scores, states

    def score_partial(
        self, hyp: Hypothesis, ids: torch.Tensor, x: torch.Tensor
    ) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.part_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            ids (torch.Tensor): 1D tensor of new partial tokens to score
            x (torch.Tensor): Corresponding input feature

        Returns:
            Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.part_scorers`
                and tensor score values of shape: `(len(ids),)`,
                and state dict that has string keys
                and state values of `self.part_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.part_scorers.items():
            scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x)
        return scores, states

    def beam(
        self, weighted_scores: torch.Tensor, ids: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute topk full token ids and partial token ids.

        Args:
            weighted_scores (torch.Tensor): The weighted sum scores for each tokens.
            Its shape is `(self.n_vocab,)`.
            ids (torch.Tensor): The partial token ids to compute topk

        Returns:
            Tuple[torch.Tensor, torch.Tensor]:
                The topk full token ids and partial token ids.
                Their shapes are `(self.beam_size,)`

        """
        # no pre beam performed
        if weighted_scores.size(0) == ids.size(0):
            top_ids = weighted_scores.topk(self.beam_size)[1]
            return top_ids, top_ids

        # mask pruned in pre-beam not to select in topk
        tmp = weighted_scores[ids]
        weighted_scores[:] = -float("inf")
        weighted_scores[ids] = tmp
        top_ids = weighted_scores.topk(self.beam_size)[1]
        local_ids = weighted_scores[ids].topk(self.beam_size)[1]
        return top_ids, local_ids

    @staticmethod
    def merge_scores(
        prev_scores: Dict[str, float],
        next_full_scores: Dict[str, torch.Tensor],
        full_idx: int,
        next_part_scores: Dict[str, torch.Tensor],
        part_idx: int,
    ) -> Dict[str, torch.Tensor]:
        """Merge scores for new hypothesis.

        Args:
            prev_scores (Dict[str, float]):
                The previous hypothesis scores by `self.scorers`
            next_full_scores (Dict[str, torch.Tensor]): scores by `self.full_scorers`
            full_idx (int): The next token id for `next_full_scores`
            next_part_scores (Dict[str, torch.Tensor]):
                scores of partial tokens by `self.part_scorers`
            part_idx (int): The new token id for `next_part_scores`

        Returns:
            Dict[str, torch.Tensor]: The new score dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are scalar tensors by the scorers.

        """
        new_scores = dict()
        for k, v in next_full_scores.items():
            new_scores[k] = prev_scores[k] + v[full_idx]
        for k, v in next_part_scores.items():
            new_scores[k] = prev_scores[k] + v[part_idx]
        return new_scores

    def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
        """Merge states for new hypothesis.

        Args:
            states: states of `self.full_scorers`
            part_states: states of `self.part_scorers`
            part_idx (int): The new token id for `part_scores`

        Returns:
            Dict[str, torch.Tensor]: The new score dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are states of the scorers.

        """
        new_states = dict()
        for k, v in states.items():
            new_states[k] = v
        for k, d in self.part_scorers.items():
            new_states[k] = d.select_state(part_states[k], part_idx)
        return new_states

    def search(
        self, running_hyps: List[Hypothesis], x: torch.Tensor, am_score: torch.Tensor
    ) -> List[Hypothesis]:
        """Search new tokens for running hypotheses and encoded speech x.

        Each running hypothesis is extended by one token. Candidate scores are
        `am_score` plus the weighted full-scorer scores, plus the weighted
        partial-scorer scores on the candidate ids, plus the hypothesis score.

        Args:
            running_hyps (List[Hypothesis]): Running hypotheses on beam
            x (torch.Tensor): Encoded speech feature (T, D)
            am_score (torch.Tensor): Scores for the current step, added to a
                tensor of length `self.n_vocab`

        Returns:
            List[Hypotheses]: Best sorted hypotheses

        """
        best_hyps = []
        # default candidate set: every vocabulary id (replaced below when pre-beam is on)
        part_ids = torch.arange(self.n_vocab, device=x.device)  # no pre-beam
        for hyp in running_hyps:
            # scoring
            weighted_scores = torch.zeros(self.n_vocab, dtype=x.dtype, device=x.device)
            weighted_scores += am_score
            scores, states = self.score_full(hyp, x)
            for k in self.full_scorers:
                weighted_scores += self.weights[k] * scores[k]
            # partial scoring
            if self.do_pre_beam:
                # choose which scores rank the candidates kept for partial scoring
                pre_beam_scores = (
                    weighted_scores
                    if self.pre_beam_score_key == "full"
                    else scores[self.pre_beam_score_key]
                )
                part_ids = torch.topk(pre_beam_scores, self.pre_beam_size)[1]
            part_scores, part_states = self.score_partial(hyp, part_ids, x)
            for k in self.part_scorers:
                weighted_scores[part_ids] += self.weights[k] * part_scores[k]
            # add previous hyp score
            weighted_scores += hyp.score

            # update hyps
            # j indexes the vocabulary, part_j indexes part_ids (see `beam`)
            for j, part_j in zip(*self.beam(weighted_scores, part_ids)):
                # will be (2 x beam at most)
                best_hyps.append(
                    Hypothesis(
                        score=weighted_scores[j],
                        yseq=self.append_token(hyp.yseq, j),
                        scores=self.merge_scores(
                            hyp.scores, scores, j, part_scores, part_j
                        ),
                        states=self.merge_states(states, part_states, part_j),
                    )
                )

            # sort and prune 2 x beam -> beam
            # (done after every expanded hypothesis, so the list stays bounded)
            best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[
                : min(len(best_hyps), self.beam_size)
            ]
        return best_hyps

    def forward(
        self, x: torch.Tensor, am_scores: torch.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
    ) -> List[Hypothesis]:
        """Perform beam search.

        Args:
            x (torch.Tensor): Encoded speech feature (T, D)
            maxlenratio (float): Input length ratio to obtain max output length.
                If maxlenratio=0.0 (default), it uses a end-detect function
                to automatically find maximum hypothesis lengths
                If maxlenratio<0.0, its absolute value is interpreted
                as a constant max output length.
            minlenratio (float): Input length ratio to obtain min output length.

        Returns:
            list[Hypothesis]: N-best decoding results

        """
        # set length bounds
        maxlen = am_scores.shape[0]
        logging.info("decoder input length: " + str(x.shape[0]))
        logging.info("max output length: " + str(maxlen))

        # main loop of prefix search
        running_hyps = self.init_hyp(x)
        ended_hyps = []
        for i in range(maxlen):
            logging.debug("position " + str(i))
            best = self.search(running_hyps, x, am_scores[i])
            # post process of one iteration
            running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)
            # end detection
            if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
                logging.info(f"end detected at {i}")
                break
            if len(running_hyps) == 0:
                logging.info("no hypothesis. Finish decoding.")
                break
            else:
                logging.debug(f"remained hypotheses: {len(running_hyps)}")

        nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
        # check the number of hypotheses reaching to eos
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, perform recognition "
                "again with smaller minlenratio."
            )
            return (
                []
                if minlenratio < 0.1
                else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))
            )

        # report the best result
        best = nbest_hyps[0]
        for k, v in best.scores.items():
            logging.info(
                f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
            )
        logging.info(f"total log probability: {best.score:.2f}")
        logging.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
        logging.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
        if self.token_list is not None:
            logging.info(
                "best hypo: "
                + "".join([self.token_list[x.item()] for x in best.yseq[1:-1]])
                + "\n"
            )
        return nbest_hyps

    def post_process(
        self,
        i: int,
        maxlen: int,
        maxlenratio: float,
        running_hyps: List[Hypothesis],
        ended_hyps: List[Hypothesis],
    ) -> List[Hypothesis]:
        """Perform post-processing of beam search iterations.

        Args:
            i (int): The length of hypothesis tokens.
            maxlen (int): The maximum length of tokens in beam search.
            maxlenratio (int): The maximum length ratio in beam search.
            running_hyps (List[Hypothesis]): The running hypotheses in beam search.
            ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.

        Returns:
            List[Hypothesis]: The new running hypotheses.

        """
        logging.debug(f"the number of running hypotheses: {len(running_hyps)}")
        if self.token_list is not None:
            logging.debug(
                "best hypo: "
                + "".join([self.token_list[x.item()] for x in running_hyps[0].yseq[1:]])
            )
        # add eos in the final loop to avoid that there are no ended hyps
        if i == maxlen - 1:
            logging.info("adding <eos> in the last position in the loop")
            running_hyps = [
                h._replace(yseq=self.append_token(h.yseq, self.eos))
                for h in running_hyps
            ]

        # add ended hypotheses to a final list, and removed them from current hypotheses
        # (this will be a problem, number of hyps < beam)
        remained_hyps = []
        for hyp in running_hyps:
            if hyp.yseq[-1] == self.eos:
                # e.g., Word LM needs to add final <eos> score
                for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
                    s = d.final_score(hyp.states[k])
                    hyp.scores[k] += s
                    hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
                ended_hyps.append(hyp)
            else:
                remained_hyps.append(hyp)
        return remained_hyps

 funasr/tokenizer/abs_tokenizer.py

@@ -66,7 +66,9 @@
        return text_ints
    
    def decode(self, text_ints):
        return self.ids2tokens(text_ints)
        token = self.ids2tokens(text_ints)
        text = self.tokens2text(token)
        return text
    
    def get_num_vocabulary_size(self) -> int:
        """Return the number of entries in `self.token_list`."""
        return len(self.token_list)

 funasr/utils/download_from_hub.py

@@ -11,10 +11,10 @@
    return kwargs

def download_fr_ms(**kwargs):
    model_or_path = kwargs.get("model_pretrain")
    model_revision = kwargs.get("model_pretrain_revision")
    model_or_path = kwargs.get("model")
    model_revision = kwargs.get("model_revision")
    if not os.path.exists(model_or_path):
        model_or_path = get_or_download_model_dir(model_or_path, model_revision, third_party="funasr")
        model_or_path = get_or_download_model_dir(model_or_path, model_revision, is_training=kwargs.get("is_training"))
    
    config = os.path.join(model_or_path, "config.yaml")
    assert os.path.exists(config), "{} is not exist!".format(config)
@@ -23,25 +23,29 @@
    init_param = os.path.join(model_or_path, "model.pb")
    kwargs["init_param"] = init_param
    kwargs["token_list"] = os.path.join(model_or_path, "tokens.txt")
    kwargs["model"] = cfg["model"]
    kwargs["frontend_conf"]["cmvn_file"] = os.path.join(model_or_path, "am.mvn")
    
    return kwargs

def get_or_download_model_dir(
                              model,
                              model_revision=None,
                              third_party=None):
                              is_training=False,
    ):
    """ Get local model directory or download model if necessary.

    Args:
        model (str): model id or path to local model directory.
        model_revision  (str, optional): model version number.
        third_party (str, optional): in which third party library
            this function is called.
        :param is_training:
    """
    from modelscope.hub.check_model import check_local_model_is_latest
    from modelscope.hub.snapshot_download import snapshot_download

    from modelscope.utils.constant import Invoke, ThirdParty
	
    key = Invoke.LOCAL_TRAINER if is_training else Invoke.PIPELINE
    
    if os.path.exists(model):
        model_cache_dir = model if os.path.isdir(
@@ -49,15 +53,15 @@
        check_local_model_is_latest(
            model_cache_dir,
            user_agent={
                Invoke.KEY: Invoke.LOCAL_TRAINER,
                ThirdParty.KEY: third_party
                Invoke.KEY: key,
                ThirdParty.KEY: "funasr"
            })
    else:
        model_cache_dir = snapshot_download(
            model,
            revision=model_revision,
            user_agent={
                Invoke.KEY: Invoke.TRAINER,
                ThirdParty.KEY: third_party
                Invoke.KEY: key,
                ThirdParty.KEY: "funasr"
            })
    return model_cache_dir

			@@ -21,3 +21,4 @@
			modelscope
			samples
			.ipynb_checkpoints
			outputs*

New file
			@@ -0,0 +1,15 @@

			cmd="funasr/bin/inference.py"

			python $cmd \
			+model="/Users/zhifu/modelscope_models/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404" \
			+input="/Users/zhifu/Downloads/asr_example.wav" \
			+output_dir="/Users/zhifu/Downloads/ckpt/funasr2/exp2" \
			+device="cpu" \
			+"hotword='达魔院魔搭'"

			#+input="/Users/zhifu/funasr_github/test_local/asr_example.wav" \
			#+input="/Users/zhifu/funasr_github/test_local/aishell2_dev_ios/asr_task_debug_len.jsonl" \
			#+model="/Users/zhifu/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \

			#+model="/Users/zhifu/modelscope_models/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \

			@@ -2,7 +2,7 @@
			cmd="funasr/cli/train_cli.py"

			python $cmd \
			+model_pretrain="/Users/zhifu/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
			+model="/Users/zhifu/modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
			+token_list="/Users/zhifu/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/tokens.txt" \
			+train_data_set_list="/Users/zhifu/funasr_github/test_local/aishell2_dev_ios/asr_task_debug_len.jsonl" \
			+output_dir="/Users/zhifu/Downloads/ckpt/funasr2/exp2" \

			@@ -7,4 +7,4 @@
			with open(version_file, "r") as f:
			__version__ = f.read().strip()

			from funasr.bin.inference_cli import infer
			from funasr.bin.inference import infer

			@@ -1254,37 +1254,6 @@

			return cache

			#def _prepare_cache(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
			# if len(cache) > 0:
			# return cache
			# config = _read_yaml(asr_train_config)
			# enc_output_size = config["encoder_conf"]["output_size"]
			# feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
			# cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
			# "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
			# "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
			# cache["encoder"] = cache_en

			# cache_de = {"decode_fsmn": None}
			# cache["decoder"] = cache_de

			# return cache

			#def _cache_reset(cache: dict = {}, chunk_size=[5, 10, 5], batch_size=1):
			# if len(cache) > 0:
			# config = _read_yaml(asr_train_config)
			# enc_output_size = config["encoder_conf"]["output_size"]
			# feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
			# cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
			# "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
			# "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)),
			# "tail_chunk": False}
			# cache["encoder"] = cache_en

			# cache_de = {"decode_fsmn": None}
			# cache["decoder"] = cache_de

			# return cache

			def _forward(
			data_path_and_name_and_type,

New file
			@@ -0,0 +1,170 @@
			import os.path

			import torch
			import numpy as np
			import hydra
			import json
			from omegaconf import DictConfig, OmegaConf
			from funasr.utils.dynamic_import import dynamic_import
			import logging
			from funasr.utils.download_from_hub import download_model
			from funasr.torch_utils.set_all_random_seed import set_all_random_seed
			from funasr.tokenizer.funtoken import build_tokenizer
			from funasr.datasets.fun_datasets.load_audio_extract_fbank import load_bytes
			from funasr.torch_utils.device_funcs import to_device
			from tqdm import tqdm
			from funasr.torch_utils.load_pretrained_model import load_pretrained_model
			import time
			import random
			import string

			@hydra.main(config_name=None, version_base=None)
			def main_hydra(kwargs: DictConfig):
			    """Hydra entry point: build the pipeline from the config and print its result.

			    The config must contain `model`; `input` is read and passed to the
			    callable returned by `infer`.
			    """
			    assert "model" in kwargs

			    run_pipeline = infer(**kwargs)
			    print(run_pipeline(input=kwargs["input"]))

			def infer(**kwargs):
			    """Build tokenizer and model from `kwargs` and return an inference callable.

			    Steps performed here:
			      * when `kwargs["model"]` contains no ":", `download_model` is called
			        and its return value replaces `kwargs`;
			      * the random seed is set from `kwargs["seed"]` (default 0);
			      * the device is taken from `kwargs["device"]` (default "cuda") and
			        forced to "cpu" when CUDA is unavailable or `ngpu` is 0;
			      * the tokenizer and the model class named by `kwargs["model"]` are
			        built, the model is put in eval mode and moved to the device;
			      * parameters are loaded from `kwargs["init_param"]` when given.

			    Returns:
			        A function `_forward(input, input_len=None, **cfg)` that expands
			        `input` with `build_iter_for_infer`, calls `model.generate` once per
			        batch of size 1 and returns the list of per-batch results.
			    """
			    if ":" not in kwargs["model"]:
			        logging.info("download models from model hub: {}".format(kwargs.get("model_hub", "ms")))
			        kwargs = download_model(**kwargs)

			    set_all_random_seed(kwargs.get("seed", 0))

			    device = kwargs.get("device", "cuda")
			    # Only fall back to CPU when CUDA is missing or GPUs are explicitly
			    # disabled; testing the truthiness of `ngpu` (default 1) forced CPU always.
			    if not torch.cuda.is_available() or kwargs.get("ngpu", 1) == 0:
			        device = "cpu"
			    batch_size = 1
			    kwargs["device"] = device

			    # build_tokenizer
			    tokenizer = build_tokenizer(
			        token_type=kwargs.get("token_type", "char"),
			        bpemodel=kwargs.get("bpemodel", None),
			        delimiter=kwargs.get("delimiter", None),
			        space_symbol=kwargs.get("space_symbol", "<space>"),
			        non_linguistic_symbols=kwargs.get("non_linguistic_symbols", None),
			        g2p_type=kwargs.get("g2p_type", None),
			        token_list=kwargs.get("token_list", None),
			        unk_symbol=kwargs.get("unk_symbol", "<unk>"),
			    )

			    # build model
			    # NOTE(review): the top-level config and `model_conf` are unpacked as
			    # keyword arguments here — confirm against the model constructors.
			    model_class = dynamic_import(kwargs.get("model"))
			    model = model_class(**kwargs, **kwargs["model_conf"], vocab_size=len(tokenizer.token_list))
			    model.eval()
			    model.to(device)
			    frontend = model.frontend
			    kwargs["token_list"] = tokenizer.token_list

			    # init_param
			    init_param = kwargs.get("init_param", None)
			    if init_param is not None:
			        logging.info(f"Loading pretrained params from {init_param}")
			        load_pretrained_model(
			            model=model,
			            init_param=init_param,
			            ignore_init_mismatch=kwargs.get("ignore_init_mismatch", False),
			            oss_bucket=kwargs.get("oss_bucket", None),
			        )

			    def _forward(input, input_len=None, **cfg):
			        # per-call options override the options captured at build time
			        cfg = OmegaConf.merge(kwargs, cfg)
			        date_type = cfg.get("date_type", "sound")

			        key_list, data_list = build_iter_for_infer(input, input_len=input_len, date_type=date_type, frontend=frontend)

			        speed_stats = {}
			        asr_result_list = []
			        num_samples = len(data_list)
			        pbar = tqdm(colour="blue", total=num_samples, dynamic_ncols=True)
			        for beg_idx in range(0, num_samples, batch_size):
			            end_idx = min(num_samples, beg_idx + batch_size)
			            data_batch = data_list[beg_idx:end_idx]
			            key_batch = key_list[beg_idx:end_idx]
			            batch = {"data_in": data_batch, "key": key_batch}

			            time1 = time.perf_counter()
			            # batch entries and merged options are passed as keyword arguments
			            results, meta_data = model.generate(**batch, tokenizer=tokenizer, **cfg)
			            time2 = time.perf_counter()

			            asr_result_list.append(results)
			            pbar.update(1)

			            # batch_data_time = time_per_frame_s * data_batch_i["speech_lengths"].sum().item()
			            batch_data_time = meta_data.get("batch_data_time", -1)
			            speed_stats["load_data"] = meta_data["load_data"]
			            speed_stats["extract_feat"] = meta_data["extract_feat"]
			            speed_stats["forward"] = f"{time2 - time1:0.3f}"
			            speed_stats["rtf"] = f"{(time2 - time1)/batch_data_time:0.3f}"
			            description = (
			                f"{speed_stats}, "
			            )
			            pbar.set_description(description)

			        torch.cuda.empty_cache()
			        return asr_result_list

			    return _forward


			def build_iter_for_infer(data_in, input_len=None, date_type="sound", frontend=None):
			    """Expand an inference input into parallel lists of keys and data items.

			    Supported inputs:
			      * path to an existing list file (`.scp`, `.txt`, `.json`, `.jsonl`):
			        one item per non-blank line. `.jsonl` lines are JSON objects whose
			        `source` field is the data; other files hold `key data` or just
			        `data` per line;
			      * path to any other existing file: a single item, the path itself;
			      * list / tuple: the items are used as given;
			      * anything else: a single item (bytes are first decoded by `load_bytes`).

			    Items without an explicit key receive a random `rand_key_...` key.

			    :param data_in: the input described above
			    :param input_len: not used by this function
			    :param date_type: not used by this function
			    :param frontend: not used by this function
			    :return: `(key_list, data_list)`
			    """
			    data_list = []
			    key_list = []
			    filelist = [".scp", ".txt", ".json", ".jsonl"]

			    chars = string.ascii_letters + string.digits

			    def _random_key():
			        # Fallback identifier for items that carry no key of their own.
			        return "rand_key_" + ''.join(random.choice(chars) for _ in range(13))

			    if isinstance(data_in, str) and os.path.exists(data_in):  # wav path or list file
			        _, file_extension = os.path.splitext(data_in)
			        file_extension = file_extension.lower()
			        if file_extension in filelist:  # filelist: wav.scp, file.jsonl, text.txt
			            with open(data_in, encoding='utf-8') as fin:
			                for line in fin:
			                    line = line.strip()
			                    if not line:
			                        # blank lines carry no item; indexing them would fail
			                        continue
			                    key = _random_key()
			                    if data_in.endswith(".jsonl"):  # file.jsonl: json.dumps({"source": data})
			                        record = json.loads(line)
			                        data = record["source"]
			                        # Only look inside `data` when it is a mapping: on a plain
			                        # string, `"key" in data` is a substring test and
			                        # `data["key"]` raises TypeError.
			                        if isinstance(data, dict) and "key" in data:
			                            key = data["key"]
			                        elif "key" in record:
			                            key = record["key"]
			                    else:  # filelist, wav.scp, text.txt: "id data" or "data"
			                        fields = line.split()
			                        if len(fields) > 1:
			                            key, data = fields[0], fields[1]
			                        else:
			                            data = fields[0]

			                    data_list.append(data)
			                    key_list.append(key)
			        else:
			            data_list = [data_in]
			            key_list = [_random_key()]
			    elif isinstance(data_in, (list, tuple)):  # [audio sample point, fbank, wav_path]
			        data_list = data_in
			        key_list = [_random_key() for _ in range(len(data_in))]
			    else:  # raw text; audio sample point, fbank
			        if isinstance(data_in, bytes):  # audio bytes
			            data_in = load_bytes(data_in)
			        data_list = [data_in]
			        key_list = [_random_key()]

			    return key_list, data_list


			if __name__ == '__main__':
			main_hydra()

			@@ -35,8 +35,9 @@
			@hydra.main(config_name=None, version_base=None)
			def main_hydra(kwargs: DictConfig):
			import pdb; pdb.set_trace()
			if kwargs.get("model_pretrain"):
			kwargs = download_model(**kwargs)
			if ":" in kwargs["model"]:
			logging.info("download models from model hub: {}".format(kwargs.get("model_hub", "ms")))
			kwargs = download_model(is_training=kwargs.get("is_training", True), **kwargs)

			import pdb;
			pdb.set_trace()
			@@ -84,8 +85,7 @@
			# init_param
			init_param = kwargs.get("init_param", None)
			if init_param is not None:
			init_param = init_param
			if isinstance(init_param, Sequence):
			if not isinstance(init_param, Sequence):
			init_param = (init_param,)
			logging.info("init_param is not None: %s", init_param)
			for p in init_param:

			@@ -8,33 +8,7 @@
			import time
			import logging

			def load_audio(audio_path: str, fs: int=16000):
			audio = None
			if audio_path.startswith("oss:"):
			pass
			elif audio_path.startswith("odps:"):
			pass
			else:
			if ".ark:" in audio_path:
			audio = kaldiio.load_mat(audio_path)
			else:
			# audio, fs = librosa.load(audio_path, sr=fs)
			audio, fs = torchaudio.load(audio_path)
			audio = audio[0, :]
			return audio

			def extract_features(data, date_type: str="sound", frontend=None):
			if date_type == "sound":

			if isinstance(data, np.ndarray):
			data = torch.from_numpy(data).to(torch.float32)
			data_len = torch.tensor([data.shape[0]]).to(torch.int32)
			feat, feats_lens = frontend(data[None, :], data_len)

			feat = feat[0, :, :]
			else:
			feat, feats_lens = torch.from_numpy(data).to(torch.float32), torch.tensor([data.shape[0]]).to(torch.int32)
			return feat, feats_lens
			from funasr.datasets.fun_datasets.load_audio_extract_fbank import load_audio, extract_fbank



			@@ -115,17 +89,16 @@

			def __getitem__(self, index):
			item = self.indexed_dataset[index]
			# return item

			source = item["source"]
			data_src = load_audio(source, fs=self.fs)
			speech, speech_lengths = extract_features(data_src, self.data_type, self.frontend)
			speech, speech_lengths = extract_fbank(data_src, self.data_type, self.frontend) # speech: [b, T, d]
			target = item["target"]
			ids = self.tokenizer.encode(target)
			ids_lengths = len(ids)
			text, text_lengths = torch.tensor(ids, dtype=torch.int64), torch.tensor([ids_lengths], dtype=torch.int32)

			return {"speech": speech,
			return {"speech": speech[0, :, :],
			"speech_lengths": speech_lengths,
			"text": text,
			"text_lengths": text_lengths,

New file
			@@ -0,0 +1,75 @@
			import os
			import torch
			import json
			import torch.distributed as dist
			import numpy as np
			import kaldiio
			import librosa
			import torchaudio
			import time
			import logging
			from torch.nn.utils.rnn import pad_sequence

			def load_audio(audio_or_path_or_list, fs: int=16000, audio_fs: int=16000):
			    """Load audio from a path, an array, or a list of either, resampling to `fs`.

			    Args:
			        audio_or_path_or_list: a path to an existing audio file, a numpy
			            array of samples, or a list / tuple of such items (handled
			            recursively). Other values are passed through.
			        fs: target sample rate.
			        audio_fs: sample rate of in-memory audio; for files it is replaced
			            by the rate reported by `torchaudio.load`.

			    Returns:
			        The first channel of the file, or the squeezed array; a list of
			        such results for list / tuple input. When `audio_fs` differs from
			        `fs` the resampled waveform is returned as a torch tensor.
			    """
			    if isinstance(audio_or_path_or_list, (list, tuple)):
			        return [load_audio(audio, fs=fs, audio_fs=audio_fs) for audio in audio_or_path_or_list]

			    if isinstance(audio_or_path_or_list, str) and os.path.exists(audio_or_path_or_list):
			        audio_or_path_or_list, audio_fs = torchaudio.load(audio_or_path_or_list)
			        audio_or_path_or_list = audio_or_path_or_list[0, :]
			    elif isinstance(audio_or_path_or_list, np.ndarray):  # audio sample point
			        audio_or_path_or_list = np.squeeze(audio_or_path_or_list)  # [n_samples,]

			    if audio_fs != fs:
			        if isinstance(audio_or_path_or_list, np.ndarray):
			            # the resampler is a torch module and needs a float tensor
			            audio_or_path_or_list = torch.from_numpy(audio_or_path_or_list).to(torch.float32)
			        resampler = torchaudio.transforms.Resample(audio_fs, fs)
			        # Return the resampled waveform; it used to be computed and then dropped.
			        audio_or_path_or_list = resampler(audio_or_path_or_list[None, :])[0, :]
			    return audio_or_path_or_list
			#
			# def load_audio_from_list(audio_list, fs: int=16000, audio_fs: int=16000):
			# if isinstance(audio_list, (list, tuple)):
			# return [load_audio(audio_or_path, fs=fs, audio_fs=audio_fs) for audio_or_path in audio_list]


			def load_bytes(input):
			    """Decode a buffer of 16-bit integer samples into a float32 array.

			    Args:
			        input: Bytes-like object interpreted as native-endian ``int16``.

			    Returns:
			        np.ndarray: float32 samples, each divided by ``2 ** 15``.

			    """
			    pcm = np.asarray(np.frombuffer(input, dtype=np.int16))
			    if pcm.dtype.kind not in 'iu':
			        raise TypeError("'middle_data' must be an array of integers")
			    target_dtype = np.dtype('float32')
			    if target_dtype.kind != 'f':
			        raise TypeError("'dtype' must be a floating point type")

			    info = np.iinfo(pcm.dtype)
			    half_range = 2 ** (info.bits - 1)
			    zero_point = info.min + half_range
			    scaled = (pcm.astype(target_dtype) - zero_point) / half_range
			    return np.frombuffer(scaled, dtype=np.float32)

			def extract_fbank(data, data_len = None, date_type: str="sound", frontend=None):
			    """Batch the input and, for audio, run it through the frontend.

			    Args:
			        data: A ``np.ndarray`` or ``torch.Tensor`` (a 1-D input gets a leading
			            batch axis), or a list/tuple of 1-D arrays/tensors that are
			            zero-padded into one ``[batch, N]`` tensor.
			        data_len: Lengths matching ``data``. When ``None`` for array/tensor
			            input it defaults to ``[data.shape[1]]``; for list/tuple input it
			            is always recomputed from the items.
			        date_type (str): ``"sound"`` sends the batch through ``frontend``.
			        frontend: Callable invoked as ``frontend(data, data_len)`` that
			            returns ``(features, lengths)``; only used for ``"sound"``.

			    Returns:
			        Tuple ``(data, data_len)`` cast to float32 and int32 respectively.
			        A list/tuple ``data_len`` is wrapped as ``torch.tensor([data_len])``.

			    """
			    if isinstance(data, np.ndarray):
			        data = torch.from_numpy(data)

			    if isinstance(data, torch.Tensor):
			        # Check the rank, not the first-axis size: a 1-D waveform needs a batch axis.
			        if data.dim() < 2:
			            data = data[None, :]  # data: [batch, N]
			        if data_len is None:
			            data_len = [data.shape[1]]
			    elif isinstance(data, (list, tuple)):
			        data_list, data_len = [], []
			        for data_i in data:
			            # Convert each item (not the container) so pad_sequence receives tensors.
			            if isinstance(data_i, np.ndarray):
			                data_i = torch.from_numpy(data_i)
			            data_list.append(data_i)
			            data_len.append(data_i.shape[0])
			        data = pad_sequence(data_list, batch_first=True)  # data: [batch, N]

			    if date_type == "sound":
			        data, data_len = frontend(data, data_len)

			    if isinstance(data_len, (list, tuple)):
			        data_len = torch.tensor([data_len])
			    return data.to(torch.float32), data_len.to(torch.int32)

			@@ -116,7 +116,7 @@
			def forward(
			self,
			input: torch.Tensor,
			input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
			input_lengths) -> Tuple[torch.Tensor, torch.Tensor]:
			batch_size = input.size(0)
			feats = []
			feats_lens = []

New file
			@@ -0,0 +1,453 @@
			from itertools import chain
			import logging
			from typing import Any
			from typing import Dict
			from typing import List
			from typing import NamedTuple
			from typing import Tuple
			from typing import Union

			import torch

			from funasr.modules.e2e_asr_common import end_detect
			from funasr.modules.scorers.scorer_interface import PartialScorerInterface
			from funasr.modules.scorers.scorer_interface import ScorerInterface

			class Hypothesis(NamedTuple):
			    """A single beam-search hypothesis."""

			    # token id sequence of the hypothesis
			    yseq: torch.Tensor
			    # total score
			    score: Union[float, torch.Tensor] = 0
			    # score per scorer name
			    scores: Dict[str, Union[float, torch.Tensor]] = dict()
			    # state per scorer name
			    states: Dict[str, Any] = dict()

			    def asdict(self) -> dict:
			        """Return the fields as a dict with tokens and scores turned into plain Python values."""
			        token_ids = self.yseq.tolist()
			        total = float(self.score)
			        per_scorer = {}
			        for name, value in self.scores.items():
			            per_scorer[name] = float(value)
			        plain = self._replace(yseq=token_ids, score=total, scores=per_scorer)
			        return plain._asdict()


			class BeamSearchPara(torch.nn.Module):
			"""Beam search implementation."""

			def __init__(
			    self,
			    scorers: Dict[str, ScorerInterface],
			    weights: Dict[str, float],
			    beam_size: int,
			    vocab_size: int,
			    sos: int,
			    eos: int,
			    token_list: List[str] = None,
			    pre_beam_ratio: float = 1.5,
			    pre_beam_score_key: str = None,
			):
			    """Set up scorers and search configuration.

			    Args:
			        scorers (dict[str, ScorerInterface]): Scoring modules by name.
			            Entries that are `None` are skipped.
			        weights (dict[str, float]): Weight of each scorer by name.
			            Scorers with a missing or zero weight are skipped.
			        beam_size (int): Number of hypotheses kept at each step.
			        vocab_size (int): Vocabulary size.
			        sos (int): Start-of-sequence token id.
			        eos (int): End-of-sequence token id.
			        token_list (list[str]): Token strings, used only for log output.
			        pre_beam_ratio (float): The pre-beam keeps
			            `int(pre_beam_ratio * beam_size)` candidates.
			        pre_beam_score_key (str): Name of the full scorer whose scores drive
			            the pre-beam, or "full" to use the weighted sum.

			    Raises:
			        KeyError: If `pre_beam_score_key` is neither `None`, "full", nor the
			            name of a registered full scorer.

			    """
			    super().__init__()
			    self.weights = weights
			    self.scorers = dict()
			    self.full_scorers = dict()
			    self.part_scorers = dict()
			    # Module-type scorers are also kept in a ModuleDict so that
			    # `self.to(device, dtype)` reaches them recursively.
			    self.nn_dict = torch.nn.ModuleDict()
			    for name, scorer in scorers.items():
			        if scorer is None or weights.get(name, 0) == 0:
			            continue
			        assert isinstance(
			            scorer, ScorerInterface
			        ), f"{name} ({type(scorer)}) does not implement ScorerInterface"
			        self.scorers[name] = scorer
			        target = (
			            self.part_scorers
			            if isinstance(scorer, PartialScorerInterface)
			            else self.full_scorers
			        )
			        target[name] = scorer
			        if isinstance(scorer, torch.nn.Module):
			            self.nn_dict[name] = scorer

			    # search configuration
			    self.sos = sos
			    self.eos = eos
			    self.token_list = token_list
			    self.pre_beam_size = int(pre_beam_ratio * beam_size)
			    self.beam_size = beam_size
			    self.n_vocab = vocab_size

			    key_is_usable = (
			        pre_beam_score_key is None
			        or pre_beam_score_key == "full"
			        or pre_beam_score_key in self.full_scorers
			    )
			    if not key_is_usable:
			        raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}")
			    self.pre_beam_score_key = pre_beam_score_key
			    # The pre-beam only runs when a key is given, it would actually prune
			    # the vocabulary, and there is at least one partial scorer.
			    self.do_pre_beam = (
			        self.pre_beam_score_key is not None
			        and self.pre_beam_size < self.n_vocab
			        and len(self.part_scorers) > 0
			    )

			def init_hyp(self, x: torch.Tensor) -> List[Hypothesis]:
			    """Build the starting hypothesis list.

			    Args:
			        x (torch.Tensor): The encoder output feature

			    Returns:
			        List[Hypothesis]: A one-element list holding a hypothesis that
			            contains only `sos`, with a zero score and a fresh state for
			            every registered scorer.

			    """
			    init_states = {
			        name: scorer.init_state(x) for name, scorer in self.scorers.items()
			    }
			    init_scores = {name: 0.0 for name in self.scorers}
			    start = Hypothesis(
			        score=0.0,
			        scores=init_scores,
			        states=init_states,
			        yseq=torch.tensor([self.sos], device=x.device),
			    )
			    return [start]

			@staticmethod
			def append_token(xs: torch.Tensor, x: int) -> torch.Tensor:
			"""Append new token to prefix tokens.

			Args:
			xs (torch.Tensor): The prefix token
			x (int): The new token to append

			Returns:
			torch.Tensor: New tensor contains: xs + [x] with xs.dtype and xs.device

			"""
			x = torch.tensor([x], dtype=xs.dtype, device=xs.device)
			return torch.cat((xs, x))

			def score_full(
			    self, hyp: Hypothesis, x: torch.Tensor
			) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
			    """Run every full scorer on one hypothesis.

			    Args:
			        hyp (Hypothesis): Hypothesis whose prefix tokens are scored
			        x (torch.Tensor): Corresponding input feature

			    Returns:
			        Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Two dicts keyed by
			            the names in `self.full_scorers`: the scores and the new states
			            returned by each scorer's `score` call.

			    """
			    scores, states = {}, {}
			    for name, scorer in self.full_scorers.items():
			        token_scores, next_state = scorer.score(hyp.yseq, hyp.states[name], x)
			        scores[name] = token_scores
			        states[name] = next_state
			    return scores, states

			def score_partial(
			    self, hyp: Hypothesis, ids: torch.Tensor, x: torch.Tensor
			) -> Tuple[Dict[str, torch.Tensor], Dict[str, Any]]:
			    """Run every partial scorer on one hypothesis for a subset of tokens.

			    Args:
			        hyp (Hypothesis): Hypothesis whose prefix tokens are scored
			        ids (torch.Tensor): 1D tensor of candidate token ids to score
			        x (torch.Tensor): Corresponding input feature

			    Returns:
			        Tuple[Dict[str, torch.Tensor], Dict[str, Any]]: Two dicts keyed by
			            the names in `self.part_scorers`: the scores and the new states
			            returned by each scorer's `score_partial` call.

			    """
			    scores, states = {}, {}
			    for name, scorer in self.part_scorers.items():
			        token_scores, next_state = scorer.score_partial(
			            hyp.yseq, ids, hyp.states[name], x
			        )
			        scores[name] = token_scores
			        states[name] = next_state
			    return scores, states

			def beam(
			self, weighted_scores: torch.Tensor, ids: torch.Tensor
			) -> Tuple[torch.Tensor, torch.Tensor]:
			"""Compute topk full token ids and partial token ids.

			Args:
			weighted_scores (torch.Tensor): The weighted sum scores for each tokens.
			Its shape is `(self.n_vocab,)`.
			ids (torch.Tensor): The partial token ids to compute topk

			Returns:
			Tuple[torch.Tensor, torch.Tensor]:
			The topk full token ids and partial token ids.
			Their shapes are `(self.beam_size,)`

			"""
			# no pre beam performed
			if weighted_scores.size(0) == ids.size(0):
			top_ids = weighted_scores.topk(self.beam_size)[1]
			return top_ids, top_ids

			# mask pruned in pre-beam not to select in topk
			tmp = weighted_scores[ids]
			weighted_scores[:] = -float("inf")
			weighted_scores[ids] = tmp
			top_ids = weighted_scores.topk(self.beam_size)[1]
			local_ids = weighted_scores[ids].topk(self.beam_size)[1]
			return top_ids, local_ids

			@staticmethod
			def merge_scores(
			prev_scores: Dict[str, float],
			next_full_scores: Dict[str, torch.Tensor],
			full_idx: int,
			next_part_scores: Dict[str, torch.Tensor],
			part_idx: int,
			) -> Dict[str, torch.Tensor]:
			"""Merge scores for new hypothesis.

			Args:
			prev_scores (Dict[str, float]):
			The previous hypothesis scores by `self.scorers`
			next_full_scores (Dict[str, torch.Tensor]): scores by `self.full_scorers`
			full_idx (int): The next token id for `next_full_scores`
			next_part_scores (Dict[str, torch.Tensor]):
			scores of partial tokens by `self.part_scorers`
			part_idx (int): The new token id for `next_part_scores`

			Returns:
			Dict[str, torch.Tensor]: The new score dict.
			Its keys are names of `self.full_scorers` and `self.part_scorers`.
			Its values are scalar tensors by the scorers.

			"""
			new_scores = dict()
			for k, v in next_full_scores.items():
			new_scores[k] = prev_scores[k] + v[full_idx]
			for k, v in next_part_scores.items():
			new_scores[k] = prev_scores[k] + v[part_idx]
			return new_scores

			def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
			"""Merge states for new hypothesis.

			Args:
			states: states of `self.full_scorers`
			part_states: states of `self.part_scorers`
			part_idx (int): The new token id for `part_scores`

			Returns:
			Dict[str, torch.Tensor]: The new score dict.
			Its keys are names of `self.full_scorers` and `self.part_scorers`.
			Its values are states of the scorers.

			"""
			new_states = dict()
			for k, v in states.items():
			new_states[k] = v
			for k, d in self.part_scorers.items():
			new_states[k] = d.select_state(part_states[k], part_idx)
			return new_states

			def search(
			    self, running_hyps: List[Hypothesis], x: torch.Tensor, am_score: torch.Tensor
			) -> List[Hypothesis]:
			    """Extend every running hypothesis by one token and keep the best ones.

			    Args:
			        running_hyps (List[Hypothesis]): Running hypotheses on beam
			        x (torch.Tensor): Encoded speech feature (T, D)
			        am_score (torch.Tensor): Scores added to the per-token weighted
			            scores for this step

			    Returns:
			        List[Hypotheses]: Extended hypotheses sorted by score, at most
			            `beam_size` of them

			    """
			    best_hyps = []
			    part_ids = torch.arange(self.n_vocab, device=x.device)  # no pre-beam
			    for hyp in running_hyps:
			        # start from the step's am_score, then add the weighted full scores
			        weighted_scores = torch.zeros(self.n_vocab, dtype=x.dtype, device=x.device)
			        weighted_scores += am_score
			        scores, states = self.score_full(hyp, x)
			        for name in self.full_scorers:
			            weighted_scores += self.weights[name] * scores[name]

			        # optionally narrow the candidates before partial scoring
			        if self.do_pre_beam:
			            if self.pre_beam_score_key == "full":
			                pre_beam_scores = weighted_scores
			            else:
			                pre_beam_scores = scores[self.pre_beam_score_key]
			            part_ids = torch.topk(pre_beam_scores, self.pre_beam_size)[1]
			        part_scores, part_states = self.score_partial(hyp, part_ids, x)
			        for name in self.part_scorers:
			            weighted_scores[part_ids] += self.weights[name] * part_scores[name]

			        # add previous hyp score
			        weighted_scores += hyp.score

			        # one new hypothesis per selected token (2 x beam at most in total)
			        full_ids, local_ids = self.beam(weighted_scores, part_ids)
			        for j, part_j in zip(full_ids, local_ids):
			            extended = Hypothesis(
			                score=weighted_scores[j],
			                yseq=self.append_token(hyp.yseq, j),
			                scores=self.merge_scores(
			                    hyp.scores, scores, j, part_scores, part_j
			                ),
			                states=self.merge_states(states, part_states, part_j),
			            )
			            best_hyps.append(extended)

			        # sort and prune 2 x beam -> beam
			        ranked = sorted(best_hyps, key=lambda h: h.score, reverse=True)
			        best_hyps = ranked[: min(len(ranked), self.beam_size)]
			    return best_hyps

			def forward(
			    self, x: torch.Tensor, am_scores: torch.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
			) -> List[Hypothesis]:
			    """Perform beam search.

			    Args:
			        x (torch.Tensor): Encoded speech feature (T, D)
			        am_scores (torch.Tensor): Per-step scores. Its first dimension sets
			            the number of decoding steps, and `am_scores[i]` is passed to
			            `search` at step `i`.
			        maxlenratio (float): When 0.0 (default), the end-detect function
			            is used to stop decoding early. Also forwarded to
			            `post_process`.
			        minlenratio (float): Only controls the retry when no hypothesis
			            ended: below 0.1 an empty list is returned, otherwise the
			            search is repeated with `minlenratio - 0.1`.

			    Returns:
			        list[Hypothesis]: N-best decoding results

			    """
			    # set length bounds
			    maxlen = am_scores.shape[0]
			    logging.info("decoder input length: " + str(x.shape[0]))
			    logging.info("max output length: " + str(maxlen))

			    # main loop of prefix search
			    running_hyps = self.init_hyp(x)
			    ended_hyps = []
			    for i in range(maxlen):
			        logging.debug("position " + str(i))
			        best = self.search(running_hyps, x, am_scores[i])
			        # post process of one iteration
			        running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)
			        # end detection
			        if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
			            logging.info(f"end detected at {i}")
			            break
			        if len(running_hyps) == 0:
			            logging.info("no hypothesis. Finish decoding.")
			            break
			        else:
			            logging.debug(f"remained hypotheses: {len(running_hyps)}")

			    nbest_hyps = sorted(ended_hyps, key=lambda h: h.score, reverse=True)
			    # check the number of hypotheses reaching to eos
			    if len(nbest_hyps) == 0:
			        logging.warning(
			            "there is no N-best results, perform recognition "
			            "again with smaller minlenratio."
			        )
			        if minlenratio < 0.1:
			            return []
			        # The retry must pass am_scores too; the old call dropped it, so
			        # maxlenratio landed in the am_scores slot and `.shape` failed.
			        return self.forward(
			            x,
			            am_scores,
			            maxlenratio=maxlenratio,
			            minlenratio=max(0.0, minlenratio - 0.1),
			        )

			    # report the best result
			    best = nbest_hyps[0]
			    for k, v in best.scores.items():
			        logging.info(
			            f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
			        )
			    logging.info(f"total log probability: {best.score:.2f}")
			    logging.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
			    logging.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
			    if self.token_list is not None:
			        logging.info(
			            "best hypo: "
			            + "".join([self.token_list[x.item()] for x in best.yseq[1:-1]])
			            + "\n"
			        )
			    return nbest_hyps

			def post_process(
			    self,
			    i: int,
			    maxlen: int,
			    maxlenratio: float,
			    running_hyps: List[Hypothesis],
			    ended_hyps: List[Hypothesis],
			) -> List[Hypothesis]:
			    """Split the hypotheses of one step into ended and still-running ones.

			    Args:
			        i (int): Index of the current decoding step.
			        maxlen (int): Total number of decoding steps.
			        maxlenratio (int): The maximum length ratio in beam search
			            (not read by this method).
			        running_hyps (List[Hypothesis]): Hypotheses produced by this step.
			        ended_hyps (List[Hypothesis]): Output list; hypotheses ending in
			            `eos` are appended to it with their final scores added.

			    Returns:
			        List[Hypothesis]: The hypotheses that have not ended yet.

			    """
			    logging.debug(f"the number of running hypotheses: {len(running_hyps)}")
			    if self.token_list is not None:
			        logging.debug(
			            "best hypo: "
			            + "".join([self.token_list[x.item()] for x in running_hyps[0].yseq[1:]])
			        )

			    # Force <eos> on the last step so that at least some hypotheses end.
			    is_last_step = i == maxlen - 1
			    if is_last_step:
			        logging.info("adding <eos> in the last position in the loop")
			        running_hyps = [
			            h._replace(yseq=self.append_token(h.yseq, self.eos))
			            for h in running_hyps
			        ]

			    # Ended hypotheses move to `ended_hyps`; the rest keep running
			    # (so the number of running hypotheses can drop below the beam size).
			    remained_hyps = []
			    for hyp in running_hyps:
			        if hyp.yseq[-1] != self.eos:
			            remained_hyps.append(hyp)
			            continue
			        # e.g., Word LM needs to add final <eos> score
			        for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
			            s = d.final_score(hyp.states[k])
			            hyp.scores[k] += s
			            hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
			        ended_hyps.append(hyp)
			    return remained_hyps

			@@ -66,7 +66,9 @@
			return text_ints

			def decode(self, text_ints):
			return self.ids2tokens(text_ints)
			token = self.ids2tokens(text_ints)
			text = self.tokens2text(token)
			return text

			def get_num_vocabulary_size(self) -> int:
			    """Return the number of entries in ``self.token_list``."""
			    vocabulary = self.token_list
			    return len(vocabulary)

			@@ -11,10 +11,10 @@
			return kwargs

			def download_fr_ms(**kwargs):
			model_or_path = kwargs.get("model_pretrain")
			model_revision = kwargs.get("model_pretrain_revision")
			model_or_path = kwargs.get("model")
			model_revision = kwargs.get("model_revision")
			if not os.path.exists(model_or_path):
			model_or_path = get_or_download_model_dir(model_or_path, model_revision, third_party="funasr")
			model_or_path = get_or_download_model_dir(model_or_path, model_revision, is_training=kwargs.get("is_training"))

			config = os.path.join(model_or_path, "config.yaml")
			assert os.path.exists(config), "{} is not exist!".format(config)
			@@ -23,25 +23,29 @@
			init_param = os.path.join(model_or_path, "model.pb")
			kwargs["init_param"] = init_param
			kwargs["token_list"] = os.path.join(model_or_path, "tokens.txt")
			kwargs["model"] = cfg["model"]
			kwargs["frontend_conf"]["cmvn_file"] = os.path.join(model_or_path, "am.mvn")

			return kwargs

			def get_or_download_model_dir(
			model,
			model_revision=None,
			third_party=None):
			is_training=False,
			):
			""" Get local model directory or download model if necessary.

			Args:
			model (str): model id or path to local model directory.
			model_revision (str, optional): model version number.
			third_party (str, optional): in which third party library
			this function is called.
			:param is_training:
			"""
			from modelscope.hub.check_model import check_local_model_is_latest
			from modelscope.hub.snapshot_download import snapshot_download

			from modelscope.utils.constant import Invoke, ThirdParty

			key = Invoke.LOCAL_TRAINER if is_training else Invoke.PIPELINE

			if os.path.exists(model):
			model_cache_dir = model if os.path.isdir(
			@@ -49,15 +53,15 @@
			check_local_model_is_latest(
			model_cache_dir,
			user_agent={
			Invoke.KEY: Invoke.LOCAL_TRAINER,
			ThirdParty.KEY: third_party
			Invoke.KEY: key,
			ThirdParty.KEY: "funasr"
			})
			else:
			model_cache_dir = snapshot_download(
			model,
			revision=model_revision,
			user_agent={
			Invoke.KEY: Invoke.TRAINER,
			ThirdParty.KEY: third_party
			Invoke.KEY: key,
			ThirdParty.KEY: "funasr"
			})
			return model_cache_dir