python/FunASR-XL.git

			@@ -1,66 +1,48 @@
			# -*- encoding: utf-8 -*-
			#!/usr/bin/env python3
			# -*- encoding: utf-8 -*-
			# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
			# MIT License (https://opensource.org/licenses/MIT)

			import argparse
			import logging
			import sys
			import time

			import codecs
			import copy
			import logging
			import os
			import re
			import codecs
			import tempfile
			import requests
			from pathlib import Path
			from typing import Any
			from typing import Dict
			from typing import List
			from typing import Optional
			from typing import Sequence
			from typing import Tuple
			from typing import Union
			from typing import Dict
			from typing import Any
			from typing import List

			import numpy as np
			import requests
			import torch
			from packaging.version import parse as V
			from typeguard import check_argument_types
			from typeguard import check_return_type
			from funasr.fileio.datadir_writer import DatadirWriter

			from funasr.models.e2e_asr_contextual_paraformer import NeatContextualParaformer
			from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
			from funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline
			from funasr.modules.beam_search.beam_search import BeamSearch
			# from funasr.modules.beam_search.beam_search import BeamSearchPara as BeamSearch
			from funasr.modules.beam_search.beam_search import Hypothesis
			from funasr.modules.beam_search.beam_search_sa_asr import Hypothesis as HypothesisSAASR
			from funasr.modules.beam_search.beam_search_transducer import BeamSearchTransducer
			from funasr.modules.beam_search.beam_search_transducer import Hypothesis as HypothesisTransducer
			from funasr.modules.beam_search.beam_search_sa_asr import Hypothesis as HypothesisSAASR
			from funasr.modules.scorers.ctc import CTCPrefixScorer
			from funasr.modules.scorers.length_bonus import LengthBonus
			from funasr.modules.subsampling import TooShortUttError
			from funasr.tasks.asr import ASRTask
			from funasr.tasks.asr import frontend_choices
			from funasr.tasks.lm import LMTask
			from funasr.text.build_tokenizer import build_tokenizer
			from funasr.text.token_id_converter import TokenIDConverter
			from funasr.torch_utils.device_funcs import to_device
			from funasr.torch_utils.set_all_random_seed import set_all_random_seed
			from funasr.utils import config_argparse
			from funasr.utils.cli_utils import get_commandline_args
			from funasr.utils.types import str2bool
			from funasr.utils.types import str2triple_str
			from funasr.utils.types import str_or_none
			from funasr.utils import asr_utils, wav_utils, postprocess_utils
			from funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline
			from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
			from funasr.models.e2e_asr_contextual_paraformer import NeatContextualParaformer
			from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
			from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
			from funasr.bin.tp_infer import Speech2Timestamp
			from funasr.bin.vad_infer import Speech2VadSegment
			from funasr.bin.punc_infer import Text2Punc
			from funasr.utils.vad_utils import slice_padding_fbank
			from funasr.tasks.vad import VADTask
			from funasr.utils.timestamp_tools import time_stamp_sentence, ts_prediction_lfr6_standard
			from funasr.tasks.asr import frontend_choices


			class Speech2Text:
			"""Speech2Text class
			@@ -272,6 +254,7 @@
			assert check_return_type(results)
			return results


			class Speech2TextParaformer:
			"""Speech2Text class

			@@ -466,13 +449,16 @@
			pre_token_length = pre_token_length.round().long()
			if torch.max(pre_token_length) < 1:
			return []
			if not isinstance(self.asr_model, ContextualParaformer) and not isinstance(self.asr_model, NeatContextualParaformer):
			if not isinstance(self.asr_model, ContextualParaformer) and not isinstance(self.asr_model,
			NeatContextualParaformer):
			if self.hotword_list:
			logging.warning("Hotword is given but asr model is not a ContextualParaformer.")
			decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
			decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds,
			pre_token_length)
			decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
			else:
			decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list)
			decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds,
			pre_token_length, hw_list=self.hotword_list)
			decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]

			if isinstance(self.asr_model, BiCifParaformer):
			@@ -533,7 +519,6 @@
			vad_offset=begin_time)
			results.append((text, token, token_int, hyp, timestamp, enc_len_batch_total, lfr_factor))


			# assert check_return_type(results)
			return results

			@@ -590,6 +575,7 @@
			else:
			hotword_list = None
			return hotword_list


			class Speech2TextParaformerOnline:
			"""Speech2Text class
			@@ -845,6 +831,7 @@
			# assert check_return_type(results)
			return results


			class Speech2TextUniASR:
			"""Speech2Text class