python/FunASR-XL.git

			@@ -1,17 +1,171 @@
			#!/usr/bin/env python3
			# -*- encoding: utf-8 -*-
			# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
			# MIT License (https://opensource.org/licenses/MIT)


			import argparse
			import logging
			import os
			import sys
			from typing import Union, Dict, Any
			from typing import Optional
			from typing import Union

			import numpy as np
			import torch
			from typeguard import check_argument_types

			from funasr.bin.tp_infer import Speech2Timestamp
			from funasr.build_utils.build_streaming_iterator import build_streaming_iterator
			from funasr.datasets.preprocessor import LMPreprocessor
			from funasr.fileio.datadir_writer import DatadirWriter
			from funasr.torch_utils.set_all_random_seed import set_all_random_seed
			from funasr.utils import config_argparse
			from funasr.utils.cli_utils import get_commandline_args
			from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
			from funasr.utils.types import str2bool
			from funasr.utils.types import str2triple_str
			from funasr.utils.types import str_or_none


			def inference_tp(
			    batch_size: int,
			    ngpu: int,
			    log_level: Union[int, str],
			    timestamp_infer_config: Optional[str],
			    timestamp_model_file: Optional[str],
			    timestamp_cmvn_file: Optional[str] = None,
			    key_file: Optional[str] = None,
			    allow_variable_data_keys: bool = False,
			    output_dir: Optional[str] = None,
			    dtype: str = "float32",
			    seed: int = 0,
			    num_workers: int = 1,
			    split_with_space: bool = True,
			    seg_dict_file: Optional[str] = None,
			    **kwargs,
			):
			    """Build a timestamp-prediction pipeline and return it as a callable.

			    The model (``Speech2Timestamp``) and the text preprocessor
			    (``LMPreprocessor``) are created once here; the returned ``_forward``
			    closure reuses them for every call.

			    Args:
			        batch_size: Decoding batch size. Only ``1`` is supported.
			        ngpu: Number of GPUs. ``0`` selects the CPU, ``1`` selects CUDA when
			            it is available (otherwise the CPU). More than one is rejected.
			        log_level: Level passed to ``logging.basicConfig``.
			        timestamp_infer_config: Forwarded to ``Speech2Timestamp``.
			        timestamp_model_file: Forwarded to ``Speech2Timestamp``.
			        timestamp_cmvn_file: Forwarded to ``Speech2Timestamp``.
			        key_file: Forwarded to ``build_streaming_iterator``.
			        allow_variable_data_keys: Accepted for interface compatibility; not
			            used by this function.
			        output_dir: Default directory for written results. A call to the
			            returned pipeline may override it with ``output_dir_v2``.
			        dtype: Forwarded to ``Speech2Timestamp`` and to the data iterator.
			        seed: Seed passed to ``set_all_random_seed``.
			        num_workers: Forwarded to ``build_streaming_iterator``.
			        split_with_space: Forwarded to ``LMPreprocessor``.
			        seg_dict_file: Forwarded to ``LMPreprocessor``.
			        **kwargs: Only ``ncpu`` (default ``1``) is read; it sets the number
			            of torch threads.

			    Returns:
			        The ``_forward`` callable. Calling it runs inference and returns a
			        list of dicts with the keys ``key``, ``value`` and ``timestamp``.

			    Raises:
			        NotImplementedError: If ``batch_size > 1`` or ``ngpu > 1``.
			    """
			    assert check_argument_types()
			    ncpu = kwargs.get("ncpu", 1)
			    torch.set_num_threads(ncpu)

			    if batch_size > 1:
			        raise NotImplementedError("batch decoding is not implemented")
			    if ngpu > 1:
			        raise NotImplementedError("only single GPU decoding is supported")

			    logging.basicConfig(
			        level=log_level,
			        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
			    )

			    device = "cuda" if ngpu >= 1 and torch.cuda.is_available() else "cpu"

			    # 1. Set random-seed
			    set_all_random_seed(seed)

			    # 2. Build the timestamp-prediction model
			    speechtext2timestamp_kwargs = dict(
			        timestamp_infer_config=timestamp_infer_config,
			        timestamp_model_file=timestamp_model_file,
			        timestamp_cmvn_file=timestamp_cmvn_file,
			        device=device,
			        dtype=dtype,
			    )
			    logging.info("speechtext2timestamp_kwargs: {}".format(speechtext2timestamp_kwargs))
			    speechtext2timestamp = Speech2Timestamp(**speechtext2timestamp_kwargs)

			    # The text preprocessor reuses the token settings stored with the model.
			    preprocessor = LMPreprocessor(
			        train=False,
			        token_type=speechtext2timestamp.tp_train_args.token_type,
			        token_list=speechtext2timestamp.tp_train_args.token_list,
			        bpemodel=None,
			        text_cleaner=None,
			        g2p_type=None,
			        text_name="text",
			        non_linguistic_symbols=speechtext2timestamp.tp_train_args.non_linguistic_symbols,
			        split_with_space=split_with_space,
			        seg_dict_file=seg_dict_file,
			    )

			    # No writer is built here: the output location can change per call
			    # (``output_dir_v2``), so ``_forward`` creates its own writer.

			    def _forward(
			        data_path_and_name_and_type,
			        raw_inputs: Union[np.ndarray, torch.Tensor] = None,
			        output_dir_v2: Optional[str] = None,
			        fs: Optional[dict] = None,
			        param_dict: Optional[dict] = None,
			        **kwargs
			    ):
			        """Run timestamp prediction and return one result dict per utterance.

			        Args:
			            data_path_and_name_and_type: Data description passed to
			                ``build_streaming_iterator``. When ``None``, ``raw_inputs``
			                is used instead.
			            raw_inputs: In-memory input, used only when
			                ``data_path_and_name_and_type`` is ``None``. A torch tensor
			                is converted to a numpy array first.
			            output_dir_v2: Overrides the ``output_dir`` given to
			                ``inference_tp`` for this call.
			            fs: Accepted for interface compatibility; not used here.
			            param_dict: Accepted for interface compatibility; not used here.
			            **kwargs: Ignored.

			        Returns:
			            A list of ``{"key", "value", "timestamp"}`` dicts.
			        """
			        output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
			        tp_writer = None
			        if output_path is not None:
			            tp_writer = DatadirWriter(output_path)["timestamp_prediction"]

			        # 3. Build data-iterator
			        if data_path_and_name_and_type is None and raw_inputs is not None:
			            if isinstance(raw_inputs, torch.Tensor):
			                # A bare .numpy() raises for tensors that live on a GPU or
			                # require grad; detach() + cpu() makes the conversion safe.
			                raw_inputs = raw_inputs.detach().cpu().numpy()
			            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]

			        loader = build_streaming_iterator(
			            task_name="asr",
			            preprocess_args=speechtext2timestamp.tp_train_args,
			            data_path_and_name_and_type=data_path_and_name_and_type,
			            dtype=dtype,
			            batch_size=batch_size,
			            key_file=key_file,
			            num_workers=num_workers,
			            preprocess_fn=preprocessor,
			        )

			        tp_result_list = []
			        for keys, batch in loader:
			            assert isinstance(batch, dict), type(batch)
			            assert all(isinstance(s, str) for s in keys), keys
			            _bs = len(next(iter(batch.values())))
			            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

			            logging.info("timestamp predicting, utt_id: {}".format(keys))
			            # Only these three entries are handed to the model; the token ids
			            # in batch["text"] are used below to recover the token strings.
			            _batch = {
			                "speech": batch["speech"],
			                "speech_lengths": batch["speech_lengths"],
			                "text_lengths": batch["text_lengths"],
			            }
			            us_alphas, us_cif_peak = speechtext2timestamp(**_batch)

			            for batch_id in range(_bs):
			                key = keys[batch_id]
			                token = speechtext2timestamp.converter.ids2tokens(batch["text"][batch_id])
			                ts_str, ts_list = ts_prediction_lfr6_standard(
			                    us_alphas[batch_id],
			                    us_cif_peak[batch_id],
			                    token,
			                    force_time_shift=-3.0,
			                )
			                # NOTE(review): the result is logged at WARNING level, as in
			                # the original code; confirm whether INFO was intended.
			                logging.warning(ts_str)
			                item = {"key": key, "value": ts_str, "timestamp": ts_list}
			                if tp_writer is not None:
			                    tp_writer["tp_sync"][key + "#"] = ts_str
			                    tp_writer["tp_time"][key + "#"] = str(ts_list)
			                tp_result_list.append(item)
			        return tp_result_list

			    return _forward


			def inference_launch(mode, **kwargs):
			    """Return the inference pipeline for the given decoding ``mode``.

			    Args:
			        mode: Decoding mode name. Only ``"tp_norm"`` is recognised.
			        **kwargs: Passed unchanged to ``inference_tp``.

			    Returns:
			        Whatever ``inference_tp`` returns for ``"tp_norm"``; ``None`` for any
			        other mode (the unknown mode is logged, not raised).
			    """
			    if mode != "tp_norm":
			        logging.info("Unknown decoding mode: {}".format(mode))
			        return None
			    return inference_tp(**kwargs)


			def get_parser():
			@@ -100,14 +254,6 @@
			return parser


			def inference_launch(mode, **kwargs):
			    """Return the inference pipeline for the given decoding ``mode``.

			    Args:
			        mode: Decoding mode name. Only ``"tp_norm"`` is recognised.
			        **kwargs: Passed unchanged to ``inference_modelscope``.

			    Returns:
			        Whatever ``inference_modelscope`` returns for ``"tp_norm"``; ``None``
			        for any other mode (the unknown mode is logged, not raised).
			    """
			    # NOTE(review): a function with this same name appears earlier in this
			    # file; if both are really present, this later one replaces it - confirm
			    # which definition is intended.
			    if mode != "tp_norm":
			        logging.info("Unknown decoding mode: {}".format(mode))
			        return None
			    # Imported lazily, only when the "tp_norm" mode is actually requested.
			    from funasr.bin.tp_inference import inference_modelscope
			    return inference_modelscope(**kwargs)

			def main(cmd=None):
			print(get_commandline_args(), file=sys.stderr)
			parser = get_parser()
			@@ -135,7 +281,8 @@
			os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
			os.environ["CUDA_VISIBLE_DEVICES"] = gpuid

			inference_launch(**kwargs)
			inference_pipeline = inference_launch(**kwargs)
			return inference_pipeline(kwargs["data_path_and_name_and_type"])


			if __name__ == "__main__":