python/FunASR-XL.git

			@@ -1,27 +1,33 @@
			"""
			Author: Speech Lab, Alibaba Group, China
			"""

			import argparse
			import logging
			import os
			from pathlib import Path
			from typing import Callable
			from typing import Collection
			from typing import Dict
			from typing import List
			from typing import Optional
			from typing import Tuple
			from typing import Union

			import numpy as np
			import torch
			from typeguard import check_argument_types
			from typeguard import check_return_type
			import yaml

			from funasr.datasets.collate_fn import CommonCollateFn
			from funasr.datasets.preprocessor import CommonPreprocessor
			from funasr.layers.abs_normalize import AbsNormalize
			from funasr.layers.global_mvn import GlobalMVN
			from funasr.layers.utterance_mvn import UtteranceMVN
			from funasr.models.e2e_asr import ESPnetASRModel
			from funasr.models.e2e_asr import ASRModel
			from funasr.models.decoder.abs_decoder import AbsDecoder
			from funasr.models.encoder.abs_encoder import AbsEncoder
			from funasr.models.encoder.rnn_encoder import RNNEncoder
			from funasr.models.encoder.resnet34_encoder import ResNet34
			from funasr.models.encoder.resnet34_encoder import ResNet34, ResNet34_SP_L2Reg
			from funasr.models.pooling.statistic_pooling import StatisticPooling
			from funasr.models.decoder.sv_decoder import DenseDecoder
			from funasr.models.e2e_sv import ESPnetSVModel
			@@ -41,7 +47,7 @@
			from funasr.models.specaug.specaug import SpecAug
			from funasr.tasks.abs_task import AbsTask
			from funasr.torch_utils.initialize import initialize
			from funasr.train.abs_espnet_model import AbsESPnetModel
			from funasr.models.base_model import FunASRModel
			from funasr.train.class_choices import ClassChoices
			from funasr.train.trainer import Trainer
			from funasr.utils.types import float_or_none
			@@ -86,7 +92,7 @@
			classes=dict(
			espnet=ESPnetSVModel,
			),
			type_check=AbsESPnetModel,
			type_check=FunASRModel,
			default="espnet",
			)
			preencoder_choices = ClassChoices(
			@@ -103,6 +109,7 @@
			"encoder",
			classes=dict(
			resnet34=ResNet34,
			resnet34_sp_l2reg=ResNet34_SP_L2Reg,
			rnn=RNNEncoder,
			),
			type_check=AbsEncoder,
			@@ -264,7 +271,6 @@
			[Collection[Tuple[str, Dict[str, np.ndarray]]]],
			Tuple[List[str], Dict[str, torch.Tensor]],
			]:
			assert check_argument_types()
			# NOTE(kamo): int value = 0 is reserved by CTC-blank symbol
			return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1)

			@@ -272,7 +278,6 @@
			def build_preprocess_fn(
			cls, args: argparse.Namespace, train: bool
			) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
			assert check_argument_types()
			if args.use_preprocessor:
			retval = CommonPreprocessor(
			train=train,
			@@ -300,7 +305,6 @@
			)
			else:
			retval = None
			assert check_return_type(retval)
			return retval

			@classmethod
			@@ -321,12 +325,10 @@
			retval = ()
			if inference:
			retval = ("ref_speech",)
			assert check_return_type(retval)
			return retval

			@classmethod
			def build_model(cls, args: argparse.Namespace) -> ESPnetSVModel:
			assert check_argument_types()
			if isinstance(args.token_list, str):
			with open(args.token_list, encoding="utf-8") as f:
			token_list = [line.rstrip() for line in f]
			@@ -394,9 +396,16 @@

			# 7. Pooling layer
			pooling_class = pooling_choices.get_class(args.pooling_type)
			pooling_dim = (2, 3)
			eps = 1e-12
			if hasattr(args, "pooling_type_conf"):
			if "pooling_dim" in args.pooling_type_conf:
			pooling_dim = args.pooling_type_conf["pooling_dim"]
			if "eps" in args.pooling_type_conf:
			eps = args.pooling_type_conf["eps"]
			pooling_layer = pooling_class(
			pooling_dim=(2, 3),
			eps=1e-12,
			pooling_dim=pooling_dim,
			eps=eps,
			)
			if args.pooling_type == "statistic":
			encoder_output_size *= 2
			@@ -433,5 +442,95 @@
			if args.init is not None:
			initialize(model, args.init)

			assert check_return_type(model)
			return model

			# ~~~~~~~~~ The methods below are mainly used for inference ~~~~~~~~~
			@classmethod
			def build_model_from_file(
			cls,
			config_file: Union[Path, str] = None,
			model_file: Union[Path, str] = None,
			cmvn_file: Union[Path, str] = None,
			device: str = "cpu",
			):
			"""Build model from the files.

			This method is used for inference or fine-tuning.

			Args:
			config_file: The yaml file saved when training.
			model_file: The model file saved when training.
			cmvn_file: The cmvn file for front-end
			device: Device type, "cpu", "cuda", or "cuda:N".

			"""
			if config_file is None:
			assert model_file is not None, (
			"The argument 'model_file' must be provided "
			"if the argument 'config_file' is not specified."
			)
			config_file = Path(model_file).parent / "config.yaml"
			else:
			config_file = Path(config_file)

			with config_file.open("r", encoding="utf-8") as f:
			args = yaml.safe_load(f)
			if cmvn_file is not None:
			args["cmvn_file"] = cmvn_file
			args = argparse.Namespace(**args)
			model = cls.build_model(args)
			if not isinstance(model, FunASRModel):
			raise RuntimeError(
			f"model must inherit {FunASRModel.__name__}, but got {type(model)}"
			)
			model.to(device)
			model_dict = dict()
			model_name_pth = None
			if model_file is not None:
			logging.info("model_file is {}".format(model_file))
			if device == "cuda":
			device = f"cuda:{torch.cuda.current_device()}"
			model_dir = os.path.dirname(model_file)
			model_name = os.path.basename(model_file)
			if "model.ckpt-" in model_name or ".bin" in model_name:
			if ".bin" in model_name:
			model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
			else:
			model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
			if os.path.exists(model_name_pth):
			logging.info("model_file is load from pth: {}".format(model_name_pth))
			model_dict = torch.load(model_name_pth, map_location=device)
			else:
			model_dict = cls.convert_tf2torch(model, model_file)
			model.load_state_dict(model_dict)
			else:
			model_dict = torch.load(model_file, map_location=device)
			model.load_state_dict(model_dict)
			if model_name_pth is not None and not os.path.exists(model_name_pth):
			torch.save(model_dict, model_name_pth)
			logging.info("model_file is saved to pth: {}".format(model_name_pth))

			return model, args

			@classmethod
			def convert_tf2torch(
			cls,
			model,
			ckpt,
			):
			logging.info("start convert tf model to torch model")
			from funasr.modules.streaming_utils.load_fr_tf import load_tf_dict
			var_dict_tf = load_tf_dict(ckpt)
			var_dict_torch = model.state_dict()
			var_dict_torch_update = dict()
			# speech encoder
			var_dict_torch_update_local = model.encoder.convert_tf2torch(var_dict_tf, var_dict_torch)
			var_dict_torch_update.update(var_dict_torch_update_local)
			# pooling layer
			var_dict_torch_update_local = model.pooling_layer.convert_tf2torch(var_dict_tf, var_dict_torch)
			var_dict_torch_update.update(var_dict_torch_update_local)
			# decoder
			var_dict_torch_update_local = model.decoder.convert_tf2torch(var_dict_tf, var_dict_torch)
			var_dict_torch_update.update(var_dict_torch_update_local)

			return var_dict_torch_update