python/FunASR-XL.git

			@@ -1,3 +1,11 @@
			"""
			Author: Speech Lab, Alibaba Group, China
			SOND: Speaker Overlap-aware Neural Diarization for Multi-party Meeting Analysis
			https://arxiv.org/abs/2211.10243
			TOLD: A Novel Two-Stage Overlap-Aware Framework for Speaker Diarization
			https://arxiv.org/abs/2303.05397
			"""

			import argparse
			import logging
			import os
			@@ -507,7 +515,7 @@
			config_file: Union[Path, str] = None,
			model_file: Union[Path, str] = None,
			cmvn_file: Union[Path, str] = None,
			device: str = "cpu",
			device: Union[str, torch.device] = "cpu",
			):
			"""Build model from the files.

			@@ -553,7 +561,7 @@
			if ".bin" in model_name:
			model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
			else:
			model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
			model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
			if os.path.exists(model_name_pth):
			logging.info("model_file is load from pth: {}".format(model_name_pth))
			model_dict = torch.load(model_name_pth, map_location=device)
			@@ -562,12 +570,27 @@
			model.load_state_dict(model_dict)
			else:
			model_dict = torch.load(model_file, map_location=device)
			model_dict = cls.fileter_model_dict(model_dict, model.state_dict())
			model.load_state_dict(model_dict)
			if model_name_pth is not None and not os.path.exists(model_name_pth):
			torch.save(model_dict, model_name_pth)
			logging.info("model_file is saved to pth: {}".format(model_name_pth))

			return model, args

			@classmethod
			def fileter_model_dict(cls, src_dict: dict, dest_dict: dict):
			    """Keep only the source entries whose keys also exist in the destination.

			    Args:
			        src_dict: Mapping to filter (e.g. a loaded checkpoint).
			        dest_dict: Mapping whose keys decide which entries survive.

			    Returns:
			        An ``OrderedDict`` with the ``src_dict`` items whose keys are in
			        ``dest_dict``, in ``src_dict`` iteration order.

			    Keys found only in ``src_dict`` are logged at INFO level; keys found
			    only in ``dest_dict`` are logged at WARNING level.
			    """
			    from collections import OrderedDict

			    kept = OrderedDict(
			        (name, entry) for name, entry in src_dict.items() if name in dest_dict
			    )
			    # Report source entries that have no counterpart in the destination.
			    for name in src_dict:
			        if name not in kept:
			            logging.info("{} is no longer needed in this model.".format(name))
			    # Report destination entries that the source did not provide.
			    for name in dest_dict:
			        if name not in kept:
			            logging.warning("{} is missed in checkpoint.".format(name))
			    return kept

			@classmethod
			def convert_tf2torch(
			@@ -750,47 +773,47 @@
			cls, args: argparse.Namespace, train: bool
			) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
			assert check_argument_types()
			if args.use_preprocessor:
			retval = CommonPreprocessor(
			train=train,
			token_type=args.token_type,
			token_list=args.token_list,
			bpemodel=None,
			non_linguistic_symbols=None,
			text_cleaner=None,
			g2p_type=None,
			split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
			seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
			# NOTE(kamo): Check attribute existence for backward compatibility
			rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
			rir_apply_prob=args.rir_apply_prob
			if hasattr(args, "rir_apply_prob")
			else 1.0,
			noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
			noise_apply_prob=args.noise_apply_prob
			if hasattr(args, "noise_apply_prob")
			else 1.0,
			noise_db_range=args.noise_db_range
			if hasattr(args, "noise_db_range")
			else "13_15",
			speech_volume_normalize=args.speech_volume_normalize
			if hasattr(args, "rir_scp")
			else None,
			)
			else:
			retval = None
			assert check_return_type(retval)
			return retval
			# if args.use_preprocessor:
			# retval = CommonPreprocessor(
			# train=train,
			# token_type=args.token_type,
			# token_list=args.token_list,
			# bpemodel=None,
			# non_linguistic_symbols=None,
			# text_cleaner=None,
			# g2p_type=None,
			# split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
			# seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
			# # NOTE(kamo): Check attribute existence for backward compatibility
			# rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
			# rir_apply_prob=args.rir_apply_prob
			# if hasattr(args, "rir_apply_prob")
			# else 1.0,
			# noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
			# noise_apply_prob=args.noise_apply_prob
			# if hasattr(args, "noise_apply_prob")
			# else 1.0,
			# noise_db_range=args.noise_db_range
			# if hasattr(args, "noise_db_range")
			# else "13_15",
			# speech_volume_normalize=args.speech_volume_normalize
			# if hasattr(args, "rir_scp")
			# else None,
			# )
			# else:
			# retval = None
			# assert check_return_type(retval)
			return None

			@classmethod
			def required_data_names(
			    cls, train: bool = True, inference: bool = False
			) -> Tuple[str, ...]:
			    """Return the names of the data entries that must be supplied.

			    Args:
			        train: Not consulted by this method.
			        inference: Selects the inference branch; both branches currently
			            produce the same result.

			    Returns:
			        The one-element tuple ``("speech",)``.
			    """
			    # Each branch assigns exactly once. The trailing comma matters:
			    # ("speech") would be a plain str, not a tuple.
			    if not inference:
			        retval = ("speech",)
			    else:
			        # Recognition mode
			        retval = ("speech",)
			    return retval

			@classmethod
			@@ -823,7 +846,7 @@

			# 2. Encoder
			encoder_class = encoder_choices.get_class(args.encoder)
			encoder = encoder_class(input_size=input_size, **args.encoder_conf)
			encoder = encoder_class(**args.encoder_conf)

			# 3. EncoderDecoderAttractor
			encoder_decoder_attractor_class = encoder_decoder_attractor_choices.get_class(args.encoder_decoder_attractor)