jmwang66
2023-05-09 8dab6d184a034ca86eafa644ea0d2100aadfe27d
funasr/tasks/diar.py
@@ -1,3 +1,11 @@
"""
Author: Speech Lab, Alibaba Group, China
SOND: Speaker Overlap-aware Neural Diarization for Multi-party Meeting Analysis
https://arxiv.org/abs/2211.10243
TOLD: A Novel Two-Stage Overlap-Aware Framework for Speaker Diarization
https://arxiv.org/abs/2303.05397
"""
import argparse
import logging
import os
@@ -507,7 +515,7 @@
            config_file: Union[Path, str] = None,
            model_file: Union[Path, str] = None,
            cmvn_file: Union[Path, str] = None,
            device: str = "cpu",
            device: Union[str, torch.device] = "cpu",
    ):
        """Build model from the files.
@@ -553,7 +561,7 @@
                if ".bin" in model_name:
                    model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                else:
                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
                    model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
                if os.path.exists(model_name_pth):
                    logging.info("model_file is load from pth: {}".format(model_name_pth))
                    model_dict = torch.load(model_name_pth, map_location=device)
@@ -562,12 +570,27 @@
                model.load_state_dict(model_dict)
            else:
                model_dict = torch.load(model_file, map_location=device)
        model_dict = cls.fileter_model_dict(model_dict, model.state_dict())
        model.load_state_dict(model_dict)
        if model_name_pth is not None and not os.path.exists(model_name_pth):
            torch.save(model_dict, model_name_pth)
            logging.info("model_file is saved to pth: {}".format(model_name_pth))
        return model, args
    @classmethod
    def fileter_model_dict(cls, src_dict: dict, dest_dict: dict):
        from collections import OrderedDict
        new_dict = OrderedDict()
        for key, value in src_dict.items():
            if key in dest_dict:
                new_dict[key] = value
            else:
                logging.info("{} is no longer needed in this model.".format(key))
        for key, value in dest_dict.items():
            if key not in new_dict:
                logging.warning("{} is missed in checkpoint.".format(key))
        return new_dict
    @classmethod
    def convert_tf2torch(
@@ -750,47 +773,47 @@
            cls, args: argparse.Namespace, train: bool
    ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
        assert check_argument_types()
        if args.use_preprocessor:
            retval = CommonPreprocessor(
                train=train,
                token_type=args.token_type,
                token_list=args.token_list,
                bpemodel=None,
                non_linguistic_symbols=None,
                text_cleaner=None,
                g2p_type=None,
                split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
                seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
                # NOTE(kamo): Check attribute existence for backward compatibility
                rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
                rir_apply_prob=args.rir_apply_prob
                if hasattr(args, "rir_apply_prob")
                else 1.0,
                noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
                noise_apply_prob=args.noise_apply_prob
                if hasattr(args, "noise_apply_prob")
                else 1.0,
                noise_db_range=args.noise_db_range
                if hasattr(args, "noise_db_range")
                else "13_15",
                speech_volume_normalize=args.speech_volume_normalize
                if hasattr(args, "rir_scp")
                else None,
            )
        else:
            retval = None
        assert check_return_type(retval)
        return retval
        # if args.use_preprocessor:
        #     retval = CommonPreprocessor(
        #         train=train,
        #         token_type=args.token_type,
        #         token_list=args.token_list,
        #         bpemodel=None,
        #         non_linguistic_symbols=None,
        #         text_cleaner=None,
        #         g2p_type=None,
        #         split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
        #         seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
        #         # NOTE(kamo): Check attribute existence for backward compatibility
        #         rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
        #         rir_apply_prob=args.rir_apply_prob
        #         if hasattr(args, "rir_apply_prob")
        #         else 1.0,
        #         noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
        #         noise_apply_prob=args.noise_apply_prob
        #         if hasattr(args, "noise_apply_prob")
        #         else 1.0,
        #         noise_db_range=args.noise_db_range
        #         if hasattr(args, "noise_db_range")
        #         else "13_15",
        #         speech_volume_normalize=args.speech_volume_normalize
        #         if hasattr(args, "rir_scp")
        #         else None,
        #     )
        # else:
        #     retval = None
        # assert check_return_type(retval)
        return None
    @classmethod
    def required_data_names(
            cls, train: bool = True, inference: bool = False
    ) -> Tuple[str, ...]:
        if not inference:
            retval = ("speech", "profile", "binary_labels")
            retval = ("speech", )
        else:
            # Recognition mode
            retval = ("speech")
            retval = ("speech", )
        return retval
    @classmethod
@@ -823,7 +846,7 @@
        # 2. Encoder
        encoder_class = encoder_choices.get_class(args.encoder)
        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
        encoder = encoder_class(**args.encoder_conf)
        # 3. EncoderDecoderAttractor
        encoder_decoder_attractor_class = encoder_decoder_attractor_choices.get_class(args.encoder_decoder_attractor)