import logging
import os

import torch

from funasr.torch_utils.set_all_random_seed import set_all_random_seed
from funasr.utils import config_argparse
from funasr.utils.build_dataloader import build_dataloader
from funasr.utils.build_distributed import build_distributed
from funasr.utils.prepare_data import prepare_data
from funasr.utils.types import str2bool

logging.basicConfig(
    format=f"[{os.uname()[1].split('.')[0]}]"
           f" %(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
logging.info("world size: {}, rank: {}, local_rank: {}".format(distributed_option.dist_world_size,
                                                               distributed_option.dist_rank,
                                                               distributed_option.local_rank))

# prepare files for dataloader
prepare_data(args, distributed_option)

# set random seed
set_all_random_seed(args.seed)
torch.backends.cudnn.enabled = args.cudnn_enabled
torch.backends.cudnn.benchmark = args.cudnn_benchmark
torch.backends.cudnn.deterministic = args.cudnn_deterministic

train_dataloader, valid_dataloader = build_dataloader(args)

# optimizers = cls.build_optimizers(args, model=model)
# schedulers = []
# for i, optim in enumerate(optimizers, 1):
#     suf = "" if i == 1 else str(i)
#     name = getattr(args, f"scheduler{suf}")
#     conf = getattr(args, f"scheduler{suf}_conf")
#     if name is not None:
#         cls_ = scheduler_classes.get(name)
#         if cls_ is None:
#             raise ValueError(f"must be one of {list(scheduler_classes)}: {name}")
#         scheduler = cls_(optim, **conf)
#     else:
#         scheduler = None
#
#     schedulers.append(scheduler)
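
# The commented-out block above looks a scheduler class up by name and
# instantiates it with a per-scheduler config dict. A minimal runnable
# sketch of that lookup pattern; the registry is passed in explicitly
# because FunASR's actual scheduler_classes contents are not shown here.
def build_scheduler(optimizer, name, conf, scheduler_classes):
    # No scheduler requested for this optimizer.
    if name is None:
        return None
    # Resolve the class from the name-to-class registry.
    cls_ = scheduler_classes.get(name)
    if cls_ is None:
        raise ValueError(f"must be one of {list(scheduler_classes)}: {name}")
    # Instantiate with the scheduler-specific keyword config.
    return cls_(optimizer, **conf)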


class SequenceIterFactory(AbsIterFactory):
    """Build iterator for each epoch, modified from ESPnet."""


args.batch_bins = args.batch_bins * args.ngpu

# filter samples if wav.scp and text are mismatched
if ((args.train_shape_file is None and args.dataset_type == "small")
        or (args.train_data_file is None and args.dataset_type == "large")):
    if not args.simple_ddp or distributed_option.dist_rank == 0:
        filter_wav_text(args.data_dir, args.train_set)
        filter_wav_text(args.data_dir, args.dev_set)

if args.train_shape_file is None and args.dataset_type == "small":
    if not args.simple_ddp or distributed_option.dist_rank == 0:
        calc_shape(args.data_dir, args.train_set, args.frontend_conf,
                   args.speech_length_min, args.speech_length_max)
        calc_shape(args.data_dir, args.dev_set, args.frontend_conf,
                   args.speech_length_min, args.speech_length_max)
    if args.simple_ddp:
        dist.barrier()
    args.train_shape_file = [os.path.join(args.data_dir, args.train_set, "speech_shape")]
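
# The speech_shape files written by calc_shape drive length-based batching.
# A minimal reader sketch, assuming one "<utt_id> <shape>" pair per line;
# the exact on-disk format is defined by calc_shape, not by this sketch.
def load_shape_file(path):
    shapes = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            utt_id, shape = line.rstrip().split(maxsplit=1)
            # Keep only the first dimension (utterance length) if the
            # shape is comma-separated.
            shapes[utt_id] = int(shape.split(",")[0])
    return shapes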

if args.dataset_type == "large":
    from funasr.datasets.large_datasets.build_dataloader import ArkDataLoader
    train_iter_factory = ArkDataLoader(args.train_data_file, args.token_list, args.dataset_conf,
                                       frontend_conf=getattr(args, "frontend_conf", None),
                                       seg_dict_file=getattr(args, "seg_dict_file", None),
                                       punc_dict_file=getattr(args, "punc_list", None),
                                       bpemodel_file=getattr(args, "bpemodel", None),
                                       mode="train")
    valid_iter_factory = ArkDataLoader(args.valid_data_file, args.token_list, args.dataset_conf,
                                       frontend_conf=getattr(args, "frontend_conf", None),
                                       seg_dict_file=getattr(args, "seg_dict_file", None),
                                       punc_dict_file=getattr(args, "punc_list", None),
                                       bpemodel_file=getattr(args, "bpemodel", None),
                                       mode="eval")
elif args.dataset_type == "small":

from funasr.datasets.large_datasets.build_dataloader import LargeDataLoader
from funasr.datasets.small_datasets.sequence_iter_factory import SequenceIterFactory


def build_dataloader(args):
    if args.dataset_type == "small":
        train_iter_factory = SequenceIterFactory(args, mode="train")
        valid_iter_factory = SequenceIterFactory(args, mode="valid")
    elif args.dataset_type == "large":
        train_iter_factory = LargeDataLoader(args, mode="train")
        valid_iter_factory = LargeDataLoader(args, mode="valid")
    else:
        raise ValueError(f"Not supported dataset_type={args.dataset_type}")

    return train_iter_factory, valid_iter_factory
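
# A hypothetical training-loop skeleton: build_iter(epoch) follows the
# ESPnet AbsIterFactory convention that SequenceIterFactory extends, and
# args.max_epoch is illustrative, not a flag confirmed by this diff.
train_iter_factory, valid_iter_factory = build_dataloader(args)
for epoch in range(1, args.max_epoch + 1):
    for batch in train_iter_factory.build_iter(epoch):
        pass  # forward/backward/optimizer step goes here
    for batch in valid_iter_factory.build_iter(epoch):
        pass  # validation forward pass goes here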
New file

import logging


def build_model(args):
    if args.token_list is not None:
        with open(args.token_list, encoding="utf-8") as f:
            token_list = [line.rstrip() for line in f]
        args.token_list = list(token_list)
        vocab_size = len(token_list)
        logging.info(f"Vocabulary size: {vocab_size}")
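
# Illustrative usage with a made-up token file ("tokens.txt"); the token
# inventory below is a common CTC-style layout, not necessarily FunASR's
# exact convention.
import argparse

tokens = ["<blank>", "<s>", "</s>", "a", "b", "c", "<unk>"]
with open("tokens.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(tokens) + "\n")

args = argparse.Namespace(token_list="tokens.txt")
build_model(args)  # logs "Vocabulary size: 7"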