嘉渊
2023-05-15 d2a64f2137ac23d1951fd2fa25b6053bba6f7873
funasr/utils/prepare_data.py
@@ -27,7 +27,7 @@
        parts = line.strip().split()
        if len(parts) < 2:
            continue
        text_dict[parts[0]] = " ".join(parts[1:]).lower()
        text_dict[parts[0]] = " ".join(parts[1:])
    filter_count = 0
    with open(wav_file, "w") as f_wav, open(text_file, "w") as f_text:
        for sample_name, wav_path in wav_dict.items():
@@ -181,6 +181,11 @@
            ["{}/{}/wav.scp".format(args.data_dir, args.valid_set), data_names[0], data_types[0]],
            ["{}/{}/text".format(args.data_dir, args.valid_set), data_names[1], data_types[1]]
        ]
        if args.embed_path is not None:
            args.train_data_path_and_name_and_type[0].append(
                "{}/embed/kaldi_ark".format(os.path.join(args.embed_path, "embeds", args.train_set, "embeds.scp")))
            args.valid_data_path_and_name_and_type[0].append(
                "{}/embed/kaldi_ark".format(os.path.join(args.embed_path, "embeds", args.dev_set, "embeds.scp")))
    else:
        args.train_data_file = os.path.join(args.data_dir, args.train_set, "data.list")
        args.valid_data_file = os.path.join(args.data_dir, args.valid_set, "data.list")