| | |
| | | import concurrent.futures |
| | | import librosa |
| | | import torch.distributed as dist |
| | | from tqdm import tqdm |
| | | |
| | | |
# NOTE(review): this chunk carries diff/blame gutter residue ("| | |") and is
# missing lines — the parameter list of gen_jsonl_from_wav_text_list and the
# code between the signature and the slicing expression below (executor setup,
# json_dict construction, the open file handle `f`) were dropped by the
# extraction.  Do not treat this block as runnable as-is.
| | | def gen_jsonl_from_wav_text_list(
| | |
# NOTE(review): fragment of a fan-out over `task_num` workers — each worker is
# handed an equal slice (`lines_for_each_th` lines) of the input, plus the
# data_type and its worker index `i`; presumably submitted through
# concurrent.futures (imported at the top of the file) — confirm upstream.
| | | i * lines_for_each_th : (i + 1) * lines_for_each_th
| | | ],
| | | data_type,
| | | i,
| | | )
| | | for i in range(task_num)
| | | ]
| | |
# Merge the per-data_type result dicts into one JSONL line per key, iterating
# the keys of the first data type.
| | | for key in json_dict[data_type_list[0]].keys():
| | | jsonl_line = {"key": key}
| | | for data_file in data_type_list:
# NOTE(review): the next line looks like merge residue — it updates
# unconditionally and raises KeyError whenever `key` is absent from
# json_dict[data_file]; the guarded version two lines below supersedes it.
| | | jsonl_line.update(json_dict[data_file][key])
| | | if key in json_dict[data_file]:
| | | jsonl_line.update(json_dict[data_file][key])
# Serialize without ASCII-escaping so non-ASCII text (e.g. CJK) stays readable.
| | | jsonl_line = json.dumps(jsonl_line, ensure_ascii=False)
| | | f.write(jsonl_line + "\n")
| | | f.flush()
| | |
# Synchronize all distributed ranks before continuing — assumes
# torch.distributed is initialized; TODO confirm with the caller.
| | | dist.barrier()
| | | |
| | | |
def parse_context_length(data_list: list, data_type: str, id=0):
    """Compute a per-key context length for each entry of *data_list*.

    Each element of *data_list* is a line of the form ``"<key> <value>"``.
    For ``data_type == "source"`` the value is treated as a wav path: the
    file is loaded at 16 kHz and its length is expressed in 10 ms frames.
    For any other data_type the value is text: the length is the token count
    when the value contains spaces, otherwise the character count.

    Args:
        data_list: lines to parse, one ``"key value"`` string per element.
        data_type: field name to emit; ``"source"`` triggers audio handling.
        id: worker index, shown only in the progress-bar description.

    Returns:
        dict mapping key -> ``{data_type: value, f"{data_type}_len": length}``.
        Entries whose source file does not exist are skipped.
    """
    # NOTE(review): the original chunk interleaved two versions of this
    # function (merge/diff residue): a duplicate `def` header, an unguarded
    # `key, line = line.strip().split(maxsplit=1)` that crashes on value-less
    # lines, and an unconditional librosa.load performed before the data_type
    # check.  This version keeps only the guarded variants of each.
    pbar = tqdm(total=len(data_list), dynamic_ncols=True)
    res = {}
    for line in data_list:
        pbar.update(1)
        pbar.set_description(f"cpu: {id}")
        parts = line.strip().split(maxsplit=1)
        key = parts[0]
        value = parts[1].strip() if len(parts) > 1 else ""
        if data_type == "source":
            if os.path.exists(value):
                waveform, _ = librosa.load(value, sr=16000)
                # samples -> milliseconds -> 10 ms frames
                context_len = int(len(waveform) * 1000 / 16000 / 10)
            else:
                print("source file not found: {}".format(value))
                continue
        else:
            # token count for space-separated text, else character count
            context_len = len(value.split()) if " " in value else len(value)
        res[key] = {data_type: value, f"{data_type}_len": context_len}
    pbar.close()
    # NOTE(review): the chunk was truncated after the loop; the caller indexes
    # the returned mapping (json_dict[data_file][key]), so returning `res`
    # matches the visible usage — confirm against the full file.
    return res