| | |
| | | import torch.distributed as dist |
| | | |
| | | |
| | | |
| | | def gen_jsonl_from_wav_text_list(path, data_type_list=("source", "target"), jsonl_file_out:str=None, **kwargs): |
| | | def gen_jsonl_from_wav_text_list( |
| | | path, data_type_list=("source", "target"), jsonl_file_out: str = None, **kwargs |
| | | ): |
| | | try: |
| | | rank = dist.get_rank() |
| | | world_size = dist.get_world_size() |
| | |
| | | if task_num > 1: |
| | | with concurrent.futures.ThreadPoolExecutor(max_workers=cpu_cores) as executor: |
| | | |
| | | futures = [executor.submit(parse_context_length, data_file_lists[i*lines_for_each_th:(i+1)*lines_for_each_th], data_type) for i in range(task_num)] |
| | | futures = [ |
| | | executor.submit( |
| | | parse_context_length, |
| | | data_file_lists[ |
| | | i * lines_for_each_th : (i + 1) * lines_for_each_th |
| | | ], |
| | | data_type, |
| | | ) |
| | | for i in range(task_num) |
| | | ] |
| | | |
| | | for future in concurrent.futures.as_completed(futures): |
| | | |
| | |
| | | kwargs = OmegaConf.to_container(cfg, resolve=True) |
| | | print(kwargs) |
| | | |
| | | scp_file_list = kwargs.get("scp_file_list", ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt")) |
| | | scp_file_list = kwargs.get( |
| | | "scp_file_list", |
| | | ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"), |
| | | ) |
| | | if isinstance(scp_file_list, str): |
| | | scp_file_list = eval(scp_file_list) |
| | | data_type_list = kwargs.get("data_type_list", ("source", "target")) |
| | | jsonl_file_out = kwargs.get("jsonl_file_out", "/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl") |
| | | gen_jsonl_from_wav_text_list(scp_file_list, data_type_list=data_type_list, jsonl_file_out=jsonl_file_out) |
| | | jsonl_file_out = kwargs.get( |
| | | "jsonl_file_out", "/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl" |
| | | ) |
| | | gen_jsonl_from_wav_text_list( |
| | | scp_file_list, data_type_list=data_type_list, jsonl_file_out=jsonl_file_out |
| | | ) |
| | | |
| | | |
| | | """ |
| | |
| | | |
| | | if __name__ == "__main__": |
| | | main_hydra() |
| | | |
| | | |