| | |
| | | import torch.distributed as dist |
| | | |
| | | |
| | | |
def gen_scp_from_jsonl(jsonl_file, data_type_list, wav_scp_file, text_file):
    """Walk a JSONL file record by record with the two output files open.

    Args:
        jsonl_file: Path of the input file, read as UTF-8; every non-blank
            line is parsed with ``json.loads``.
        data_type_list: Sequence whose first two items are the record keys
            looked up as the source and the target of each record.
        wav_scp_file: Path opened in ``"w"`` mode (created or truncated).
        text_file: Path opened in ``"w"`` mode (created or truncated).

    Raises:
        KeyError: If a record lacks one of the two keys in ``data_type_list``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Context managers guarantee that all three files are closed even when a
    # record fails to parse or lacks a key. The outputs are opened before the
    # input, matching the original open order.
    with open(wav_scp_file, "w") as wav_f, open(text_file, "w") as text_f:
        with open(jsonl_file, encoding="utf-8") as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    # A blank (often trailing) line is not a record.
                    continue
                data = json.loads(line)

                prompt = data.get("prompt", "<ASR>")
                source = data[data_type_list[0]]
                target = data[data_type_list[1]]
                # NOTE(review): the statements that write these fields to
                # wav_f / text_f are not visible in the reviewed text (they
                # appear to have been elided) -- restore them here; confirm
                # the line format against the upstream file.
| | | |
| | | |
| | | |
| | | |
| | | |
@hydra.main(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
    """Hydra entry point: read options from ``cfg`` and call ``gen_scp_from_jsonl``.

    Config keys (each falls back to the hard-coded default below when absent):
        scp_file_list: The output paths, unpacked as the trailing positional
            arguments of ``gen_scp_from_jsonl``. A string value is evaluated
            to obtain the sequence.
        data_type_list: Forwarded unchanged; defaults to
            ``("source", "target")``.
        jsonl_file_in: Path of the input JSONL file.
    """
    kwargs = OmegaConf.to_container(cfg, resolve=True)
    print(kwargs)

    scp_file_list = kwargs.get(
        "scp_file_list",
        ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"),
    )
    if isinstance(scp_file_list, str):
        # SECURITY NOTE(review): eval() executes arbitrary expressions taken
        # from the command line / config. Kept for backward compatibility;
        # consider ast.literal_eval if only list/tuple literals are expected.
        scp_file_list = eval(scp_file_list)
    data_type_list = kwargs.get("data_type_list", ("source", "target"))
    jsonl_file = kwargs.get(
        "jsonl_file_in", "/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl"
    )
    gen_scp_from_jsonl(jsonl_file, data_type_list, *scp_file_list)
| | | |
| | | |
| | | |
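"""
python -m funasr.datasets.audio_datasets.json2scp \
++scp_file_list='["/path/to/wav.scp", "/path/to/text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in=/path/to/audio_datasets.jsonl
"""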
| | | |
# Run the Hydra entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main_hydra()
| | | |
| | | |