zhifu gao
2024-03-27 2297d7515afbf7a081132f11cfc9e225d1c784e3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os
import json
import torch
import logging
import hydra
from omegaconf import DictConfig, OmegaConf
import concurrent.futures
import librosa
import torch.distributed as dist
 
 
 
def gen_scp_from_jsonl(jsonl_file, data_type_list, wav_scp_file, text_file):
 
    wav_f = open(wav_scp_file, "w")
    text_f = open(text_file, "w")
    with open(jsonl_file, encoding='utf-8') as fin:
        for line in fin:
            data = json.loads(line.strip())
            
            prompt = data.get("prompt", "<ASR>")
            source = data[data_type_list[0]]
            target = data[data_type_list[1]]
            source_len = data.get("source_len", 1)
            target_len = data.get("target_len", 0)
            if "aishell" in source:
                target = target.replace(" ", "")
            key = data["key"]
            wav_f.write(f"{key}\t{source}\n")
            wav_f.flush()
            text_f.write(f"{key}\t{target}\n")
            text_f.flush()
 
    wav_f.close()
    text_f.close()
    
    
                    
@hydra.main(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
 
    kwargs = OmegaConf.to_container(cfg, resolve=True)
    print(kwargs)
 
    scp_file_list = kwargs.get("scp_file_list", ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"))
    if isinstance(scp_file_list, str):
        scp_file_list = eval(scp_file_list)
    data_type_list = kwargs.get("data_type_list", ("source", "target"))
    jsonl_file = kwargs.get("jsonl_file_in", "/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl")
    gen_scp_from_jsonl(jsonl_file, data_type_list, *scp_file_list)
    
 
"""
python -m funasr.datasets.audio_datasets.json2scp \
++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
++data_type_list='["source", "target"]' \
++jsonl_file_in=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
"""
 
if __name__ == "__main__":
    main_hydra()