shixian.shi
2023-06-27 187a302be238d1e6a757871f77e42117ff221c64
update clas finetune
2个文件已修改
21 ■■■■■ 已修改文件
funasr/datasets/large_datasets/dataset.py 18 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/datasets/large_datasets/utils/tokenize.py 3 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/datasets/large_datasets/dataset.py
@@ -202,14 +202,7 @@
    data_types = conf.get("data_types", "kaldi_ark,text")
    pre_hwfile = conf.get("pre_hwlist", None)
    pre_prob = conf.get("pre_prob", 0)  # unused yet
    hw_config = {"sample_rate": conf.get("sample_rate", 0.6),
                 "double_rate": conf.get("double_rate", 0.1),
                 "hotword_min_length": conf.get("hotword_min_length", 2),
                 "hotword_max_length": conf.get("hotword_max_length", 8),
                 "pre_prob": conf.get("pre_prob", 0.0)}
    # pre_prob = conf.get("pre_prob", 0)  # unused yet
    if pre_hwfile is not None:
        pre_hwlist = []
        with open(pre_hwfile, 'r') as fin:
@@ -218,6 +211,15 @@
    else:
        pre_hwlist = None
    hw_config = {"sample_rate": conf.get("sample_rate", 0.6),
                 "double_rate": conf.get("double_rate", 0.1),
                 "hotword_min_length": conf.get("hotword_min_length", 2),
                 "hotword_max_length": conf.get("hotword_max_length", 8),
                 "pre_prob": conf.get("pre_prob", 0.0),
                 "pre_hwlist": pre_hwlist}
    dataset = AudioDataset(scp_lists, 
                           data_names, 
                           data_types, 
funasr/datasets/large_datasets/utils/tokenize.py
@@ -54,6 +54,9 @@
    length = len(text)
    if 'hw_tag' in data:
        if hw_config['pre_hwlist'] is not None and hw_config['pre_prob'] > 0:
            # enable preset hotword detect in sampling
            import pdb; pdb.set_trace()
        hotword_indxs = sample_hotword(length, **hw_config)
        data['hotword_indxs'] = hotword_indxs
        del data['hw_tag']