yuGAN6
2025-05-28 3445cd9652c71757507216ee68eb75699b32867c
sensevoice2jsonl.py punctuation matching fix (#2533)

* fix sensevoice2jsonl.py punctuation check

* fix sensevoice2jsonl.py punc check
1个文件已修改
8 ■■■■ 已修改文件
funasr/datasets/audio_datasets/sensevoice2jsonl.py 8 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/datasets/audio_datasets/sensevoice2jsonl.py
@@ -4,6 +4,7 @@
import logging
import hydra
import re
import string
from omegaconf import DictConfig, OmegaConf
import concurrent.futures
import librosa
@@ -119,8 +120,11 @@
        dist.barrier()
# Punctuation recognised in addition to ASCII ``string.punctuation``.
# Full-width and typographic forms are written as escapes so that they
# cannot be silently folded to their ASCII look-alikes by an editor or viewer.
_EXTRA_PUNCTUATION = (
    "\u3002\u3001\u00b7\u2026\u2014\u2013"  # 。 、 · … — –
    "\u3010\u3011\u300a\u300b\u3008\u3009"  # 【 】 《 》 〈 〉
    "\u300c\u300d\u300e\u300f\u3014\u3015"  # 「 」 『 』 〔 〕
    "\uff0c\uff1b\uff1a\uff1f\uff01"  # full-width , ; : ? !
    "\u201c\u201d\u2018\u2019"  # curly double / single quotes
    "\uff08\uff09\uff3b\uff3d\uff5b\uff5d\uff5e"  # full-width ( ) [ ] { } ~
)
# Built once at import time so each membership test is O(1).
_PUNCTUATION_CHARS = frozenset(string.punctuation + _EXTRA_PUNCTUATION)


def contains_punctuation(s):
    """Return True if ``s`` contains at least one punctuation character.

    A character counts as punctuation when it is in ``string.punctuation``
    (ASCII) or in ``_EXTRA_PUNCTUATION`` (CJK, full-width and typographic
    marks).

    Args:
        s: The text to inspect. An empty string yields ``False``.

    Returns:
        bool: ``True`` if any character of ``s`` is a known punctuation mark.
    """
    # A regex character class was previously used here, but its embedded
    # ``\\]`` closed the class early and left ``^`` and ``|`` outside it, so
    # ordinary punctuation such as "," was never matched.  A plain set
    # lookup needs no escaping at all.
    return any(char in _PUNCTUATION_CHARS for char in s)
def parse_context_length(data_list: list, data_type: str, id=0):
    pbar = tqdm(total=len(data_list), dynamic_ncols=True)