sensevoice2jsonl.py punctuation matching fix (#2533)
* fix sensevoice2jsonl.py punctuation check
* fix sensevoice2jsonl.py punc check
| | |
| | | import logging |
| | | import hydra |
| | | import re |
| | | import string |
| | | from omegaconf import DictConfig, OmegaConf |
| | | import concurrent.futures |
| | | import librosa |
| | |
| | | dist.barrier() |
| | | |
def contains_punctuation(s):
    """Return True if ``s`` contains at least one punctuation character.

    Both ASCII punctuation (``string.punctuation``) and common CJK /
    typographic punctuation marks are recognised.

    The earlier regex-based check
    (``r'[!"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~]'``) was broken: the ``]``
    following the escaped backslash closed the character class early, so the
    pattern effectively never matched ordinary punctuation. Plain membership
    testing avoids regex escaping pitfalls altogether.

    Args:
        s: The text to inspect. An empty string yields ``False``.

    Returns:
        ``True`` if any character of ``s`` is a known punctuation mark,
        ``False`` otherwise.
    """
    punctuations = frozenset(
        string.punctuation
        # CJK and typographic marks that have no ASCII equivalent.
        + "。、【】《》〈〉「」『』〔〕·…—–"
        # Full-width forms of , ; : ? ! " ' ( ) [ ] { } ~ written as escapes
        # so they survive any Unicode normalisation of this source file.
        # NOTE(review): the previous literal listed the ASCII forms here,
        # which string.punctuation already covers; presumably the full-width
        # forms were intended - confirm.
        + "\uff0c\uff1b\uff1a\uff1f\uff01\uff02\uff07"
        + "\uff08\uff09\uff3b\uff3d\uff5b\uff5d\uff5e"
        # Curly double and single quotation marks.
        + "\u201c\u201d\u2018\u2019"
    )
    return any(char in punctuations for char in s)
| | | |
| | | def parse_context_length(data_list: list, data_type: str, id=0): |
| | | pbar = tqdm(total=len(data_list), dynamic_ncols=True) |