From 3445cd9652c71757507216ee68eb75699b32867c Mon Sep 17 00:00:00 2001
From: yuGAN6 <76163309+yuGAN6@users.noreply.github.com>
Date: Wed, 28 May 2025 10:33:26 +0800
Subject: [PATCH] sensevoice2jsonl.py punctuation matching fix (#2533)
---
funasr/datasets/audio_datasets/sensevoice2jsonl.py | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/funasr/datasets/audio_datasets/sensevoice2jsonl.py b/funasr/datasets/audio_datasets/sensevoice2jsonl.py
index df6ceae..102e827 100644
--- a/funasr/datasets/audio_datasets/sensevoice2jsonl.py
+++ b/funasr/datasets/audio_datasets/sensevoice2jsonl.py
@@ -4,6 +4,7 @@
import logging
import hydra
import re
+import string
from omegaconf import DictConfig, OmegaConf
import concurrent.futures
import librosa
@@ -119,8 +120,11 @@
dist.barrier()
def contains_punctuation(s):
- pattern = r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
- return re.search(pattern, s) is not None
+ punctuations = (
+ string.punctuation +
+ '，。、；：？！""''（）【】《》〈〉「」『』〔〕［］｛｝～·…—–'
+ )
+ return any(char in punctuations for char in s)
def parse_context_length(data_list: list, data_type: str, id=0):
pbar = tqdm(total=len(data_list), dynamic_ncols=True)
--
Gitblit v1.9.1