From 28ccfbfc51068a663a80764e14074df5edf2b5ba Mon Sep 17 00:00:00 2001
From: kongdeqiang <kongdeqiang960204@163.com>
Date: Fri, 13 Mar 2026 17:41:41 +0800
Subject: [PATCH] sensevoice2jsonl: make model_dir configurable and detect CJK punctuation

---
 funasr/datasets/audio_datasets/sensevoice2jsonl.py |   17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/funasr/datasets/audio_datasets/sensevoice2jsonl.py b/funasr/datasets/audio_datasets/sensevoice2jsonl.py
index f58f43a..102e827 100644
--- a/funasr/datasets/audio_datasets/sensevoice2jsonl.py
+++ b/funasr/datasets/audio_datasets/sensevoice2jsonl.py
@@ -4,6 +4,7 @@
 import logging
 import hydra
 import re
+import string
 from omegaconf import DictConfig, OmegaConf
 import concurrent.futures
 import librosa
@@ -12,7 +13,7 @@
 
 
 def gen_jsonl_from_wav_text_list(
-    path, data_type_list=("source", "target"), jsonl_file_out: str = None, **kwargs
+    path, data_type_list=("source", "target"), jsonl_file_out: str = None, model_dir: str = "iic/SenseVoiceSmall", **kwargs
 ):
     try:
         rank = dist.get_rank()
@@ -58,7 +59,6 @@
         if "text_language" not in data_type_list or "emo_target" not in data_type_list or "event_target" not in data_type_list:
             from funasr import AutoModel
 
-            model_dir = "iic/SenseVoiceSmall"
             model = AutoModel(
                 model=model_dir,
             )
@@ -120,8 +120,11 @@
         dist.barrier()
 
 def contains_punctuation(s):
-    pattern = r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'
-    return re.search(pattern, s) is not None
+    punctuations = (
+        string.punctuation +
+        '，。、；：？！""''（）【】《》〈〉「」『』〔〕［］｛｝～·…—–'
+    )
+    return any(char in punctuations for char in s)
 
 def parse_context_length(data_list: list, data_type: str, id=0):
     pbar = tqdm(total=len(data_list), dynamic_ncols=True)
@@ -169,8 +172,9 @@
     jsonl_file_out = kwargs.get(
         "jsonl_file_out", "/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl"
     )
+    model_dir = kwargs.get("model_dir", "iic/SenseVoiceSmall")
     gen_jsonl_from_wav_text_list(
-        scp_file_list, data_type_list=data_type_list, jsonl_file_out=jsonl_file_out
+        scp_file_list, data_type_list=data_type_list, jsonl_file_out=jsonl_file_out, model_dir=model_dir
     )
 
 
@@ -178,7 +182,8 @@
 python -m funasr.datasets.audio_datasets.sensevoice2jsonl \
 ++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt", "/Users/zhifu/funasr1.0/test_local/text_language.txt", "/Users/zhifu/funasr1.0/test_local/emo_target.txt", "/Users/zhifu/funasr1.0/test_local/event_target.txt"]' \
 ++data_type_list='["source", "target", "text_language", "emo_target", "event_target"]' \
-++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
+++jsonl_file_out='/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl' \
+++model_dir='iic/SenseVoiceSmall'
 """
 
 if __name__ == "__main__":

--
Gitblit v1.9.1