From fe588bc508c0076bb007d6ed36c18ac8ecb341ac Mon Sep 17 00:00:00 2001
From: 王梦迪 <73778524+di-osc@users.noreply.github.com>
Date: Tue, 20 May 2025 16:10:59 +0800
Subject: [PATCH] Fsmn_vad支持多线程并发调用 (#2519)
---
funasr/datasets/llm_datasets/preprocessor.py | 33 +++++++++++++++------------------
1 files changed, 15 insertions(+), 18 deletions(-)
diff --git a/funasr/datasets/llm_datasets/preprocessor.py b/funasr/datasets/llm_datasets/preprocessor.py
index 9f20672..b99255e 100644
--- a/funasr/datasets/llm_datasets/preprocessor.py
+++ b/funasr/datasets/llm_datasets/preprocessor.py
@@ -16,22 +16,19 @@
from funasr.register import tables
-
@tables.register("preprocessor_classes", "TextPreprocessRemovePunctuation")
-class TextPreprocessSegDict(nn.Module):
- def __init__(self,
- **kwargs):
- super().__init__()
-
-
- def forward(self, text, **kwargs):
- # 定义英文标点符号
- en_punct = string.punctuation
- # 定义中文标点符号（部分常用的）
- cn_punct = '。？！，、；：“”‘’（）《》【】…—～·'
- # 合并英文和中文标点符号
- all_punct = en_punct + cn_punct
- # 创建正则表达式模式，匹配任何在all_punct中的字符
- punct_pattern = re.compile('[{}]'.format(re.escape(all_punct)))
- # 使用正则表达式的sub方法替换掉这些字符
- return punct_pattern.sub('', text)
+class TextPreprocessRemovePunctuation(nn.Module):
+ def __init__(self, **kwargs):
+ super().__init__()
+
+ def forward(self, text, **kwargs):
+ # 定义英文标点符号
+ en_punct = string.punctuation
+ # 定义中文标点符号（部分常用的）
+ cn_punct = "。？！，、；：“”‘’（）《》【】…—～·"
+ # 合并英文和中文标点符号
+ all_punct = en_punct + cn_punct
+ # 创建正则表达式模式，匹配任何在all_punct中的字符
+ punct_pattern = re.compile("[{}]".format(re.escape(all_punct)))
+ # 使用正则表达式的sub方法替换掉这些字符
+ return punct_pattern.sub("", text)
--
Gitblit v1.9.1