From ba589e05c1448d0487198f8603cba247f22d67e1 Mon Sep 17 00:00:00 2001
From: Shi Xian <40013335+R1ckShi@users.noreply.github.com>
Date: 星期二, 27 二月 2024 10:43:27 +0800
Subject: [PATCH] Merge pull request #1393 from alibaba-damo-academy/dev_gzf

---
 funasr/datasets/llm_datasets/preprocessor.py |   37 +++++++++++++++++++++++++++++++++++++
 1 files changed, 37 insertions(+), 0 deletions(-)

diff --git a/funasr/datasets/llm_datasets/preprocessor.py b/funasr/datasets/llm_datasets/preprocessor.py
new file mode 100644
index 0000000..9f20672
--- /dev/null
+++ b/funasr/datasets/llm_datasets/preprocessor.py
@@ -0,0 +1,37 @@
+import os
+import json
+import torch
+import logging
+import concurrent.futures
+import librosa
+import torch.distributed as dist
+from typing import Collection
+import torch
+import torchaudio
+from torch import nn
+import random
+import re
+import string
+from funasr.tokenizer.cleaner import TextCleaner
+from funasr.register import tables
+
+
+
+@tables.register("preprocessor_classes", "TextPreprocessRemovePunctuation")
+class TextPreprocessSegDict(nn.Module):
+	"""Text preprocessor that removes ASCII and common Chinese punctuation from a string."""
+	def __init__(self,
+	             **kwargs):
+		super().__init__()
+
+	def forward(self, text, **kwargs):
+		"""Return ``text`` with every character found in the punctuation set deleted."""
+		# ASCII punctuation, as defined by the standard library.
+		en_punct = string.punctuation
+		# Common Chinese punctuation; restored from a mis-decoded (UTF-8 read as GBK) literal.
+		cn_punct = '。？！，、；：“”‘’（）《》【】…—～·'
+		all_punct = en_punct + cn_punct
+		# re.escape keeps ']', '\', '^' and '-' literal inside the character class.
+		punct_pattern = re.compile('[{}]'.format(re.escape(all_punct)))
+		# Delete every matched punctuation character.
+		return punct_pattern.sub('', text)

--
Gitblit v1.9.1