From 98c94ab3ab0266482117343a064beeb6bd6bcedc Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期三, 28 二月 2024 20:45:07 +0800
Subject: [PATCH] Merge branch 'main' of github.com:alibaba-damo-academy/FunASR merge
---
funasr/datasets/llm_datasets/preprocessor.py | 37 +++++++++++++++++++++++++++++++++++++
1 files changed, 37 insertions(+), 0 deletions(-)
diff --git a/funasr/datasets/llm_datasets/preprocessor.py b/funasr/datasets/llm_datasets/preprocessor.py
new file mode 100644
index 0000000..9f20672
--- /dev/null
+++ b/funasr/datasets/llm_datasets/preprocessor.py
@@ -0,0 +1,37 @@
+import os
+import json
+import torch
+import logging
+import concurrent.futures
+import librosa
+import torch.distributed as dist
+from typing import Collection
+import torch
+import torchaudio
+from torch import nn
+import random
+import re
+import string
+from funasr.tokenizer.cleaner import TextCleaner
+from funasr.register import tables
+
+
+
+@tables.register("preprocessor_classes", "TextPreprocessRemovePunctuation")
+class TextPreprocessSegDict(nn.Module):
+    """Text preprocessor that deletes ASCII and common Chinese punctuation."""
+    # NOTE(review): class name differs from the registered key above; confirm intent.
+
+    # Common Chinese / full-width punctuation, removed along with string.punctuation.
+    cn_punct = '。？！，、；：“”‘’（）《》【】…—～·'
+
+    def __init__(self, **kwargs):
+        # kwargs are accepted but not used by this class.
+        super().__init__()
+        # Compile once here, not per forward(); re.escape keeps metachars literal.
+        all_punct = string.punctuation + self.cn_punct
+        self.punct_pattern = re.compile('[{}]'.format(re.escape(all_punct)))
+
+    def forward(self, text, **kwargs):
+        """Return `text` with every character in the punctuation set removed."""
+        return self.punct_pattern.sub('', text)
--
Gitblit v1.9.1