From 559cc2c6e296bc80917a7408911f671dfcc2b68b Mon Sep 17 00:00:00 2001
From: 嘉渊 <wangjiaming.wjm@alibaba-inc.com>
Date: Fri, 12 May 2023 17:25:54 +0800
Subject: [PATCH] update repo
---
egs/aishell2/transformer/utils/proce_text.py | 31 +++++++++++++++++++++++++++++++
1 files changed, 31 insertions(+), 0 deletions(-)
diff --git a/egs/aishell2/transformer/utils/proce_text.py b/egs/aishell2/transformer/utils/proce_text.py
new file mode 100755
index 0000000..9e517a4
--- /dev/null
+++ b/egs/aishell2/transformer/utils/proce_text.py
@@ -0,0 +1,31 @@
+
+import sys
+import re
+
+# Usage: proce_text.py <in_file> <out_file>
+# Each input line is "<id> <text>". Listed tokens and spaces are removed from
+# the text, it is lower-cased, and rewritten with a space between characters.
+if len(sys.argv) < 3:
+    sys.exit("usage: {} <in_file> <out_file>".format(sys.argv[0]))
+in_f, out_f = sys.argv[1:3]
+
+# Substrings deleted from the text, applied one after another in this order.
+STRIP_TOKENS = ("</s>", "<s>", "@@", "@", "<unk>", " ")
+
+# Read all input before opening the output, so both paths may be the same file.
+with open(in_f, "r", encoding="utf-8") as f:
+    lines = f.readlines()
+
+with open(out_f, "w", encoding="utf-8") as f:
+    for line in lines:
+        outs = line.strip().split(" ", 1)
+        if len(outs) == 2:
+            idx, text = outs
+            for token in STRIP_TOKENS:
+                text = re.sub(token, "", text)
+            text = text.lower()
+        else:
+            # No text after the id (or a blank line): keep a one-space placeholder.
+            idx, text = outs[0], " "
+        # Joining a str iterates its characters, which spaces them out.
+        f.write("{} {}\n".format(idx, " ".join(text)))
--
Gitblit v1.9.1