From 28ccfbfc51068a663a80764e14074df5edf2b5ba Mon Sep 17 00:00:00 2001
From: kongdeqiang <kongdeqiang960204@163.com>
Date: Fri, 13 Mar 2026 17:41:41 +0800
Subject: [PATCH] Commit
---
examples/aishell/paraformer/utils/text_tokenize.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)
diff --git a/examples/aishell/paraformer/utils/text_tokenize.py b/examples/aishell/paraformer/utils/text_tokenize.py
index 962ea11..31500f1 100755
--- a/examples/aishell/paraformer/utils/text_tokenize.py
+++ b/examples/aishell/paraformer/utils/text_tokenize.py
@@ -4,7 +4,7 @@
def load_dict(seg_file):
seg_dict = {}
- with open(seg_file, 'r') as infile:
+ with open(seg_file, "r") as infile:
for line in infile:
s = line.strip().split()
key = s[0]
@@ -28,8 +28,7 @@
return word_list
-def tokenize(txt,
- seg_dict):
+def tokenize(txt, seg_dict):
out_txt = ""
pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
for word in txt:
@@ -87,20 +86,19 @@
parser = get_parser()
args = parser.parse_args()
- txt_writer = open("{}/text.{}.txt".format(args.output_dir, args.txt_index), 'w')
- shape_writer = open("{}/len.{}".format(args.output_dir, args.txt_index), 'w')
+ txt_writer = open("{}/text.{}.txt".format(args.output_dir, args.txt_index), "w")
+ shape_writer = open("{}/len.{}".format(args.output_dir, args.txt_index), "w")
seg_dict = load_dict(args.seg_file)
- with open(args.text_file, 'r') as infile:
+ with open(args.text_file, "r") as infile:
for line in infile:
s = line.strip().split()
text_id = s[0]
text_list = forward_segment("".join(s[1:]).lower(), seg_dict)
text = tokenize(text_list, seg_dict)
lens = len(text.strip().split())
- txt_writer.write(text_id + " " + text + '\n')
- shape_writer.write(text_id + " " + str(lens) + '\n')
+ txt_writer.write(text_id + " " + text + "\n")
+ shape_writer.write(text_id + " " + str(lens) + "\n")
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
-
--
Gitblit v1.9.1