From 28ccfbfc51068a663a80764e14074df5edf2b5ba Mon Sep 17 00:00:00 2001
From: kongdeqiang <kongdeqiang960204@163.com>
Date: Fri, 13 Mar 2026 17:41:41 +0800
Subject: [PATCH] Reformat post_processing.py (quote style, line wrapping, whitespace only)
---
fun_text_processing/text_normalization/en/verbalizers/post_processing.py | 101 +++++++++++++++++++++++++++-----------------------
1 file changed, 54 insertions(+), 47 deletions(-)
diff --git a/fun_text_processing/text_normalization/en/verbalizers/post_processing.py b/fun_text_processing/text_normalization/en/verbalizers/post_processing.py
index e0da98f..5d604f0 100644
--- a/fun_text_processing/text_normalization/en/verbalizers/post_processing.py
+++ b/fun_text_processing/text_normalization/en/verbalizers/post_processing.py
@@ -1,6 +1,3 @@
-
-
-
import os
import pynini
@@ -36,7 +33,7 @@
far_file = os.path.join(cache_dir, "en_tn_post_processing.far")
if not overwrite_cache and far_file and os.path.exists(far_file):
self.fst = pynini.Far(far_file, mode="r")["post_process_graph"]
- logging.info(f'Post processing graph was restored from {far_file}.')
+ logging.info(f"Post processing graph was restored from {far_file}.")
else:
self.set_punct_dict()
self.fst = self.get_punct_postprocess_graph()
@@ -48,55 +45,55 @@
self.punct_marks = {
"'": [
"'",
- '麓',
- '使',
- '驶',
- '始',
- '式',
- '示',
- '藞',
- '藠',
- '藡',
- '舜',
- '痛',
- '蝿',
- '諝',
- '諠',
- '讬',
- '壮',
- '叽',
- '叩',
- '釕�',
- '釠�',
- '峋�',
- '峋�',
- '峥�',
- '峥�',
- '峥�',
- '鈥�',
- '鈥�',
- '鈥�',
- '鈥�',
- '鈥�',
- '隇�',
- '锛�',
- '锝�',
- '饢綉',
- '饢綊',
+ "麓",
+ "使",
+ "驶",
+ "始",
+ "式",
+ "示",
+ "藞",
+ "藠",
+ "藡",
+ "舜",
+ "痛",
+ "蝿",
+ "諝",
+ "諠",
+ "讬",
+ "壮",
+ "叽",
+ "叩",
+ "釕�",
+ "釠�",
+ "峋�",
+ "峋�",
+ "峥�",
+ "峥�",
+ "峥�",
+ "鈥�",
+ "鈥�",
+ "鈥�",
+ "鈥�",
+ "鈥�",
+ "隇�",
+ "锛�",
+ "锝�",
+ "饢綉",
+ "饢綊",
],
}
def get_punct_postprocess_graph(self):
"""
- Returns graph to post process punctuation marks.
+ Returns graph to post process punctuation marks.
- {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept.
- By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks.
+ {``} quotes are converted to {"}. Note, if there are spaces around single quote {'}, they will be kept.
+ By default, a space is added after a punctuation mark, and spaces are removed before punctuation marks.
"""
punct_marks_all = PunctuationFst().punct_marks
# no_space_before_punct assume no space before them
- quotes = ["'", "\"", "``", "芦"]
+ quotes = ["'", '"', "``", "芦"]
dashes = ["-", "鈥�"]
brackets = ["<", "{", "("]
open_close_single_quotes = [
@@ -105,7 +102,9 @@
open_close_double_quotes = [('"', '"'), ("``", "``"), ("鈥�", "鈥�")]
open_close_symbols = open_close_single_quotes + open_close_double_quotes
- allow_space_before_punct = ["&"] + quotes + dashes + brackets + [k[0] for k in open_close_symbols]
+ allow_space_before_punct = (
+ ["&"] + quotes + dashes + brackets + [k[0] for k in open_close_symbols]
+ )
no_space_before_punct = [m for m in punct_marks_all if m not in allow_space_before_punct]
no_space_before_punct = pynini.union(*no_space_before_punct)
@@ -119,7 +118,8 @@
graph = (
pynini.closure(non_punct)
+ pynini.closure(
- no_space_before_punct | pynutil.add_weight(delete_space + no_space_before_punct, MIN_NEG_WEIGHT)
+ no_space_before_punct
+ | pynutil.add_weight(delete_space + no_space_before_punct, MIN_NEG_WEIGHT)
)
+ pynini.closure(non_punct)
)
@@ -129,14 +129,21 @@
).optimize()
# remove space after no_space_after_punct (even if there are no matching closing brackets)
- no_space_after_punct = pynini.cdrewrite(delete_space, no_space_after_punct, DAMO_SIGMA, DAMO_SIGMA).optimize()
+ no_space_after_punct = pynini.cdrewrite(
+ delete_space, no_space_after_punct, DAMO_SIGMA, DAMO_SIGMA
+ ).optimize()
graph = pynini.compose(graph, no_space_after_punct).optimize()
# remove space around text in quotes
single_quote = pynutil.add_weight(pynini.accep("`"), MIN_NEG_WEIGHT)
double_quotes = pynutil.add_weight(pynini.accep('"'), MIN_NEG_WEIGHT)
quotes_graph = (
- single_quote + delete_space_optional + DAMO_ALPHA + DAMO_SIGMA + delete_space_optional + single_quote
+ single_quote
+ + delete_space_optional
+ + DAMO_ALPHA
+ + DAMO_SIGMA
+ + delete_space_optional
+ + single_quote
).optimize()
# this is to make sure multiple quotes are tagged from right to left without skipping any quotes in the left
--
Gitblit v1.9.1