From 28ccfbfc51068a663a80764e14074df5edf2b5ba Mon Sep 17 00:00:00 2001
From: kongdeqiang <kongdeqiang960204@163.com>
Date: Fri, 13 Mar 2026 17:41:41 +0800
Subject: [PATCH] Reformat id/graph_utils.py: normalize quotes, wrap long lines, drop u"" prefixes
---
fun_text_processing/inverse_text_normalization/id/graph_utils.py | 60 ++++++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 44 insertions(+), 16 deletions(-)
diff --git a/fun_text_processing/inverse_text_normalization/id/graph_utils.py b/fun_text_processing/inverse_text_normalization/id/graph_utils.py
index ccc9fa9..81a4da9 100644
--- a/fun_text_processing/inverse_text_normalization/id/graph_utils.py
+++ b/fun_text_processing/inverse_text_normalization/id/graph_utils.py
@@ -18,9 +18,9 @@
DAMO_ALPHA = pynini.union(DAMO_LOWER, DAMO_UPPER).optimize()
DAMO_ALNUM = pynini.union(DAMO_DIGIT, DAMO_ALPHA).optimize()
DAMO_HEX = pynini.union(*string.hexdigits).optimize()
-DAMO_NON_BREAKING_SPACE = u"\u00A0"
+DAMO_NON_BREAKING_SPACE = "\u00A0"
DAMO_SPACE = " "
-DAMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
+DAMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
DAMO_NOT_SPACE = pynini.difference(DAMO_CHAR, DAMO_WHITE_SPACE).optimize()
DAMO_NOT_QUOTE = pynini.difference(DAMO_CHAR, r'"').optimize()
@@ -35,31 +35,55 @@
delete_extra_space = pynini.cross(pynini.closure(DAMO_WHITE_SPACE, 1), " ")
delete_preserve_order = pynini.closure(
pynutil.delete(" preserve_order: true")
- | (pynutil.delete(" field_order: \"") + DAMO_NOT_QUOTE + pynutil.delete("\""))
+ | (pynutil.delete(' field_order: "') + DAMO_NOT_QUOTE + pynutil.delete('"'))
)
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
_c = pynini.union(
- "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
+ "b",
+ "c",
+ "d",
+ "f",
+ "g",
+ "h",
+ "j",
+ "k",
+ "l",
+ "m",
+ "n",
+ "p",
+ "q",
+ "r",
+ "s",
+ "t",
+ "v",
+ "w",
+ "x",
+ "y",
+ "z",
)
_ies = DAMO_SIGMA + _c + pynini.cross("y", "ies")
_es = DAMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
_s = DAMO_SIGMA + pynutil.insert("s")
graph_plural = plurals._priority_union(
- suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, DAMO_SIGMA), DAMO_SIGMA), DAMO_SIGMA
+ suppletive,
+ plurals._priority_union(_ies, plurals._priority_union(_es, _s, DAMO_SIGMA), DAMO_SIGMA),
+ DAMO_SIGMA,
).optimize()
SINGULAR_TO_PLURAL = graph_plural
PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
-TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)])
+TO_LOWER = pynini.union(
+ *[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]
+)
TO_UPPER = pynini.invert(TO_LOWER)
MIN_NEG_WEIGHT = -0.0001
MIN_POS_WEIGHT = 0.0001
-def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
+def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
"""
Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.
@@ -71,7 +95,7 @@
for rule, graph in graphs.items():
exporter[rule] = graph.optimize()
exporter.close()
- print(f'Created {file_name}')
+ print(f"Created {file_name}")
def get_plurals(fst):
@@ -98,7 +122,7 @@
return PLURAL_TO_SINGULAR @ fst
-def convert_space(fst) -> 'pynini.FstLike':
+def convert_space(fst) -> "pynini.FstLike":
"""
Converts space to nonbreaking space.
Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
@@ -109,7 +133,9 @@
Returns output fst where breaking spaces are converted to non breaking spaces
"""
- return fst @ pynini.cdrewrite(pynini.cross(DAMO_SPACE, DAMO_NON_BREAKING_SPACE), "", "", DAMO_SIGMA)
+ return fst @ pynini.cdrewrite(
+ pynini.cross(DAMO_SPACE, DAMO_NON_BREAKING_SPACE), "", "", DAMO_SIGMA
+ )
class GraphFst:
@@ -129,9 +155,11 @@
self._fst = None
self.deterministic = deterministic
- self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
+ self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
if self.far_exist():
- self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()
+ self._fst = Far(
+ self.far_path, mode="r", arc_type="standard", far_type="default"
+ ).get_fst()
def far_exist(self) -> bool:
"""
@@ -140,14 +168,14 @@
return self.far_path.exists()
@property
- def fst(self) -> 'pynini.FstLike':
+ def fst(self) -> "pynini.FstLike":
return self._fst
@fst.setter
def fst(self, fst):
self._fst = fst
- def add_tokens(self, fst) -> 'pynini.FstLike':
+ def add_tokens(self, fst) -> "pynini.FstLike":
"""
Wraps class name around to given fst
@@ -159,7 +187,7 @@
"""
return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")
- def delete_tokens(self, fst) -> 'pynini.FstLike':
+ def delete_tokens(self, fst) -> "pynini.FstLike":
"""
Deletes class name wrap around output of given fst
@@ -178,4 +206,4 @@
+ delete_space
+ pynutil.delete("}")
)
- return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", DAMO_SIGMA)
+ return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
--
Gitblit v1.9.1