游雁
2024-07-16 f0eec4c4da6c4a7fb16bebbda9a928d94b93fd1c
fun_text_processing/inverse_text_normalization/ko/graph_utils.py
@@ -1,4 +1,3 @@
import os
import string
from pathlib import Path
@@ -19,9 +18,9 @@
# Character-class constants. NOTE(review): this fragment comes from a diff
# hunk; statements that appear twice in a row look like the pre-change and
# post-change forms of the same line.
# Union of the DAMO_LOWER and DAMO_UPPER acceptors.
DAMO_ALPHA = pynini.union(DAMO_LOWER, DAMO_UPPER).optimize()
# Union of the DAMO_DIGIT and DAMO_ALPHA acceptors.
DAMO_ALNUM = pynini.union(DAMO_DIGIT, DAMO_ALPHA).optimize()
# One alternative per character of string.hexdigits (0-9, a-f, A-F).
DAMO_HEX = pynini.union(*string.hexdigits).optimize()
# The next two assignments differ only by the `u` string prefix, which has no
# effect in Python 3; both bind the single character U+00A0.
DAMO_NON_BREAKING_SPACE = u"\u00A0"
DAMO_NON_BREAKING_SPACE = "\u00A0"
DAMO_SPACE = " "
# Space, tab, LF, CR and U+00A0 (again the two forms differ only by the `u` prefix).
DAMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
DAMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
# DAMO_CHAR minus everything in DAMO_WHITE_SPACE.
DAMO_NOT_SPACE = pynini.difference(DAMO_CHAR, DAMO_WHITE_SPACE).optimize()
# DAMO_CHAR minus the double-quote character.
DAMO_NOT_QUOTE = pynini.difference(DAMO_CHAR, r'"').optimize()
@@ -36,31 +35,55 @@
# Maps a run of one or more DAMO_WHITE_SPACE characters to a single space.
delete_extra_space = pynini.cross(pynini.closure(DAMO_WHITE_SPACE, 1), " ")
# Zero or more repetitions of either marker below, with the marker text deleted:
#   ` preserve_order: true`
#   ` field_order: "<DAMO_NOT_QUOTE match>"`  (only the label and the quotes
#   are deleted; the DAMO_NOT_QUOTE match itself is passed through).
# NOTE(review): the two `field_order` alternatives spell the same string
# literals with different quoting; they look like the old and new forms of
# one diff line rather than two distinct alternatives.
delete_preserve_order = pynini.closure(
    pynutil.delete(" preserve_order: true")
    | (pynutil.delete(" field_order: \"") + DAMO_NOT_QUOTE + pynutil.delete("\""))
    | (pynutil.delete(' field_order: "') + DAMO_NOT_QUOTE + pynutil.delete('"'))
)
# Singular -> plural graph for English-style suffix rules.
# Mapping loaded from data/suppletive.tsv (path resolved by get_abs_path);
# presumably irregular plural pairs, judging by the name -- confirm against the data file.
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
# _v = pynini.union("a", "e", "i", "o", "u")
# The 21 ASCII consonant letters (every lower-case letter except a, e, i, o, u).
# NOTE(review): the first argument line below is the one-line form and the
# following lines repeat the same letters one per line; they look like the
# old and new sides of a reformatting diff. Read literally, the missing comma
# after "z" would concatenate it with the next "b".
_c = pynini.union(
    "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"
    "b",
    "c",
    "d",
    "f",
    "g",
    "h",
    "j",
    "k",
    "l",
    "m",
    "n",
    "p",
    "q",
    "r",
    "s",
    "t",
    "v",
    "w",
    "x",
    "y",
    "z",
)
# <anything> + consonant + "y"  ->  <anything> + consonant + "ies"
_ies = DAMO_SIGMA + _c + pynini.cross("y", "ies")
# <anything> ending in s / sh / ch / x / z, with "es" appended.
_es = DAMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
# <anything> with "s" appended (the catch-all rule).
_s = DAMO_SIGMA + pynutil.insert("s")
# Rules are nested through plurals._priority_union in the order
# suppletive, _ies, _es, _s, each call also receiving DAMO_SIGMA.
# Presumably the earlier argument wins when both apply -- confirm against
# plurals._priority_union, which is not visible here.
# NOTE(review): the long argument line and the three lines after it carry the
# same arguments; they look like the old and new forms of one diff line.
graph_plural = plurals._priority_union(
    suppletive, plurals._priority_union(_ies, plurals._priority_union(_es, _s, DAMO_SIGMA), DAMO_SIGMA), DAMO_SIGMA
    suppletive,
    plurals._priority_union(_ies, plurals._priority_union(_es, _s, DAMO_SIGMA), DAMO_SIGMA),
    DAMO_SIGMA,
).optimize()
# Public aliases: the forward graph and its inversion.
SINGULAR_TO_PLURAL = graph_plural
PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
# Per-letter case mapping: each ASCII upper-case letter is crossed to its
# lower-case counterpart and the 26 single-letter transducers are unioned.
# NOTE(review): the one-line and three-line forms below are the same
# expression; they look like the old and new sides of a reformatting diff.
TO_LOWER = pynini.union(*[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)])
TO_LOWER = pynini.union(
    *[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]
)
# Inverse mapping (lower-case -> upper-case).
TO_UPPER = pynini.invert(TO_LOWER)
# Small-magnitude weight constants; their users are not visible in this fragment.
MIN_NEG_WEIGHT = -0.0001
MIN_POS_WEIGHT = 0.0001
def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
    """
    Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.
@@ -72,7 +95,7 @@
    for rule, graph in graphs.items():
        exporter[rule] = graph.optimize()
    exporter.close()
    print(f'Created {file_name}')
    print(f"Created {file_name}")
def get_plurals(fst):
@@ -99,7 +122,7 @@
    return PLURAL_TO_SINGULAR @ fst
def convert_space(fst) -> 'pynini.FstLike':
def convert_space(fst) -> "pynini.FstLike":
    """
    Converts space to nonbreaking space.
    Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
@@ -110,7 +133,9 @@
    Returns output fst where breaking spaces are converted to non breaking spaces
    """
    return fst @ pynini.cdrewrite(pynini.cross(DAMO_SPACE, DAMO_NON_BREAKING_SPACE), "", "", DAMO_SIGMA)
    return fst @ pynini.cdrewrite(
        pynini.cross(DAMO_SPACE, DAMO_NON_BREAKING_SPACE), "", "", DAMO_SIGMA
    )
class GraphFst:
@@ -130,9 +155,11 @@
        self._fst = None
        self.deterministic = deterministic
        self.far_path = Path(os.path.dirname(__file__) + '/grammars/' + kind + '/' + name + '.far')
        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
        if self.far_exist():
            self._fst = Far(self.far_path, mode="r", arc_type="standard", far_type="default").get_fst()
            self._fst = Far(
                self.far_path, mode="r", arc_type="standard", far_type="default"
            ).get_fst()
    def far_exist(self) -> bool:
        """
@@ -141,14 +168,14 @@
        return self.far_path.exists()
    # Read accessor: returns whatever is currently stored in self._fst.
    # NOTE(review): the two `def fst` getter lines differ only in the quote
    # style of the return annotation; they look like the old and new forms of
    # one diff line.
    @property
    def fst(self) -> 'pynini.FstLike':
    def fst(self) -> "pynini.FstLike":
        return self._fst
    # Write accessor: stores the given fst in self._fst without validation.
    @fst.setter
    def fst(self, fst):
        self._fst = fst
    def add_tokens(self, fst) -> 'pynini.FstLike':
    def add_tokens(self, fst) -> "pynini.FstLike":
        """
        Wraps the class name around the given fst
@@ -160,7 +187,7 @@
        """
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")
    def delete_tokens(self, fst) -> 'pynini.FstLike':
    def delete_tokens(self, fst) -> "pynini.FstLike":
        """
        Deletes class name wrap around output of given fst
@@ -179,4 +206,4 @@
            + delete_space
            + pynutil.delete("}")
        )
        return res @ pynini.cdrewrite(pynini.cross(u"\u00A0", " "), "", "", DAMO_SIGMA)
        return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)