chenmengzheAAA
2023-09-14 2a66366be4c2715870e4859fd5a5db6e8a9dc00a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
 
 
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
    DAMO_CHAR,
    DAMO_DIGIT,
    DAMO_SIGMA,
    DAMO_SPACE,
    GraphFst,
)
from fun_text_processing.text_normalization.es.utils import get_abs_path
from pynini.lib import pynutil
 
# Fraction data tables, each compiled into an FST from a TSV file whose path is
# resolved through get_abs_path. `ordinal_exceptions` is applied as a rewrite on
# ordinal denominators; `higher_powers_of_ten` supplies the tail of large
# denominators (see FractionFst).
ordinal_exceptions, higher_powers_of_ten = (
    pynini.string_file(get_abs_path(tsv_path))
    for tsv_path in (
        "data/fractions/ordinal_exceptions.tsv",
        "data/fractions/powers_of_ten.tsv",
    )
)
 
 
class FractionFst(GraphFst):
    """
    Finite state transducer for classifying fractions, e.g.
    "23 4/5" ->
    tokens { fraction { integer_part: "veintitrés" numerator: "cuatro" denominator: "quinto" morphosyntactic_features: "ordinal" } }

    The accepted input is an optional integer part followed by a separator,
    a numerator, a slash ("/" or " / ") and a denominator. The denominator is
    rendered by one of three alternatives:

    * an ordinal reading, tagged ``morphosyntactic_features: "ordinal"``;
    * a cardinal reading, tagged ``morphosyntactic_features: "add_root"``,
      for inputs that are not claimed by the ordinal reading;
    * a plain cardinal with no feature tag, weighted 0.001 as a last resort.

    Args:
        cardinal: CardinalFst; its ``graph`` attribute is used for the integer
            part, the numerator and the cardinal denominators
        ordinal: OrdinalFst; its ``graph``, ``multiples_of_thousand`` and
            ``one_to_one_thousand`` attributes are used for ordinal denominators
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, ordinal: GraphFst, deterministic: bool = True) -> None:
        super().__init__(name="fraction", kind="classify", deterministic=deterministic)
        cardinal_graph = cardinal.graph
        ordinal_graph = ordinal.graph

        # 2-10 are all ordinals
        # NOTE: despite its name, `three_to_ten` also contains "2".
        three_to_ten = pynini.string_map(["2", "3", "4", "5", "6", "7", "8", "9", "10",])
        # Built before the optional tens are added below, so presumably only 2-10 are
        # blocked from the cardinal path — TODO confirm pynutil.delete copies its argument.
        block_three_to_ten = pynutil.delete(three_to_ten)  # To block cardinal productions
        if not deterministic:  # Multiples of tens are sometimes rendered as ordinals
            three_to_ten |= pynini.string_map(["20", "30", "40", "50", "60", "70", "80", "90",])
        # Digits -> ordinal words, then apply the fraction-specific rewrites from
        # ordinal_exceptions.tsv anywhere in the result (no context restriction).
        graph_three_to_ten = three_to_ten @ ordinal_graph
        graph_three_to_ten @= pynini.cdrewrite(ordinal_exceptions, "", "", DAMO_SIGMA)

        # Higher powers of tens (and multiples) are converted to ordinals.
        hundreds = pynini.string_map(["100", "200", "300", "400", "500", "600", "700", "800", "900",])
        graph_hundreds = hundreds @ ordinal_graph

        multiples_of_thousand = ordinal.multiples_of_thousand  # So we can have X milésimos

        # Optional prefix in front of a power-of-ten word: each of the three pieces
        # may be absent.
        graph_higher_powers_of_ten = (
            pynini.closure(ordinal.one_to_one_thousand + DAMO_SPACE, 0, 1)
            + pynini.closure("mil ", 0, 1)
            + pynini.closure(ordinal.one_to_one_thousand + DAMO_SPACE, 0, 1)
        )  # x millones / x mil millones / x mil z millones
        graph_higher_powers_of_ten += higher_powers_of_ten
        # Feed the digit string through the cardinal graph first, so the pattern above
        # is matched against the cardinal's output rather than against raw digits.
        graph_higher_powers_of_ten = cardinal_graph @ graph_higher_powers_of_ten
        # Delete a string-initial "un " when it is directly followed by one of the
        # output forms of higher_powers_of_ten.
        graph_higher_powers_of_ten @= pynini.cdrewrite(
            pynutil.delete("un "), pynini.accep("[BOS]"), pynini.project(higher_powers_of_ten, "output"), DAMO_SIGMA
        )  # we drop 'un' from these ordinals (millionths, not one-millionths)

        # All "large" ordinal denominators; their input side is also what must be
        # removed from the cardinal ("add_root") path below.
        graph_higher_powers_of_ten = multiples_of_thousand | graph_hundreds | graph_higher_powers_of_ten
        block_higher_powers_of_ten = pynutil.delete(
            pynini.project(graph_higher_powers_of_ten, "input")
        )  # For cardinal graph

        # The inserted text closes the denominator quote and appends the feature tag.
        graph_fractions_ordinals = graph_higher_powers_of_ten | graph_three_to_ten
        graph_fractions_ordinals += pynutil.insert(
            "\" morphosyntactic_features: \"ordinal\""
        )  # We note the root for processing later

        # Blocking the digits and hundreds from Cardinal graph
        # The rewrite only fires on a whole-string match ([BOS] ... [EOS]) and deletes it.
        graph_fractions_cardinals = pynini.cdrewrite(
            block_three_to_ten | block_higher_powers_of_ten, pynini.accep("[BOS]"), pynini.accep("[EOS]"), DAMO_SIGMA
        )
        graph_fractions_cardinals @= DAMO_CHAR.plus @ pynini.cdrewrite(
            pynutil.delete("0"), pynini.accep("[BOS]"), pynini.accep("[EOS]"), DAMO_SIGMA
        )  # Empty characters become '0' for DAMO_CHAR fst, so need to block
        # Whatever survives the blocking is read as a cardinal and tagged "add_root".
        graph_fractions_cardinals @= cardinal_graph
        graph_fractions_cardinals += pynutil.insert(
            "\" morphosyntactic_features: \"add_root\""
        )  # blocking these entries to reduce erroneous possibilities in debugging

        if deterministic:
            # Restrict the "add_root" reading to inputs of one or two digits.
            graph_fractions_cardinals = (
                pynini.closure(DAMO_DIGIT, 1, 2) @ graph_fractions_cardinals
            )  # Past hundreds the conventional scheme can be hard to read. For determinism we stop here

        graph_denominator = pynini.union(
            graph_fractions_ordinals,
            graph_fractions_cardinals,
            pynutil.add_weight(cardinal_graph + pynutil.insert("\""), 0.001),
        )  # Last form is simply recording the cardinal. Weighting so last resort

        # Optional integer part; when present it must be followed by DAMO_SPACE.
        integer = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\"") + DAMO_SPACE
        # The slash (with or without surrounding spaces) is replaced by the closing
        # quote of the numerator field plus a separating space.
        numerator = (
            pynutil.insert("numerator: \"") + cardinal_graph + (pynini.cross("/", "\" ") | pynini.cross(" / ", "\" "))
        )
        # Only the opening quote is inserted here; every alternative in
        # graph_denominator inserts its own closing quote.
        denominator = pynutil.insert("denominator: \"") + graph_denominator

        # Untokenized grammar, kept on the instance as `graph`.
        self.graph = pynini.closure(integer, 0, 1) + numerator + denominator

        # Wrap with add_tokens and optimize to obtain the final classifier FST.
        final_graph = self.add_tokens(self.graph)
        self.fst = final_graph.optimize()