shixian.shi
2023-06-27 25de54910e7b48552bdba2dd655fbcd64a07668e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
 
 
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_DIGIT, DAMO_SPACE, GraphFst, delete_extra_space
from fun_text_processing.text_normalization.es.utils import get_abs_path
from pynini.lib import pynutil
 
# Words that may stand in front of a year in a textual date ("de", "del", "el", "del año").
# DateFst accepts at most one of them, followed by a space, ahead of the year digits.
articles = pynini.union(*("de", "del", "el", "del año"))
# Exactly two digits; a leading "0" is deleted, any other leading digit is kept
# ("04" -> "4", "12" -> "12").
delete_leading_zero = pynini.union(pynutil.delete("0"), DAMO_DIGIT - "0") + DAMO_DIGIT
# Table read from data/dates/months.tsv. DateFst composes month digits with it and takes
# its output projection as the set of month names, so it maps month numbers to names.
month_numbers = pynini.string_file(get_abs_path("data/dates/months.tsv"))
 
 
class DateFst(GraphFst):
    """
    Finite state transducer for classifying date, e.g.
        "01.01.2010" -> date { day: "un" month: "enero" year: "dos mil diez" preserve_order: true }
        "marzo 4 2000" -> date { month: "marzo" day: "cuatro" year: "dos mil" }
        "1990-01-20" -> date { year: "mil novecientos noventa" month: "enero" day: "veinte" }

    Three layouts are recognised:
        * day-month-year: "<day> [de] <month name> [[article] <year>]" or
          "<day><sep><month number>[<sep><year>]" with <sep> one of ".", "-", "/".
          Only this layout is tagged with ``preserve_order: true``.
        * month-day-year: "<month name> <day> [[article] <year>]"; the numeric
          "<month><sep><day>[<sep><year>]" variant is added only when
          ``deterministic`` is False.
        * year-month-day: "<4-digit year>-<month number>[-<day>]" (dash only).

    Days are limited to 1..31 and numeric months to 1..12, each with an optional
    leading zero. Years are 2 to 4 digits and may not start with "0".

    Attributes:
        year: transducer from year digits to their spoken form (no field wrapper).
        final_graph: optimized union of the three layouts, before token wrapping.
        fst: ``final_graph`` wrapped by ``add_tokens``.

    Args:
        cardinal: cardinal GraphFst
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, cardinal: GraphFst, deterministic: bool) -> None:
        super().__init__(name="date", kind="classify", deterministic=deterministic)

        numbers = cardinal.graph
        optional_leading_zero = delete_leading_zero | DAMO_DIGIT

        # Day: "01", "31", "1" -> restricted to 1..31 -> spoken cardinal.
        digit_day = optional_leading_zero @ pynini.union(*[str(x) for x in range(1, 32)]) @ numbers
        day = (pynutil.insert("day: \"") + digit_day + pynutil.insert("\"")).optimize()

        # Month: either a month name taken verbatim (the output side of the table),
        # or digits restricted to 1..12 and mapped through the table.
        month_table = month_numbers.optimize()
        month_names = pynini.project(month_table, "output")
        digit_month = optional_leading_zero @ pynini.union(*[str(x) for x in range(1, 13)])
        month_from_digits = digit_month @ month_table

        month_name = (pynutil.insert("month: \"") + month_names + pynutil.insert("\"")).optimize()
        month_number = (pynutil.insert("month: \"") + month_from_digits + pynutil.insert("\"")).optimize()

        # prefer cardinal over year
        year_digits = (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT, 1, 3)  # 90, 990, 1990
        year = year_digits @ numbers
        self.year = year

        year_only = pynutil.insert("year: \"") + year + pynutil.insert("\"")
        # In textual dates an article/preposition may precede the year; it is accepted
        # (not deleted) here, so it ends up inside the year field.
        year_with_articles = (
            pynutil.insert("year: \"") + pynini.closure(articles + DAMO_SPACE, 0, 1) + year + pynutil.insert("\"")
        )
        optional_textual_year = pynini.closure(DAMO_SPACE + year_with_articles, 0, 1)

        graph_dmy = (
            day
            # The preposition is optional but must not repeat: "4 de marzo" / "4 marzo".
            + pynini.closure(pynutil.delete(" de"), 0, 1)
            + DAMO_SPACE
            + month_name
            + optional_textual_year
        )

        # English influences on language: "marzo 4 2000".
        # NOTE(review): delete_extra_space presumably normalises the whitespace between
        # month and day - confirm against graph_utils.
        graph_mdy = month_name + delete_extra_space + day + optional_textual_year

        # Numeric dates: the same separator is used throughout one date and is
        # rewritten to a space between the emitted fields.
        for sep in (".", "-", "/"):
            sep_to_space = pynini.cross(sep, DAMO_SPACE)
            year_optional = pynini.closure(sep_to_space + year_only, 0, 1)
            graph_dmy |= day + sep_to_space + month_number + year_optional
            if not deterministic:
                # Month-first numeric dates are ambiguous with day-first ones, so they
                # are only offered when multiple transductions are allowed.
                graph_mdy |= month_number + sep_to_space + day + year_optional

        # ISO-like "YYYY-MM[-DD]": the year must be exactly four digits.
        dash_to_space = pynini.cross("-", DAMO_SPACE)
        day_optional = pynini.closure(dash_to_space + day, 0, 1)
        graph_ymd = ((DAMO_DIGIT ** 4) @ year_only) + dash_to_space + month_number + day_optional

        # Only the day-month-year layout carries the preserve_order marker.
        final_graph = graph_dmy + pynutil.insert(" preserve_order: true")
        final_graph |= graph_ymd
        final_graph |= graph_mdy

        self.final_graph = final_graph.optimize()
        self.fst = self.add_tokens(self.final_graph).optimize()