huangmingming
2023-03-13 49c00a7d6cb9c05d4bd0bb0fc8b59a2eed4b8950
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
 
 
import pynini
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
from fun_text_processing.text_normalization.ru.utils import get_abs_path
from pynini.lib import pynutil
 
 
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time written as "HH:MM", e.g.
        "02:15" -> time { hours: "два часа" minutes: "пятнадцать минут" preserve_order: true }
    (the exact number words come from ``number_names``; the ``time { ... }``
    wrapper is added by ``add_tokens``).

    Three alternative readings are unioned into the final graph:
        * hours followed by minutes, in the written order (``graph_preserve_order``);
        * minutes followed by the ordinal form of the next hour;
        * "без" + minutes remaining until the next hour + the next hour.

    Args:
        number_names: number_names for cardinal and ordinal numbers; only the
            'cardinal_names_nominative' entry is read here
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, number_names: dict, deterministic: bool = True):
        super().__init__(name="time", kind="classify", deterministic=deterministic)

        increment_hour_ordinal = pynini.string_file(get_abs_path("data/time/increment_hour_ordinal.tsv"))
        increment_hour_cardinal = pynini.string_file(get_abs_path("data/time/increment_hour_cardinal.tsv"))
        convert_hour = pynini.string_file(get_abs_path("data/time/time_convert.tsv"))

        # Drop one optional leading zero, then spell the number out in the nominative case.
        number = pynini.closure(pynini.cross("0", ""), 0, 1) + number_names['cardinal_names_nominative']

        # Accepted hour strings: the inputs of the ordinal increment table plus
        # the outputs of the time conversion table.
        hour_options = pynini.project(increment_hour_ordinal, "input")
        hour_options = hour_options | pynini.project(convert_hour, "output")

        # The noun form depends on the number: "час" after 1/21, "часа" after
        # 2-4/22-23, "часов" for every other accepted hour.
        hour_exception_ends_with_one = pynini.union(*["01", "21"])
        hour_exception_ends_rest = pynini.union(*["02", "03", "04", "22", "23"])
        hour_other = (
            pynini.difference(hour_options, pynini.union(hour_exception_ends_with_one, hour_exception_ends_rest))
        ).optimize()

        hour = (hour_exception_ends_with_one @ number) + pynutil.insert(" час")
        hour |= (hour_exception_ends_rest @ number) + pynutil.insert(" часа")
        hour |= (hour_other @ number) + pynutil.insert(" часов")

        optional_and = pynini.closure(pynutil.insert("и "), 0, 1)
        digits = pynini.union(*[str(x) for x in range(10)])
        mins_start = pynini.union(*"012345")
        mins_options = mins_start + digits  # "00" .. "59"

        # Minutes ending in 1 take the singular "минута" -- except 11, which,
        # like 12-14 below, takes the genitive plural "минут". Removing "11"
        # here lets it fall through to `mins_other`.
        mins_exception_ends_with_one = pynini.difference(mins_start + pynini.accep("1"), pynini.accep("11"))
        # Minutes ending in 2-4 take "минуты", except the teens 12-14.
        mins_exception_ends_rest = pynini.difference(
            mins_start + pynini.union(*"234"), pynini.union(*["12", "13", "14"])
        )
        mins_other = pynini.difference(
            mins_options, pynini.union(mins_exception_ends_with_one, mins_exception_ends_rest)
        )

        minutes = (mins_exception_ends_with_one @ number) + pynutil.insert(" минута")
        minutes |= (mins_exception_ends_rest @ number) + pynutil.insert(" минуты")
        minutes |= (mins_other @ number) + pynutil.insert(" минут")
        self.minutes = minutes.optimize()

        # 17:15 -> "семнадцать часов и пятнадцать минут"
        # Hours then minutes in the written order; "и " before the minutes is optional.
        hm = (
            pynutil.insert("hours: \"")
            + hour.optimize()
            + pynutil.insert("\"")
            + (pynini.cross(":", " ") + pynutil.insert("minutes: \"") + optional_and + minutes.optimize())
            + pynutil.insert("\"")
            + pynutil.insert(" preserve_order: true")
        )
        # Whole hours: the ":00" suffix is deleted and only the hours field is emitted.
        h = pynutil.insert("hours: \"") + hour + pynutil.insert("\"") + pynutil.delete(":00")
        self.graph_preserve_order = (hm | h).optimize()

        # 17:15 -> "пятнадцать минут шестого"
        # Requires permutations for the correct verbalization
        self.increment_hour_ordinal = pynini.compose(hour_options, increment_hour_ordinal).optimize()
        m_next_h = (
            pynutil.insert("hours: \"")
            + self.increment_hour_ordinal
            + pynutil.insert("\"")
            + pynini.cross(":", " ")
            + pynutil.insert("minutes: \"")
            + minutes
            + pynutil.insert("\"")
        )

        # 17:45 -> "без пятнадцати минут шесть"
        # Requires permutations for the correct verbalization
        self.mins_to_h = pynini.string_file(get_abs_path("data/time/minutes_to_hour.tsv")).optimize()
        self.increment_hour_cardinal = pynini.compose(hour_options, increment_hour_cardinal).optimize()
        m_to_h = (
            pynutil.insert("hours: \"")
            + self.increment_hour_cardinal
            + pynutil.insert("\"")
            + pynini.cross(":", " ")
            + pynutil.insert("minutes: \"без ")
            + self.mins_to_h
            + pynutil.insert("\"")
        )

        # All three readings are offered as alternatives.
        self.final_graph = m_next_h | self.graph_preserve_order | m_to_h
        self.fst = self.add_tokens(self.final_graph)
        self.fst = self.fst.optimize()