王梦迪
2025-05-20 fe588bc508c0076bb007d6ed36c18ac8ecb341ac
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pynini
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
from fun_text_processing.text_normalization.ru.utils import get_abs_path
from pynini.lib import pynutil
 
 
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time written as "HH:MM".

    Three alternative readings are unioned into the final graph, e.g. for
    "02:15" the order-preserving reading produces the fields
        hours: "два часа" minutes: "пятнадцать минут" preserve_order: true
    (the conjunction "и " may optionally be inserted before the minutes),
    and "HH:00" produces only the ``hours`` field.  The other two readings
    ("minutes of the next hour" and "without N minutes to the next hour")
    are driven by the TSV tables under ``data/time``.

    Args:
        number_names: number_names for cardinal and ordinal numbers; only the
            "cardinal_names_nominative" entry is read here
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, number_names: dict, deterministic: bool = True):
        super().__init__(name="time", kind="classify", deterministic=deterministic)

        increment_hour_ordinal = pynini.string_file(
            get_abs_path("data/time/increment_hour_ordinal.tsv")
        )
        increment_hour_cardinal = pynini.string_file(
            get_abs_path("data/time/increment_hour_cardinal.tsv")
        )
        convert_hour = pynini.string_file(get_abs_path("data/time/time_convert.tsv"))

        # Drop at most one leading zero, then spell the number out as a
        # nominative cardinal.
        number = (
            pynini.closure(pynini.cross("0", ""), 0, 1) + number_names["cardinal_names_nominative"]
        )
        # Accepted hour spellings: inputs of the ordinal-increment table plus
        # outputs of the hour conversion table.
        hour_options = pynini.project(increment_hour_ordinal, "input")
        hour_options = hour_options | pynini.project(convert_hour, "output")

        # The noun form depends on the number: "час" after 1/21,
        # "часа" after 2-4/22/23, "часов" for every other accepted hour.
        hour_exception_ends_with_one = pynini.union(*["01", "21"])
        hour_exception_ends_rest = pynini.union(*["02", "03", "04", "22", "23"])
        hour_other = (
            pynini.difference(
                hour_options, pynini.union(hour_exception_ends_with_one, hour_exception_ends_rest)
            )
        ).optimize()

        hour = hour_exception_ends_with_one @ number + pynutil.insert(" час")
        hour |= hour_exception_ends_rest @ number + pynutil.insert(" часа")
        hour |= hour_other @ number + pynutil.insert(" часов")

        optional_and = pynini.closure(pynutil.insert("и "), 0, 1)
        digits = pynini.union(*[str(x) for x in range(10)])
        mins_start = pynini.union(*"012345")
        mins_options = mins_start + digits
        # Minutes ending in 1 take "минута", except 11, which (like 12-14
        # below) takes the plural "минут" and therefore must fall through to
        # ``mins_other``.
        mins_exception_ends_with_one = pynini.difference(
            mins_start + pynini.accep("1"), pynini.accep("11")
        )
        # Minutes ending in 2-4 take "минуты", except 12, 13 and 14.
        mins_exception_ends_rest = pynini.difference(
            mins_start + pynini.union(*"234"), pynini.union(*["12", "13", "14"])
        )
        mins_other = pynini.difference(
            mins_options, pynini.union(mins_exception_ends_with_one, mins_exception_ends_rest)
        )

        minutes = mins_exception_ends_with_one @ number + pynutil.insert(" минута")
        minutes |= mins_exception_ends_rest @ number + pynutil.insert(" минуты")
        minutes |= mins_other @ number + pynutil.insert(" минут")
        self.minutes = minutes.optimize()
        # 17:15 -> "семнадцать часов и пятнадцать минут"
        hm = (
            pynutil.insert('hours: "')
            + hour.optimize()
            + pynutil.insert('"')
            + (
                pynini.cross(":", " ")
                + pynutil.insert('minutes: "')
                + optional_and
                + minutes.optimize()
            )
            + pynutil.insert('"')
            + pynutil.insert(" preserve_order: true")
        )
        # Full hours: "HH:00" keeps only the hours field, ":00" is removed.
        h = pynutil.insert('hours: "') + hour + pynutil.insert('"') + pynutil.delete(":00")
        self.graph_preserve_order = (hm | h).optimize()

        # 17:15 -> "пятнадцать минут шестого"
        # Requires permutations for the correct verbalization
        self.increment_hour_ordinal = pynini.compose(
            hour_options, increment_hour_ordinal
        ).optimize()
        m_next_h = (
            pynutil.insert('hours: "')
            + self.increment_hour_ordinal
            + pynutil.insert('"')
            + pynini.cross(":", " ")
            + pynutil.insert('minutes: "')
            + minutes
            + pynutil.insert('"')
        )

        # 17:45 -> "без пятнадцати минут шесть"
        # Requires permutations for the correct verbalization
        self.mins_to_h = pynini.string_file(
            get_abs_path("data/time/minutes_to_hour.tsv")
        ).optimize()
        self.increment_hour_cardinal = pynini.compose(
            hour_options, increment_hour_cardinal
        ).optimize()
        m_to_h = (
            pynutil.insert('hours: "')
            + self.increment_hour_cardinal
            + pynutil.insert('"')
            + pynini.cross(":", " ")
            + pynutil.insert('minutes: "без ')
            + self.mins_to_h
            + pynutil.insert('"')
        )

        self.final_graph = m_next_h | self.graph_preserve_order | m_to_h
        self.fst = self.add_tokens(self.final_graph)
        self.fst = self.fst.optimize()