jmwang66
2023-05-09 8dab6d184a034ca86eafa644ea0d2100aadfe27d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
 
 
import pynini
from fun_text_processing.text_normalization.zh.graph_utils import GraphFst, insert_space
from fun_text_processing.text_normalization.zh.utils import get_abs_path
from pynini.lib import pynutil
 
 
class Time(GraphFst):
    """
    Classifier grammar for colon-separated clock times.

    Illustrative taggings (the emitted field values are whatever the TSV
    mappings under data/time/ produce for the matched digits):
        1:02    -> tokens { time { hours: "1" minutes: "02" } }
        1:02:36 -> tokens { time { hours: "1" minutes: "02" seconds: "36" } }
        1:02 am -> tokens { time { hours: "1" minutes: "02" suffix: "am" } }

    NOTE(review): a space is *inserted* before the suffix field on the output
    side; whether whitespace between the time and the suffix is accepted on
    the input side depends on the entries of data/time/suffix.tsv -- confirm.

    Args:
        deterministic: forwarded unchanged to ``GraphFst.__init__``.
        lm: accepted for signature compatibility; this constructor does not
            read it.
    """

    def __init__(self, deterministic: bool = True, lm: bool = False):
        super().__init__(name="time", kind="classify", deterministic=deterministic)

        def load(relative_path):
            # Compile a TSV mapping shipped with the package into an FST.
            return pynini.string_file(get_abs_path(relative_path))

        def field(label, value_fst):
            # Wrap the rewritten value as `<label>: "<value>"` on the output side.
            return pynutil.insert(label + ': "') + value_fst + pynutil.insert('"')

        hour = load("data/time/hour.tsv")
        tens = load("data/time/tens.tsv")
        digit = load("data/time/digit.tsv")
        zero = load("data/time/zero.tsv")
        suffix = load("data/time/suffix.tsv")

        # A units position may be matched by either the digit or the zero table.
        unit = digit | zero

        # Minutes always take a tens entry followed by a units entry; seconds
        # additionally allow a lone units entry.
        minute = tens + unit
        second = minute | unit

        # The ':' separator is rewritten to the single space that separates
        # two adjacent fields in the tagged output.
        colon_to_space = pynini.cross(':', ' ')

        # 5:05, 14:30
        hour_minute = field('hours', hour) + colon_to_space + field('minutes', minute)

        # 1:30:15 -- same prefix as above, extended with a seconds field.
        hour_minute_second = hour_minute + colon_to_space + field('seconds', second)

        plain_time = hour_minute | hour_minute_second

        # Either time shape may be followed by a suffix field.
        time_with_suffix = plain_time + insert_space + field('suffix', suffix)

        self.fst = self.add_tokens(plain_time | time_with_suffix).optimize()