liugz18
2024-07-18 d80ac2fd2df4e7fb8a28acfa512bb11472b5cc99
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pynini
from fun_text_processing.text_normalization.zh.graph_utils import GraphFst
from fun_text_processing.text_normalization.zh.utils import (
    UNIT_1e01,
    UNIT_1e02,
    UNIT_1e03,
    UNIT_1e04,
    get_abs_path,
)
from pynini.lib import pynutil
 
 
class Cardinal(GraphFst):
    """
    Classifier (tagger) for Chinese cardinal numbers.

    Two transducers are built:

    self.graph_cardinal -- reads a number as a quantity, with an optional
    sign prefix and an optional decimal part:
    5       -> 五
    12      -> 十二
    213     -> 二百一十三
    3123    -> 三千一百二十三
    51234   -> 五万一千二百三十四
    51,234  -> 五万一千二百三十四
    0.125   -> 零点一二五

    self.fst -- reads a number digit by digit and wraps it as a token:
    123     -> cardinal{ integer: "一二三" }

    Args:
        deterministic: forwarded unchanged to ``GraphFst.__init__``.
        lm: accepted for interface compatibility; it is not used by this class.
    """

    def __init__(self, deterministic: bool = True, lm: bool = False):
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
        # Base mappings from arabic digits / sign symbols to Chinese, loaded from data files.
        graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
        # NOTE(review): presumably the leading-digit mapping for a standalone two-digit
        # number (so that 12 reads 十二 rather than 一十二) -- confirm against digit_teen.tsv.
        graph_teen = pynini.string_file(get_abs_path("data/number/digit_teen.tsv"))
        graph_sign = pynini.string_file(get_abs_path("data/number/sign.tsv"))

        graph_digit_with_zero = graph_digit | graph_zero  # from 0 (read out) to 9
        graph_no_zero = pynini.cross("0", "")  # a zero that is consumed but not read out
        graph_digit_no_zero = graph_digit | graph_no_zero  # from 0 (not read out) to 9
        insert_zero = pynutil.insert("零")
        delete_punct = pynutil.delete(",")  # thousands separator, as in the English form 5,000

        # 15 in 215
        graph_ten_u = (graph_digit + pynutil.insert(UNIT_1e01) + graph_digit_no_zero) | (
            graph_zero + graph_digit
        )
        # 15 only
        graph_ten = (graph_teen + pynutil.insert(UNIT_1e01) + graph_digit_no_zero) | (
            graph_zero + graph_digit
        )
        # 215 or 200
        graph_hundred = (graph_digit + pynutil.insert(UNIT_1e02) + graph_ten_u) | (
            graph_digit + pynutil.insert(UNIT_1e02) + graph_no_zero**2
        )
        # 3000 or 3002 or 3012 or 3123
        # both with 3,000 or 3,002 or 3,012 or 3,123
        # The separator is optional (`.ques`) in every branch so that the comma-less
        # form (e.g. 3123) is accepted as well as the comma form (3,123).
        graph_thousand_sign = (
            (graph_digit + pynutil.insert(UNIT_1e03) + delete_punct.ques + graph_hundred)
            | (
                graph_digit
                + pynutil.insert(UNIT_1e03)
                + delete_punct.ques
                + graph_zero
                + graph_digit
                + pynutil.insert(UNIT_1e01)
                + graph_digit_no_zero
            )
            | (
                graph_digit
                + pynutil.insert(UNIT_1e03)
                + delete_punct.ques
                + graph_zero
                + graph_no_zero
                + graph_digit
            )
            | (graph_digit + pynutil.insert(UNIT_1e03) + delete_punct.ques + graph_no_zero**3)
        )
        # 20001234 or 2001234 or 201234 or 21234 or 20234 or 20023 or 20002 or 20000
        # 8 digits max supported; numbers with 9 or more digits are usually written
        # with a unit instead of being read out in pure cardinal form.
        graph_ten_thousand_sign = (
            # high part: everything above the lowest four digits
            (graph_thousand_sign | graph_hundred | graph_ten | graph_digit_no_zero)
            + pynutil.insert(UNIT_1e04)
            # low part: exactly four digits; leading zeros are dropped and a single
            # "零" is inserted before the first non-zero digit
            + (
                graph_thousand_sign
                | (graph_no_zero + delete_punct.ques + insert_zero + graph_hundred)
                | (
                    graph_no_zero
                    + delete_punct.ques
                    + graph_no_zero
                    + insert_zero
                    + (graph_digit + pynutil.insert(UNIT_1e01) + graph_digit_no_zero)
                )
                | (graph_no_zero + delete_punct.ques + graph_no_zero**2 + insert_zero + graph_digit)
                | (graph_no_zero**4)
            )
        )

        # Number string like 123 or 123.456, read digit by digit;
        # used for phone numbers, ID numbers, etc.
        graph_numstring = pynini.closure(graph_digit_with_zero, 1) | (
            pynini.closure(graph_digit_with_zero, 1)
            + pynini.cross(".", "点")
            + pynini.closure(graph_digit_with_zero, 1)
        )

        graph = (
            graph_hundred
            | graph_ten
            | graph_digit_with_zero
            | graph_thousand_sign
            | graph_ten_thousand_sign
        )
        # 456.123 -- integer part read as a quantity, fractional part digit by digit
        graph_decimal = graph + pynini.cross(".", "点") + pynini.closure(graph_digit_with_zero, 1)
        graph_cardinal = graph | graph_decimal
        signed_cardinal = graph_sign + graph_cardinal
        self.graph_cardinal = (graph_cardinal | signed_cardinal).optimize()

        # Only the digit-by-digit reading is exposed as this tagger's token FST.
        graph_numstring = self.add_tokens(
            pynutil.insert('integer: "') + graph_numstring + pynutil.insert('"')
        )
        self.fst = graph_numstring.optimize()