游雁
2023-03-23 129cfcd9f283dea0d64f2e20b77662febc2d802c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
 
 
import pynini
from fun_text_processing.text_normalization.zh.graph_utils import GraphFst
from fun_text_processing.text_normalization.zh.utils import UNIT_1e01, UNIT_1e02, UNIT_1e03, UNIT_1e04, get_abs_path
from pynini.lib import pynutil
 
 
class Cardinal(GraphFst):
    '''
        self.graph_cardinal:
        5       -> 五
        12      -> 十二
        213     -> 二百一十三
        3123    -> 三千一百二十三
        51234   -> 五万一千二百三十四
        51,234  -> 五万一千二百三十四
        0.125   -> 零点一二五
        self.fst:
        123     -> cardinal{ integer: "一二三" }
    '''
 
    def __init__(self, deterministic: bool = True, lm: bool = False):
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
        # base cardinal from digit to chinese
        graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
        graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/number/digit_teen.tsv"))
        graph_sign = pynini.string_file(get_abs_path("data/number/sign.tsv"))
 
        graph_digit_with_zero = graph_digit | graph_zero  # from 0(read out) to 9
        graph_no_zero = pynini.cross("0", "")  # cardinalzero,in some case we don't read it out
        graph_digit_no_zero = graph_digit | graph_no_zero  # from 0(not read out) to 9
        insert_zero = pynutil.insert('零')
        delete_punct = pynutil.delete(",")  # to deal with 5,000,cardinalin english form
 
        # 15 in 215
        graph_ten_u = (graph_digit + pynutil.insert(UNIT_1e01) + graph_digit_no_zero) | (graph_zero + graph_digit)
        # 15 only
        graph_ten = (graph_teen + pynutil.insert(UNIT_1e01) + graph_digit_no_zero) | (graph_zero + graph_digit)
        # 215 or 200
        graph_hundred = (graph_digit + pynutil.insert(UNIT_1e02) + graph_ten_u) | (
            graph_digit + pynutil.insert(UNIT_1e02) + graph_no_zero ** 2
        )
        # 3000 or 3002 or 3012 or 3123
        # both with 3,000 or 3,002 or 3,012 or 3,123
        graph_thousand_sign = (
            (graph_digit + pynutil.insert(UNIT_1e03) + delete_punct + graph_hundred)
            | (
                graph_digit
                + pynutil.insert(UNIT_1e03)
                + delete_punct.ques
                + graph_zero
                + graph_digit
                + pynutil.insert(UNIT_1e01)
                + graph_digit_no_zero
            )
            | (graph_digit + pynutil.insert(UNIT_1e03) + delete_punct.ques + graph_zero + graph_no_zero + graph_digit)
            | (graph_digit + pynutil.insert(UNIT_1e03) + delete_punct.ques + graph_no_zero ** 3)
        )
        # 20001234 or 2001234 or 201234 or 21234 or 20234 or 20023 or 20002 or 20000
        # 8 digits max supported,for 9 digits cardinal,often write with unit instead of read it in only cardinalform
        graph_ten_thousand_sign = (
            (graph_thousand_sign | graph_hundred | graph_ten | graph_digit_no_zero)
            + pynutil.insert(UNIT_1e04)
            + (
                graph_thousand_sign
                | (graph_no_zero + delete_punct.ques + insert_zero + graph_hundred)
                | (
                    graph_no_zero
                    + delete_punct.ques
                    + graph_no_zero
                    + insert_zero
                    + (graph_digit + pynutil.insert(UNIT_1e01) + graph_digit_no_zero)
                )
                | (graph_no_zero + delete_punct.ques + graph_no_zero ** 2 + insert_zero + graph_digit)
                | (graph_no_zero ** 4)
            )
        )
 
        # cardinalstring like 123 or 123.456,used in phone cardinal,ID cardinal,etc.
        graph_numstring = pynini.closure(graph_digit_with_zero, 1) | (
            pynini.closure(graph_digit_with_zero, 1)
            + pynini.cross(".", "点")
            + pynini.closure(graph_digit_with_zero, 1)
        )
 
        graph = graph_hundred | graph_ten | graph_digit_with_zero | graph_thousand_sign | graph_ten_thousand_sign
        # 456.123
        graph_decimal = graph + pynini.cross('.', '点') + pynini.closure(graph_digit_with_zero, 1)
        graph_cardinal = graph | graph_decimal
        signed_cardinal = graph_sign + graph_cardinal
        self.graph_cardinal = (graph_cardinal | signed_cardinal).optimize()
 
        graph_numstring = self.add_tokens(pynutil.insert("integer: \"") + graph_numstring + pynutil.insert("\""))
        self.fst = graph_numstring.optimize()