北念
2024-07-10 5448e926a215066193f8c5a12e0c7dfe55c29579
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
from time import perf_counter
from argparse import ArgumentParser
from fun_text_processing.text_normalization.en.graph_utils import generator_main
 
 
def parse_args():
    """Build and parse the command-line arguments for FAR grammar export.

    Returns:
        argparse.Namespace with ``language`` (ISO code, default "en") and
        ``export_dir`` (output directory path, default "./").
    """
    supported_languages = ["de", "en", "es", "fr", "id", "ja", "ko", "pt", "ru", "vi", "zh"]

    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--language",
        type=str,
        default="en",
        choices=supported_languages,
        help="language",
    )
    arg_parser.add_argument(
        "--export_dir",
        type=str,
        default="./",
        help="path to export directory. Default to current directory.",
    )
    return arg_parser.parse_args()
 
 
def get_grammars(lang: str = "en"):
    """Load the ITN tagger and verbalizer grammars for a language.

    The per-language modules all live at
    ``fun_text_processing.inverse_text_normalization.<lang>.taggers.tokenize_and_classify``
    and ``...<lang>.verbalizers.verbalize_final``, differing only in the
    language code, so they are resolved dynamically instead of via a
    copy-pasted 11-way if/elif import chain.

    Args:
        lang: language code; any value outside the supported set falls back
            to "en" (matching the original ``else`` branch).

    Returns:
        Tuple of (tagger FST, verbalizer FST) — the ``.fst`` attributes of
        freshly constructed ``ClassifyFst`` and ``VerbalizeFinalFst`` objects.
    """
    import importlib

    supported = {"de", "en", "es", "fr", "id", "ja", "ko", "pt", "ru", "vi", "zh"}
    if lang not in supported:
        lang = "en"  # unknown languages default to English, as before

    base = f"fun_text_processing.inverse_text_normalization.{lang}"
    tagger_module = importlib.import_module(f"{base}.taggers.tokenize_and_classify")
    verbalizer_module = importlib.import_module(f"{base}.verbalizers.verbalize_final")

    return tagger_module.ClassifyFst().fst, verbalizer_module.VerbalizeFinalFst().fst
 
 
if __name__ == "__main__":
    args = parse_args()

    # Make sure the destination directory exists before writing FAR archives.
    os.makedirs(args.export_dir, exist_ok=True)
    tagger_far_file = os.path.join(args.export_dir, f"{args.language}_itn_tagger.far")
    verbalizer_far_file = os.path.join(args.export_dir, f"{args.language}_itn_verbalizer.far")

    started = perf_counter()
    tagger_fst, verbalizer_fst = get_grammars(args.language)
    # Export each grammar into its own FAR file under its canonical rule name.
    generator_main(tagger_far_file, {"tokenize_and_classify": tagger_fst})
    generator_main(verbalizer_far_file, {"verbalize": verbalizer_fst})
    elapsed = round(perf_counter() - started, 2)
    print(f"Time to generate graph: {elapsed} sec")