# Yabin Li
# 2023-08-21 63b3e2040dd11be22bfda65d846e91a2dbd43862
 
 
# Copyright 2017 Google Inc.
 
 
# Adapted from https://github.com/google/TextNormalizationCoveringGrammars
# Russian minimally supervised number grammar.
 
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NON_BREAKING_SPACE, DAMO_SPACE
from fun_text_processing.text_normalization.ru.utils import get_abs_path
 
# The 33 lower-case letters of the Russian alphabet (including "ё"), kept as a
# plain string so that each character can be unpacked into its own acceptor.
_ru_lower_letters = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"

# One-character acceptors: any single lower-case / upper-case Russian letter.
RU_LOWER_ALPHA = pynini.union(*_ru_lower_letters).optimize()
RU_UPPER_ALPHA = pynini.union(*_ru_lower_letters.upper()).optimize()

# Any single Russian letter regardless of case.
RU_ALPHA = pynini.union(RU_LOWER_ALPHA, RU_UPPER_ALPHA).optimize()
 
# Rewrite table for stress-marked vowels, exposed as a list of
# (source, replacement) pairs in the order written below.
#   * a vowel carrying a stress accent mark becomes the bare vowel followed by
#     an apostrophe;
#   * "ё"/"Ё" are folded to "е"/"Е", both with and without the accent mark.
RU_STRESSED_MAP = list(
    {
        # Upper-case accented vowels.
        "А́": "А'",
        "Е́": "Е'",
        "Ё́": "Е'",
        "И́": "И'",
        "О́": "О'",
        "У́": "У'",
        "Ы́": "Ы'",
        "Э́": "Э'",
        "Ю́": "Ю'",
        "Я́": "Я'",
        # Lower-case accented vowels.
        "а́": "а'",
        "е́": "е'",
        "ё́": "е'",
        "и́": "и'",
        "о́": "о'",
        "у́": "у'",
        "ы́": "ы'",
        "э́": "э'",
        "ю́": "ю'",
        "я́": "я'",
        # Unaccented "ё" is folded to "е" as well.
        "ё": "е",
        "Ё": "Е",
    }.items()
)
 
# Accepts any sequence (possibly empty) of Russian letters; wherever a
# RU_STRESSED_MAP source string occurs, the path through the map rewrites it
# to the mapped replacement.
REWRITE_STRESSED = (
    pynini.union(pynini.string_map(RU_STRESSED_MAP).optimize(), RU_ALPHA)
    .closure()
    .optimize()
)

# Transducer loaded from data/latin_to_cyrillic.tsv (going by the file name,
# Latin input -> Cyrillic output; verify against the data file), and its
# inverse for the opposite direction.
TO_CYRILLIC = pynini.string_file(
    get_abs_path("data/latin_to_cyrillic.tsv")
).optimize()
TO_LATIN = TO_CYRILLIC.copy().invert().optimize()

# A single Russian letter, DAMO_SPACE, or DAMO_NON_BREAKING_SPACE.
RU_ALPHA_OR_SPACE = (RU_ALPHA | DAMO_SPACE | DAMO_NON_BREAKING_SPACE).optimize()