1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
| # Copyright 2017 Google Inc.
|
|
| # Adapted from https://github.com/google/TextNormalizationCoveringGrammars
| # Russian minimally supervised number grammar.
|
| import pynini
| from fun_text_processing.text_normalization.en.graph_utils import (
| DAMO_NON_BREAKING_SPACE,
| DAMO_SPACE,
| )
| from fun_text_processing.text_normalization.ru.utils import get_abs_path
|
# Russian alphabet acceptors: each one accepts exactly one letter.
# The raw characters live in a private string so that the public names are
# only ever bound to FSTs.
_RU_LOWER_CHARS = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
_RU_UPPER_CHARS = _RU_LOWER_CHARS.upper()

# One union branch per character of the corresponding string.
RU_LOWER_ALPHA = pynini.union(*_RU_LOWER_CHARS).optimize()
RU_UPPER_ALPHA = pynini.union(*_RU_UPPER_CHARS).optimize()

# Any single Russian letter, lower- or uppercase.
RU_ALPHA = pynini.union(RU_LOWER_ALPHA, RU_UPPER_ALPHA).optimize()
|
# (input, output) string pairs used to rewrite stress-marked vowels.
# In the first two groups the output is the bare vowel followed by an ASCII
# apostrophe, and a stressed Ё/ё is written out as Е'/е'.
RU_STRESSED_MAP = (
    [  # uppercase vowels carrying a stress mark
        ("А́", "А'"),
        ("Е́", "Е'"),
        ("Ё́", "Е'"),
        ("И́", "И'"),
        ("О́", "О'"),
        ("У́", "У'"),
        ("Ы́", "Ы'"),
        ("Э́", "Э'"),
        ("Ю́", "Ю'"),
        ("Я́", "Я'"),
    ]
    + [  # lowercase vowels carrying a stress mark
        ("а́", "а'"),
        ("е́", "е'"),
        ("ё́", "е'"),
        ("и́", "и'"),
        ("о́", "о'"),
        ("у́", "у'"),
        ("ы́", "ы'"),
        ("э́", "э'"),
        ("ю́", "ю'"),
        ("я́", "я'"),
    ]
    + [  # ё/Ё without a stress mark are folded to е/Е
        ("ё", "е"),
        ("Ё", "Е"),
    ]
)
|
# Transducer over zero or more symbols, where each symbol is either an entry
# of RU_STRESSED_MAP (rewritten to its mapped output) or a plain Russian
# letter (passed through unchanged).
# NOTE(review): RU_ALPHA is built from an alphabet that contains ё/Ё, and the
# map also folds ё/Ё to е/Е, so both paths exist for those letters. Confirm
# that this ambiguity is intended.
_STRESSED_REWRITES = pynini.string_map(RU_STRESSED_MAP).optimize()
REWRITE_STRESSED = pynini.closure(_STRESSED_REWRITES | RU_ALPHA).optimize()

# A single Russian letter, a space, or a non-breaking space.
RU_ALPHA_OR_SPACE = pynini.union(
    RU_ALPHA,
    DAMO_SPACE,
    DAMO_NON_BREAKING_SPACE,
).optimize()

# Transliteration table loaded from a TSV file shipped with the package
# (presumably Latin input -> Cyrillic output, judging by the file name).
_LATIN_TO_CYRILLIC_TSV = get_abs_path("data/latin_to_cyrillic.tsv")
TO_CYRILLIC = pynini.string_file(_LATIN_TO_CYRILLIC_TSV).optimize()

# The same table with input and output sides swapped.
TO_LATIN = pynini.invert(TO_CYRILLIC).optimize()
|
|