1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
| # Copyright NeMo (https://github.com/NVIDIA/NeMo). All Rights Reserved.
| #
| # Licensed under the Apache License, Version 2.0 (the "License");
| # you may not use this file except in compliance with the License.
| # You may obtain a copy of the License at
| #
| # http://www.apache.org/licenses/LICENSE-2.0
| #
| # Unless required by applicable law or agreed to in writing, software
| # distributed under the License is distributed on an "AS IS" BASIS,
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
| # See the License for the specific language governing permissions and
| # limitations under the License.
|
| # Adapted from https://github.com/google/TextNormalizationCoveringGrammars
| # Russian minimally supervised number grammar.
|
| import pynini
| from fun_text_processing.text_normalization.en.graph_utils import DAMO_NON_BREAKING_SPACE, DAMO_SPACE
| from fun_text_processing.text_normalization.ru.utils import get_abs_path
|
# Acceptors for a single Russian letter. The 33-letter lowercase alphabet is
# written once; the uppercase acceptor is derived from it with str.upper
# (str.lower leaves the already-lowercase literal unchanged). Each acceptor is
# the union of the individual one-character strings.
RU_LOWER_ALPHA, RU_UPPER_ALPHA = (
    pynini.union(*to_case("абвгдеёжзийклмнопрстуфхцчшщъыьэюя")).optimize()
    for to_case in (str.lower, str.upper)
)
# Any single Russian letter, regardless of case.
RU_ALPHA = pynini.union(RU_LOWER_ALPHA, RU_UPPER_ALPHA).optimize()
|
# (input, output) pairs used to rewrite stress marks and the letter "ё".
# A vowel followed by U+0301 (combining acute accent) becomes the same vowel
# followed by an apostrophe; for "Ё"/"ё" the vowel is additionally replaced by
# "Е"/"е". Uppercase vowels come first, then lowercase, in alphabet order.
RU_STRESSED_MAP = [
    (vowel + "\u0301", {"Ё": "Е", "ё": "е"}.get(vowel, vowel) + "'")
    for vowel in "АЕЁИОУЫЭЮЯаеёиоуыэюя"
]
# Unstressed "ё"/"Ё" are replaced by "е"/"Е" as well.
RU_STRESSED_MAP += [("ё", "е"), ("Ё", "Е")]
|
# A single Russian letter, or one of the imported DAMO_SPACE /
# DAMO_NON_BREAKING_SPACE acceptors.
RU_ALPHA_OR_SPACE = pynini.union(RU_ALPHA, DAMO_SPACE, DAMO_NON_BREAKING_SPACE).optimize()

# Transducer compiled from the TSV file shipped with the package; the inverse
# direction is obtained by inverting it.
# NOTE(review): presumably Latin -> Cyrillic letter pairs, judging by the file
# name - confirm against data/latin_to_cyrillic.tsv.
TO_CYRILLIC = pynini.string_file(get_abs_path("data/latin_to_cyrillic.tsv")).optimize()
TO_LATIN = pynini.invert(TO_CYRILLIC).optimize()

# Zero or more symbols, each one either rewritten according to RU_STRESSED_MAP
# or a Russian letter passed through unchanged.
# NOTE(review): "ё"/"Ё" are both in RU_ALPHA and in RU_STRESSED_MAP, so this
# transducer allows them to be kept as-is or rewritten to "е"/"Е".
REWRITE_STRESSED = pynini.closure(
    pynini.union(pynini.string_map(RU_STRESSED_MAP).optimize(), RU_ALPHA)
).optimize()
|
|