1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Referenced from https://github.com/hccho2/Tacotron-Wavenet-Vocoder-Korean
|
| import re
|
|
class KoreanCleaner:
    """Rewrite ASCII digits and ASCII letters in a string as Hangul.

    All methods are classmethods; the class keeps no per-instance state.
    The public entry point is :meth:`normalize_text`.
    """

    # Per-digit replacements. Digits are replaced one at a time, so a
    # multi-digit run such as "12" becomes "일이" (no place-value reading).
    _NUMBER_TO_KOR = {
        "0": "영",
        "1": "일",
        "2": "이",
        "3": "삼",
        "4": "사",
        "5": "오",
        "6": "육",
        "7": "칠",
        "8": "팔",
        "9": "구",
    }

    # Per-letter replacements, keyed by the upper-case ASCII letter.
    _UPPER_ALPHABET_TO_KOR = {
        "A": "에이",
        "B": "비",
        "C": "씨",
        "D": "디",
        "E": "이",
        "F": "에프",
        "G": "지",
        "H": "에이치",
        "I": "아이",
        "J": "제이",
        "K": "케이",
        "L": "엘",
        "M": "엠",
        "N": "엔",
        "O": "오",
        "P": "피",
        "Q": "큐",
        "R": "알",
        "S": "에스",
        "T": "티",
        "U": "유",
        "V": "브이",
        "W": "더블유",
        "X": "엑스",
        "Y": "와이",
        "Z": "지",
    }

    # Translation tables are built once, when the class body executes,
    # instead of rebuilding the dictionaries on every call.
    _NUMBER_TABLE = str.maketrans(_NUMBER_TO_KOR)

    # Lower-case ASCII letters map to the same replacement as their
    # upper-case form, which is equivalent to upper-casing [a-z] first.
    _ALPHABET_TABLE = str.maketrans(
        {
            **_UPPER_ALPHABET_TO_KOR,
            **{
                letter.lower(): korean
                for letter, korean in _UPPER_ALPHABET_TO_KOR.items()
            },
        }
    )

    @classmethod
    def _normalize_numbers(cls, text):
        """Return *text* with each ASCII digit 0-9 replaced by its Hangul entry.

        Characters that are not ASCII digits are copied through unchanged.
        """
        return text.translate(cls._NUMBER_TABLE)

    @classmethod
    def _normalize_english_text(cls, text):
        """Return *text* with each ASCII letter replaced by its Hangul entry.

        Case is ignored for ASCII letters (``"a"`` and ``"A"`` give the same
        output). Any other character, including non-ASCII letters, is copied
        through unchanged.
        """
        return text.translate(cls._ALPHABET_TABLE)

    @classmethod
    def normalize_text(cls, text):
        """Strip *text*, then replace its ASCII digits and ASCII letters.

        Args:
            text: The input string.

        Returns:
            The stripped string with digits and letters replaced according
            to the class lookup tables.
        """
        # stage 0 : text strip
        text = text.strip()

        # stage 1 : normalize numbers
        text = cls._normalize_numbers(text)

        # stage 2 : normalize english text
        return cls._normalize_english_text(text)
|
|