1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
| import re
| import unicodedata
|
| import regex
|
# Non-ASCII letters that "NFKD" normalization does not split into a base
# letter plus combining marks, mapped by hand to an ASCII replacement.
# Every replacement keeps the case of its key.
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "TH",  # capital thorn maps to capitals, like every other upper-case key
    "ł": "l",
    "Ł": "L",
}
|
|
def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Strip diacritics and turn marks, symbols and punctuation into spaces.

    The input is first decomposed with "NFKD". Each resulting character is then
    handled by the first rule that applies:

    1. a character contained in `keep` is passed through unchanged;
    2. a key of ADDITIONAL_DIACRITICS is replaced by its mapped value;
    3. a nonspacing mark (category "Mn") is dropped;
    4. any other mark, symbol or punctuation (category starting with
       "M", "S" or "P") becomes a single space;
    5. anything else is passed through unchanged.

    Note that `keep` is matched against the decomposed characters, not the
    characters of the original string.
    """
    pieces = []
    for ch in unicodedata.normalize("NFKD", s):
        if ch in keep:
            pieces.append(ch)
            continue
        if ch in ADDITIONAL_DIACRITICS:
            pieces.append(ADDITIONAL_DIACRITICS[ch])
            continue
        category = unicodedata.category(ch)
        if category == "Mn":
            # combining accent left behind by the decomposition: drop it
            continue
        pieces.append(" " if category[0] in "MSP" else ch)
    return "".join(pieces)
|
|
def remove_symbols(s: str):
    """
    Turn every mark, symbol and punctuation character into a space.

    The input is normalized with "NFKC" first, so a letter followed by a
    combining accent is composed into a single letter where such a composed
    form exists and therefore keeps its diacritic. Any character whose Unicode
    category starts with "M", "S" or "P" after that normalization is replaced
    by one space; all other characters are returned unchanged.
    """
    normalized = unicodedata.normalize("NFKC", s)
    out = []
    for ch in normalized:
        if unicodedata.category(ch)[0] in "MSP":
            out.append(" ")
        else:
            out.append(ch)
    return "".join(out)
|
|
class BasicTextNormalizer:
    """
    Callable that lower-cases text, removes bracketed spans, cleans
    symbols and collapses whitespace.

    `remove_diacritics` selects the cleaning function stored in `self.clean`:
    `remove_symbols_and_diacritics` when true, `remove_symbols` otherwise.
    `split_letters` makes the normalizer put a space between every match of
    the `\\X` pattern (handled by the `regex` module).
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        """Return the normalized form of `s`."""
        text = s.lower()

        # drop "<...>" / "[...]" spans, then non-empty "(...)" spans
        text = re.sub(r"[<\[][^>\]]*[>\]]", "", text)
        text = re.sub(r"\(([^)]+?)\)", "", text)

        # lower-case again after cleaning, since cleaning normalizes the text
        text = self.clean(text).lower()

        if self.split_letters:
            units = regex.findall(r"\X", text, regex.U)
            text = " ".join(units)

        # squeeze every run of whitespace down to a single space
        return re.sub(r"\s+", " ", text)
|
|