| | |
| | | and drop any diacritics (category 'Mn' and some manual mappings) |
| | | """ |
| | | return "".join( |
| | | c |
| | | if c in keep |
| | | else ADDITIONAL_DIACRITICS[c] |
| | | if c in ADDITIONAL_DIACRITICS |
| | | else "" |
| | | if unicodedata.category(c) == "Mn" |
| | | else " " |
| | | if unicodedata.category(c)[0] in "MSP" |
| | | else c |
| | | ( |
| | | c |
| | | if c in keep |
| | | else ( |
| | | ADDITIONAL_DIACRITICS[c] |
| | | if c in ADDITIONAL_DIACRITICS |
| | | else ( |
| | | "" |
| | | if unicodedata.category(c) == "Mn" |
| | | else " " if unicodedata.category(c)[0] in "MSP" else c |
| | | ) |
| | | ) |
| | | ) |
| | | for c in unicodedata.normalize("NFKD", s) |
| | | ) |
| | | |
| | |
| | | Replace any other markers, symbols, punctuations with a space, keeping diacritics |
| | | """ |
| | | return "".join( |
| | | " " if unicodedata.category(c)[0] in "MSP" else c |
| | | for c in unicodedata.normalize("NFKC", s) |
| | | " " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s) |
| | | ) |
| | | |
| | | |
class BasicTextNormalizer:
    """Callable text normalizer.

    The constructor selects a symbol-cleaning function and records whether the
    input should be split into individual letters. Calling the instance
    optionally separates the letters with spaces and then collapses every run
    of whitespace into a single space.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        """Configure the normalizer.

        Args:
            remove_diacritics: when true, ``self.clean`` is bound to
                ``remove_symbols_and_diacritics``; otherwise to
                ``remove_symbols``.
            split_letters: when true, ``__call__`` puts a space between the
                units matched by the ``\\X`` pattern.
        """
        # Choose the cleaning function once, at construction time.
        self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str) -> str:
        """Return ``s`` normalized according to the configured options."""
        # NOTE(review): the extract this block was recovered from has a gap at
        # this point (a hunk boundary). Statements that apply ``self.clean`` or
        # other preprocessing may have been elided -- confirm against the
        # complete file before relying on this method.
        if self.split_letters:
            # ``\X`` is provided by the third-party ``regex`` module (matches
            # one extended grapheme cluster); the stdlib ``re`` lacks it.
            s = " ".join(regex.findall(r"\X", s, regex.U))

        s = re.sub(r"\s+", " ", s)  # replace any successive whitespace characters with a space

        return s