python/FunASR-XL.git

			@@ -30,15 +30,19 @@
			and drop any diacritics (category 'Mn' and some manual mappings)
			"""
			return "".join(
			c
			if c in keep
			else ADDITIONAL_DIACRITICS[c]
			if c in ADDITIONAL_DIACRITICS
			else ""
			if unicodedata.category(c) == "Mn"
			else " "
			if unicodedata.category(c)[0] in "MSP"
			else c
			(
			c
			if c in keep
			else (
			ADDITIONAL_DIACRITICS[c]
			if c in ADDITIONAL_DIACRITICS
			else (
			""
			if unicodedata.category(c) == "Mn"
			else " " if unicodedata.category(c)[0] in "MSP" else c
			)
			)
			)
			for c in unicodedata.normalize("NFKD", s)
			)

			@@ -48,16 +52,13 @@
			Replace any other markers, symbols, punctuations with a space, keeping diacritics
			"""
			return "".join(
			" " if unicodedata.category(c)[0] in "MSP" else c
			for c in unicodedata.normalize("NFKC", s)
			" " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s)
			)


			class BasicTextNormalizer:
			def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
			self.clean = (
			remove_symbols_and_diacritics if remove_diacritics else remove_symbols
			)
			self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols
			self.split_letters = split_letters

			def __call__(self, s: str):
			@@ -69,8 +70,6 @@
			if self.split_letters:
			s = " ".join(regex.findall(r"\X", s, regex.U))

			s = re.sub(
			r"\s+", " ", s
			) # replace any successive whitespace characters with a space
			s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space

			return s