From 861147c7308b91068ffa02724fdf74ee623a909e Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Wed, 24 Apr 2024 16:03:38 +0800
Subject: [PATCH] Dev gzf exp (#1654)
---
funasr/models/sense_voice/whisper_lib/normalizers/basic.py | 33 ++++++++++++++++-----------------
1 file changed, 16 insertions(+), 17 deletions(-)
diff --git a/funasr/models/sense_voice/whisper_lib/normalizers/basic.py b/funasr/models/sense_voice/whisper_lib/normalizers/basic.py
index a824032..76addc5 100644
--- a/funasr/models/sense_voice/whisper_lib/normalizers/basic.py
+++ b/funasr/models/sense_voice/whisper_lib/normalizers/basic.py
@@ -30,15 +30,19 @@
and drop any diacritics (category 'Mn' and some manual mappings)
"""
return "".join(
- c
- if c in keep
- else ADDITIONAL_DIACRITICS[c]
- if c in ADDITIONAL_DIACRITICS
- else ""
- if unicodedata.category(c) == "Mn"
- else " "
- if unicodedata.category(c)[0] in "MSP"
- else c
+ (
+ c
+ if c in keep
+ else (
+ ADDITIONAL_DIACRITICS[c]
+ if c in ADDITIONAL_DIACRITICS
+ else (
+ ""
+ if unicodedata.category(c) == "Mn"
+ else " " if unicodedata.category(c)[0] in "MSP" else c
+ )
+ )
+ )
for c in unicodedata.normalize("NFKD", s)
)
@@ -48,16 +52,13 @@
Replace any other markers, symbols, punctuations with a space, keeping diacritics
"""
return "".join(
- " " if unicodedata.category(c)[0] in "MSP" else c
- for c in unicodedata.normalize("NFKC", s)
+ " " if unicodedata.category(c)[0] in "MSP" else c for c in unicodedata.normalize("NFKC", s)
)
class BasicTextNormalizer:
def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
- self.clean = (
- remove_symbols_and_diacritics if remove_diacritics else remove_symbols
- )
+ self.clean = remove_symbols_and_diacritics if remove_diacritics else remove_symbols
self.split_letters = split_letters
def __call__(self, s: str):
@@ -69,8 +70,6 @@
if self.split_letters:
s = " ".join(regex.findall(r"\X", s, regex.U))
- s = re.sub(
- r"\s+", " ", s
- ) # replace any successive whitespace characters with a space
+ s = re.sub(r"\s+", " ", s) # replace any successive whitespace characters with a space
return s
--
Gitblit v1.9.1