From 28ccfbfc51068a663a80764e14074df5edf2b5ba Mon Sep 17 00:00:00 2001
From: kongdeqiang <kongdeqiang960204@163.com>
Date: Fri, 13 Mar 2026 17:41:41 +0800
Subject: [PATCH] 提交
---
fun_text_processing/text_normalization/normalize.py | 149 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 92 insertions(+), 57 deletions(-)
diff --git a/fun_text_processing/text_normalization/normalize.py b/fun_text_processing/text_normalization/normalize.py
index 30d4941..c1cebdd 100644
--- a/fun_text_processing/text_normalization/normalize.py
+++ b/fun_text_processing/text_normalization/normalize.py
@@ -1,17 +1,3 @@
-# Copyright NeMo (https://github.com/NVIDIA/NeMo). All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
import itertools
import os
import re
@@ -41,7 +27,7 @@
except (ModuleNotFoundError, ImportError) as e:
NLP_AVAILABLE = False
-SPACE_DUP = re.compile(' {2,}')
+SPACE_DUP = re.compile(" {2,}")
class Normalizer:
@@ -62,7 +48,7 @@
def __init__(
self,
input_case: str,
- lang: str = 'en',
+ lang: str = "en",
deterministic: bool = True,
cache_dir: str = None,
overwrite_cache: bool = False,
@@ -75,36 +61,62 @@
self.post_processor = None
if lang == "en":
- from fun_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst
- from fun_text_processing.text_normalization.en.verbalizers.post_processing import PostProcessingFst
+ from fun_text_processing.text_normalization.en.verbalizers.verbalize_final import (
+ VerbalizeFinalFst,
+ )
+ from fun_text_processing.text_normalization.en.verbalizers.post_processing import (
+ PostProcessingFst,
+ )
if post_process:
- self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache)
+ self.post_processor = PostProcessingFst(
+ cache_dir=cache_dir, overwrite_cache=overwrite_cache
+ )
if deterministic:
- from fun_text_processing.text_normalization.en.taggers.tokenize_and_classify import ClassifyFst
+ from fun_text_processing.text_normalization.en.taggers.tokenize_and_classify import (
+ ClassifyFst,
+ )
else:
if lm:
- from fun_text_processing.text_normalization.en.taggers.tokenize_and_classify_lm import ClassifyFst
+ from fun_text_processing.text_normalization.en.taggers.tokenize_and_classify_lm import (
+ ClassifyFst,
+ )
else:
from fun_text_processing.text_normalization.en.taggers.tokenize_and_classify_with_audio import (
ClassifyFst,
)
- elif lang == 'ru':
+ elif lang == "ru":
# Ru TN only support non-deterministic cases and produces multiple normalization options
# use normalize_with_audio.py
- from fun_text_processing.text_normalization.ru.taggers.tokenize_and_classify import ClassifyFst
- from fun_text_processing.text_normalization.ru.verbalizers.verbalize_final import VerbalizeFinalFst
- elif lang == 'de':
- from fun_text_processing.text_normalization.de.taggers.tokenize_and_classify import ClassifyFst
- from fun_text_processing.text_normalization.de.verbalizers.verbalize_final import VerbalizeFinalFst
- elif lang == 'es':
- from fun_text_processing.text_normalization.es.taggers.tokenize_and_classify import ClassifyFst
- from fun_text_processing.text_normalization.es.verbalizers.verbalize_final import VerbalizeFinalFst
- elif lang == 'zh':
- from fun_text_processing.text_normalization.zh.taggers.tokenize_and_classify import ClassifyFst
- from fun_text_processing.text_normalization.zh.verbalizers.verbalize_final import VerbalizeFinalFst
+ from fun_text_processing.text_normalization.ru.taggers.tokenize_and_classify import (
+ ClassifyFst,
+ )
+ from fun_text_processing.text_normalization.ru.verbalizers.verbalize_final import (
+ VerbalizeFinalFst,
+ )
+ elif lang == "de":
+ from fun_text_processing.text_normalization.de.taggers.tokenize_and_classify import (
+ ClassifyFst,
+ )
+ from fun_text_processing.text_normalization.de.verbalizers.verbalize_final import (
+ VerbalizeFinalFst,
+ )
+ elif lang == "es":
+ from fun_text_processing.text_normalization.es.taggers.tokenize_and_classify import (
+ ClassifyFst,
+ )
+ from fun_text_processing.text_normalization.es.verbalizers.verbalize_final import (
+ VerbalizeFinalFst,
+ )
+ elif lang == "zh":
+ from fun_text_processing.text_normalization.zh.taggers.tokenize_and_classify import (
+ ClassifyFst,
+ )
+ from fun_text_processing.text_normalization.zh.verbalizers.verbalize_final import (
+ VerbalizeFinalFst,
+ )
self.tagger = ClassifyFst(
input_case=input_case,
deterministic=deterministic,
@@ -156,7 +168,9 @@
try:
normalized_texts = Parallel(n_jobs=n_jobs)(
- delayed(self.process_batch)(texts[i : i + batch], verbose, punct_pre_process, punct_post_process)
+ delayed(self.process_batch)(
+ texts[i : i + batch], verbose, punct_pre_process, punct_post_process
+ )
for i in range(0, len(texts), batch)
)
except BaseException as e:
@@ -176,7 +190,10 @@
"""
normalized_lines = [
self.normalize(
- text, verbose=verbose, punct_pre_process=punct_pre_process, punct_post_process=punct_post_process
+ text,
+ verbose=verbose,
+ punct_pre_process=punct_pre_process,
+ punct_post_process=punct_post_process,
)
for text in tqdm(batch)
]
@@ -250,7 +267,11 @@
return splits
def normalize(
- self, text: str, verbose: bool = False, punct_pre_process: bool = False, punct_post_process: bool = False
+ self,
+ text: str,
+ verbose: bool = False,
+ punct_pre_process: bool = False,
+ punct_post_process: bool = False,
) -> str:
"""
Main function. Normalizes tokens from written to spoken form
@@ -298,10 +319,10 @@
break
if verbalizer_lattice is None:
raise ValueError(f"No permutations were generated from tokens {s}")
- output += ' ' + self.select_verbalizer(verbalizer_lattice)
- output = SPACE_DUP.sub(' ', output[1:])
+ output += " " + self.select_verbalizer(verbalizer_lattice)
+ output = SPACE_DUP.sub(" ", output[1:])
- if self.lang == "en" and hasattr(self, 'post_processor'):
+ if self.lang == "en" and hasattr(self, "post_processor"):
output = self.post_process(output)
if punct_post_process:
@@ -323,11 +344,11 @@
Returns list of sentences
"""
- lower_case_unicode = ''
- upper_case_unicode = ''
+ lower_case_unicode = ""
+ upper_case_unicode = ""
if self.lang == "ru":
- lower_case_unicode = '\u0430-\u04FF'
- upper_case_unicode = '\u0410-\u042F'
+ lower_case_unicode = "\u0430-\u04FF"
+ upper_case_unicode = "\u0410-\u042F"
# Read and split transcript by utterance (roughly, sentences)
split_pattern = f"(?<!\w\.\w.)(?<![A-Z{upper_case_unicode}][a-z{lower_case_unicode}]+\.)(?<![A-Z{upper_case_unicode}]\.)(?<=\.|\?|\!|\.”|\?”|\!”)\s(?![0-9]+[a-z]*\.)"
@@ -353,10 +374,12 @@
subl = [""]
for k, v in perm:
if isinstance(v, str):
- subl = ["".join(x) for x in itertools.product(subl, [f"{k}: \"{v}\" "])]
+ subl = ["".join(x) for x in itertools.product(subl, [f'{k}: "{v}" '])]
elif isinstance(v, OrderedDict):
rec = self._permute(v)
- subl = ["".join(x) for x in itertools.product(subl, [f" {k} {{ "], rec, [f" }} "])]
+ subl = [
+ "".join(x) for x in itertools.product(subl, [f" {k} {{ "], rec, [f" }} "])
+ ]
elif isinstance(v, bool):
subl = ["".join(x) for x in itertools.product(subl, [f"{k}: true "])]
else:
@@ -394,7 +417,7 @@
return _helper("", tokens, 0)
- def find_tags(self, text: str) -> 'pynini.FstLike':
+ def find_tags(self, text: str) -> "pynini.FstLike":
"""
Given text use tagger Fst to tag text
@@ -406,7 +429,7 @@
lattice = text @ self.tagger.fst
return lattice
- def select_tag(self, lattice: 'pynini.FstLike') -> str:
+ def select_tag(self, lattice: "pynini.FstLike") -> str:
"""
Given tagged lattice return shortest path
@@ -418,7 +441,7 @@
tagged_text = pynini.shortestpath(lattice, nshortest=1, unique=True).string()
return tagged_text
- def find_verbalizer(self, tagged_text: str) -> 'pynini.FstLike':
+ def find_verbalizer(self, tagged_text: str) -> "pynini.FstLike":
"""
Given tagged text creates verbalization lattice
This is context-independent.
@@ -431,7 +454,7 @@
lattice = tagged_text @ self.verbalizer.fst
return lattice
- def select_verbalizer(self, lattice: 'pynini.FstLike') -> str:
+ def select_verbalizer(self, lattice: "pynini.FstLike") -> str:
"""
Given verbalized lattice return shortest path
@@ -445,7 +468,7 @@
# output = pynini.shortestpath(lattice, nshortest=1, unique=True).string()
return output
- def post_process(self, normalized_text: 'pynini.FstLike') -> str:
+ def post_process(self, normalized_text: "pynini.FstLike") -> str:
"""
Runs post processing graph on normalized text
@@ -469,22 +492,34 @@
input = parser.add_mutually_exclusive_group()
input.add_argument("--text", dest="input_string", help="input string", type=str)
input.add_argument("--input_file", dest="input_file", help="input file path", type=str)
- parser.add_argument('--output_file', dest="output_file", help="output file path", type=str)
- parser.add_argument("--language", help="language", choices=["en", "de", "es", "zh"], default="en", type=str)
+ parser.add_argument("--output_file", dest="output_file", help="output file path", type=str)
parser.add_argument(
- "--input_case", help="input capitalization", choices=["lower_cased", "cased"], default="cased", type=str
+ "--language", help="language", choices=["en", "de", "es", "zh"], default="en", type=str
)
- parser.add_argument("--verbose", help="print info for debugging", action='store_true')
+ parser.add_argument(
+ "--input_case",
+ help="input capitalization",
+ choices=["lower_cased", "cased"],
+ default="cased",
+ type=str,
+ )
+ parser.add_argument("--verbose", help="print info for debugging", action="store_true")
parser.add_argument(
"--punct_post_process",
help="set to True to enable punctuation post processing to match input.",
action="store_true",
)
parser.add_argument(
- "--punct_pre_process", help="set to True to enable punctuation pre processing", action="store_true"
+ "--punct_pre_process",
+ help="set to True to enable punctuation pre processing",
+ action="store_true",
)
- parser.add_argument("--overwrite_cache", help="set to True to re-create .far grammar files", action="store_true")
- parser.add_argument("--whitelist", help="path to a file with with whitelist", default=None, type=str)
+ parser.add_argument(
+ "--overwrite_cache", help="set to True to re-create .far grammar files", action="store_true"
+ )
+ parser.add_argument(
+ "--whitelist", help="path to a file with whitelist", default=None, type=str
+ )
parser.add_argument(
"--cache_dir",
help="path to a dir with .far grammar file. Set to None to avoid using cache",
--
Gitblit v1.9.1