From 28ccfbfc51068a663a80764e14074df5edf2b5ba Mon Sep 17 00:00:00 2001
From: kongdeqiang <kongdeqiang960204@163.com>
Date: Fri, 13 Mar 2026 17:41:41 +0800
Subject: [PATCH] 提交
---
fun_text_processing/text_normalization/normalize.py | 149 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 92 insertions(+), 57 deletions(-)
diff --git a/fun_text_processing/text_normalization/normalize.py b/fun_text_processing/text_normalization/normalize.py
index 30d4941..c1cebdd 100644
--- a/fun_text_processing/text_normalization/normalize.py
+++ b/fun_text_processing/text_normalization/normalize.py
@@ -1,17 +1,3 @@
-# Copyright NeMo (https://github.com/NVIDIA/NeMo). All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
import itertools
import os
import re
@@ -41,7 +27,7 @@
except (ModuleNotFoundError, ImportError) as e:
NLP_AVAILABLE = False
-SPACE_DUP = re.compile(' {2,}')
+SPACE_DUP = re.compile(" {2,}")
class Normalizer:
@@ -62,7 +48,7 @@
def __init__(
self,
input_case: str,
- lang: str = 'en',
+ lang: str = "en",
deterministic: bool = True,
cache_dir: str = None,
overwrite_cache: bool = False,
@@ -75,36 +61,62 @@
self.post_processor = None
if lang == "en":
- from fun_text_processing.text_normalization.en.verbalizers.verbalize_final import VerbalizeFinalFst
- from fun_text_processing.text_normalization.en.verbalizers.post_processing import PostProcessingFst
+ from fun_text_processing.text_normalization.en.verbalizers.verbalize_final import (
+ VerbalizeFinalFst,
+ )
+ from fun_text_processing.text_normalization.en.verbalizers.post_processing import (
+ PostProcessingFst,
+ )
if post_process:
- self.post_processor = PostProcessingFst(cache_dir=cache_dir, overwrite_cache=overwrite_cache)
+ self.post_processor = PostProcessingFst(
+ cache_dir=cache_dir, overwrite_cache=overwrite_cache
+ )
if deterministic:
- from fun_text_processing.text_normalization.en.taggers.tokenize_and_classify import ClassifyFst
+ from fun_text_processing.text_normalization.en.taggers.tokenize_and_classify import (
+ ClassifyFst,
+ )
else:
if lm:
- from fun_text_processing.text_normalization.en.taggers.tokenize_and_classify_lm import ClassifyFst
+ from fun_text_processing.text_normalization.en.taggers.tokenize_and_classify_lm import (
+ ClassifyFst,
+ )
else:
from fun_text_processing.text_normalization.en.taggers.tokenize_and_classify_with_audio import (
ClassifyFst,
)
- elif lang == 'ru':
+ elif lang == "ru":
# Ru TN only support non-deterministic cases and produces multiple normalization options
# use normalize_with_audio.py
- from fun_text_processing.text_normalization.ru.taggers.tokenize_and_classify import ClassifyFst
- from fun_text_processing.text_normalization.ru.verbalizers.verbalize_final import VerbalizeFinalFst
- elif lang == 'de':
- from fun_text_processing.text_normalization.de.taggers.tokenize_and_classify import ClassifyFst
- from fun_text_processing.text_normalization.de.verbalizers.verbalize_final import VerbalizeFinalFst
- elif lang == 'es':
- from fun_text_processing.text_normalization.es.taggers.tokenize_and_classify import ClassifyFst
- from fun_text_processing.text_normalization.es.verbalizers.verbalize_final import VerbalizeFinalFst
- elif lang == 'zh':
- from fun_text_processing.text_normalization.zh.taggers.tokenize_and_classify import ClassifyFst
- from fun_text_processing.text_normalization.zh.verbalizers.verbalize_final import VerbalizeFinalFst
+ from fun_text_processing.text_normalization.ru.taggers.tokenize_and_classify import (
+ ClassifyFst,
+ )
+ from fun_text_processing.text_normalization.ru.verbalizers.verbalize_final import (
+ VerbalizeFinalFst,
+ )
+ elif lang == "de":
+ from fun_text_processing.text_normalization.de.taggers.tokenize_and_classify import (
+ ClassifyFst,
+ )
+ from fun_text_processing.text_normalization.de.verbalizers.verbalize_final import (
+ VerbalizeFinalFst,
+ )
+ elif lang == "es":
+ from fun_text_processing.text_normalization.es.taggers.tokenize_and_classify import (
+ ClassifyFst,
+ )
+ from fun_text_processing.text_normalization.es.verbalizers.verbalize_final import (
+ VerbalizeFinalFst,
+ )
+ elif lang == "zh":
+ from fun_text_processing.text_normalization.zh.taggers.tokenize_and_classify import (
+ ClassifyFst,
+ )
+ from fun_text_processing.text_normalization.zh.verbalizers.verbalize_final import (
+ VerbalizeFinalFst,
+ )
self.tagger = ClassifyFst(
input_case=input_case,
deterministic=deterministic,
@@ -156,7 +168,9 @@
try:
normalized_texts = Parallel(n_jobs=n_jobs)(
- delayed(self.process_batch)(texts[i : i + batch], verbose, punct_pre_process, punct_post_process)
+ delayed(self.process_batch)(
+ texts[i : i + batch], verbose, punct_pre_process, punct_post_process
+ )
for i in range(0, len(texts), batch)
)
except BaseException as e:
@@ -176,7 +190,10 @@
"""
normalized_lines = [
self.normalize(
- text, verbose=verbose, punct_pre_process=punct_pre_process, punct_post_process=punct_post_process
+ text,
+ verbose=verbose,
+ punct_pre_process=punct_pre_process,
+ punct_post_process=punct_post_process,
)
for text in tqdm(batch)
]
@@ -250,7 +267,11 @@
return splits
def normalize(
- self, text: str, verbose: bool = False, punct_pre_process: bool = False, punct_post_process: bool = False
+ self,
+ text: str,
+ verbose: bool = False,
+ punct_pre_process: bool = False,
+ punct_post_process: bool = False,
) -> str:
"""
Main function. Normalizes tokens from written to spoken form
@@ -298,10 +319,10 @@
break
if verbalizer_lattice is None:
raise ValueError(f"No permutations were generated from tokens {s}")
- output += ' ' + self.select_verbalizer(verbalizer_lattice)
- output = SPACE_DUP.sub(' ', output[1:])
+ output += " " + self.select_verbalizer(verbalizer_lattice)
+ output = SPACE_DUP.sub(" ", output[1:])
- if self.lang == "en" and hasattr(self, 'post_processor'):
+ if self.lang == "en" and hasattr(self, "post_processor"):
output = self.post_process(output)
if punct_post_process:
@@ -323,11 +344,11 @@
Returns list of sentences
"""
- lower_case_unicode = ''
- upper_case_unicode = ''
+ lower_case_unicode = ""
+ upper_case_unicode = ""
if self.lang == "ru":
- lower_case_unicode = '\u0430-\u04FF'
- upper_case_unicode = '\u0410-\u042F'
+ lower_case_unicode = "\u0430-\u04FF"
+ upper_case_unicode = "\u0410-\u042F"
# Read and split transcript by utterance (roughly, sentences)
split_pattern = f"(?<!\w\.\w.)(?<![A-Z{upper_case_unicode}][a-z{lower_case_unicode}]+\.)(?<![A-Z{upper_case_unicode}]\.)(?<=\.|\?|\!|\.”|\?”|\!”)\s(?![0-9]+[a-z]*\.)"
@@ -353,10 +374,12 @@
subl = [""]
for k, v in perm:
if isinstance(v, str):
- subl = ["".join(x) for x in itertools.product(subl, [f"{k}: \"{v}\" "])]
+ subl = ["".join(x) for x in itertools.product(subl, [f'{k}: "{v}" '])]
elif isinstance(v, OrderedDict):
rec = self._permute(v)
- subl = ["".join(x) for x in itertools.product(subl, [f" {k} {{ "], rec, [f" }} "])]
+ subl = [
+ "".join(x) for x in itertools.product(subl, [f" {k} {{ "], rec, [f" }} "])
+ ]
elif isinstance(v, bool):
subl = ["".join(x) for x in itertools.product(subl, [f"{k}: true "])]
else:
@@ -394,7 +417,7 @@
return _helper("", tokens, 0)
- def find_tags(self, text: str) -> 'pynini.FstLike':
+ def find_tags(self, text: str) -> "pynini.FstLike":
"""
Given text use tagger Fst to tag text
@@ -406,7 +429,7 @@
lattice = text @ self.tagger.fst
return lattice
- def select_tag(self, lattice: 'pynini.FstLike') -> str:
+ def select_tag(self, lattice: "pynini.FstLike") -> str:
"""
Given tagged lattice return shortest path
@@ -418,7 +441,7 @@
tagged_text = pynini.shortestpath(lattice, nshortest=1, unique=True).string()
return tagged_text
- def find_verbalizer(self, tagged_text: str) -> 'pynini.FstLike':
+ def find_verbalizer(self, tagged_text: str) -> "pynini.FstLike":
"""
Given tagged text creates verbalization lattice
This is context-independent.
@@ -431,7 +454,7 @@
lattice = tagged_text @ self.verbalizer.fst
return lattice
- def select_verbalizer(self, lattice: 'pynini.FstLike') -> str:
+ def select_verbalizer(self, lattice: "pynini.FstLike") -> str:
"""
Given verbalized lattice return shortest path
@@ -445,7 +468,7 @@
# output = pynini.shortestpath(lattice, nshortest=1, unique=True).string()
return output
- def post_process(self, normalized_text: 'pynini.FstLike') -> str:
+ def post_process(self, normalized_text: "pynini.FstLike") -> str:
"""
Runs post processing graph on normalized text
@@ -469,22 +492,34 @@
input = parser.add_mutually_exclusive_group()
input.add_argument("--text", dest="input_string", help="input string", type=str)
input.add_argument("--input_file", dest="input_file", help="input file path", type=str)
- parser.add_argument('--output_file', dest="output_file", help="output file path", type=str)
- parser.add_argument("--language", help="language", choices=["en", "de", "es", "zh"], default="en", type=str)
+ parser.add_argument("--output_file", dest="output_file", help="output file path", type=str)
parser.add_argument(
- "--input_case", help="input capitalization", choices=["lower_cased", "cased"], default="cased", type=str
+ "--language", help="language", choices=["en", "de", "es", "zh"], default="en", type=str
)
- parser.add_argument("--verbose", help="print info for debugging", action='store_true')
+ parser.add_argument(
+ "--input_case",
+ help="input capitalization",
+ choices=["lower_cased", "cased"],
+ default="cased",
+ type=str,
+ )
+ parser.add_argument("--verbose", help="print info for debugging", action="store_true")
parser.add_argument(
"--punct_post_process",
help="set to True to enable punctuation post processing to match input.",
action="store_true",
)
parser.add_argument(
- "--punct_pre_process", help="set to True to enable punctuation pre processing", action="store_true"
+ "--punct_pre_process",
+ help="set to True to enable punctuation pre processing",
+ action="store_true",
)
- parser.add_argument("--overwrite_cache", help="set to True to re-create .far grammar files", action="store_true")
- parser.add_argument("--whitelist", help="path to a file with with whitelist", default=None, type=str)
+ parser.add_argument(
+ "--overwrite_cache", help="set to True to re-create .far grammar files", action="store_true"
+ )
+ parser.add_argument(
+ "--whitelist", help="path to a file with whitelist", default=None, type=str
+ )
parser.add_argument(
"--cache_dir",
help="path to a dir with .far grammar file. Set to None to avoid using cache",
--
Gitblit v1.9.1