From d80ac2fd2df4e7fb8a28acfa512bb11472b5cc99 Mon Sep 17 00:00:00 2001
From: liugz18 <57401541+liugz18@users.noreply.github.com>
Date: Thu, 18 Jul 2024 21:34:55 +0800
Subject: [PATCH] Rename 'res' in line 514 to avoid a naming conflict with line 365

---
 funasr/models/ct_transformer/utils.py |   88 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 2 deletions(-)

diff --git a/funasr/models/ct_transformer/utils.py b/funasr/models/ct_transformer/utils.py
index 0291dbc..b6e11e7 100644
--- a/funasr/models/ct_transformer/utils.py
+++ b/funasr/models/ct_transformer/utils.py
@@ -1,3 +1,9 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+#  MIT License  (https://opensource.org/licenses/MIT)
+
+import re
 
 
 def split_to_mini_sentence(words: list, word_limit: int = 20):
@@ -8,7 +14,85 @@
     length = len(words)
     sentence_len = length // word_limit
     for i in range(sentence_len):
-        sentences.append(words[i * word_limit:(i + 1) * word_limit])
+        sentences.append(words[i * word_limit : (i + 1) * word_limit])
     if length % word_limit > 0:
-        sentences.append(words[sentence_len * word_limit:])
+        sentences.append(words[sentence_len * word_limit :])
     return sentences
+
+
+def split_words(text: str, jieba_usr_dict=None, **kwargs):
+    if jieba_usr_dict:
+        input_list = text.split()
+        token_list_all = []
+        langauge_list = []
+        token_list_tmp = []
+        language_flag = None
+        for token in input_list:
+            if isEnglish(token) and language_flag == "Chinese":
+                token_list_all.append(token_list_tmp)
+                langauge_list.append("Chinese")
+                token_list_tmp = []
+            elif not isEnglish(token) and language_flag == "English":
+                token_list_all.append(token_list_tmp)
+                langauge_list.append("English")
+                token_list_tmp = []
+
+            token_list_tmp.append(token)
+
+            if isEnglish(token):
+                language_flag = "English"
+            else:
+                language_flag = "Chinese"
+
+        if token_list_tmp:
+            token_list_all.append(token_list_tmp)
+            langauge_list.append(language_flag)
+
+        result_list = []
+        for token_list_tmp, language_flag in zip(token_list_all, langauge_list):
+            if language_flag == "English":
+                result_list.extend(token_list_tmp)
+            else:
+                seg_list = jieba_usr_dict.cut(join_chinese_and_english(token_list_tmp), HMM=False)
+                result_list.extend(seg_list)
+
+        return result_list
+
+    else:
+        words = []
+        segs = text.split()
+        for seg in segs:
+            # There is no space in seg.
+            current_word = ""
+            for c in seg:
+                if len(c.encode()) == 1:
+                    # This is an ASCII char.
+                    current_word += c
+                else:
+                    # This is a Chinese char.
+                    if len(current_word) > 0:
+                        words.append(current_word)
+                        current_word = ""
+                    words.append(c)
+            if len(current_word) > 0:
+                words.append(current_word)
+        return words
+
+
+def isEnglish(text: str):
+    if re.search("^[a-zA-Z']+$", text):
+        return True
+    else:
+        return False
+
+
+def join_chinese_and_english(input_list):
+    line = ""
+    for token in input_list:
+        if isEnglish(token):
+            line = line + " " + token
+        else:
+            line = line + token
+
+    line = line.strip()
+    return line

--
Gitblit v1.9.1