From 835369d6315e96c1820326ed11ea4b999793720f Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Sat, 13 Jan 2024 22:42:18 +0800
Subject: [PATCH] funasr1.0 fix punc model
---
funasr/models/ct_transformer/utils.py | 115 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 95 insertions(+), 20 deletions(-)
diff --git a/funasr/models/ct_transformer/utils.py b/funasr/models/ct_transformer/utils.py
index a4a00e0..917f2e0 100644
--- a/funasr/models/ct_transformer/utils.py
+++ b/funasr/models/ct_transformer/utils.py
@@ -1,4 +1,4 @@
-
+import re
def split_to_mini_sentence(words: list, word_limit: int = 20):
assert word_limit > 1
@@ -14,23 +14,98 @@
return sentences
-def split_words(text: str):
- words = []
- segs = text.split()
- for seg in segs:
- # There is no space in seg.
- current_word = ""
- for c in seg:
- if len(c.encode()) == 1:
- # This is an ASCII char.
- current_word += c
+# def split_words(text: str, **kwargs):
+# words = []
+# segs = text.split()
+# for seg in segs:
+# # There is no space in seg.
+# current_word = ""
+# for c in seg:
+# if len(c.encode()) == 1:
+# # This is an ASCII char.
+# current_word += c
+# else:
+# # This is a Chinese char.
+# if len(current_word) > 0:
+# words.append(current_word)
+# current_word = ""
+# words.append(c)
+# if len(current_word) > 0:
+# words.append(current_word)
+#
+# return words
+
+def split_words(text: str, jieba_usr_dict=None, **kwargs):
+    if jieba_usr_dict:
+        input_list = text.split()
+        token_list_all = []
+        language_list = []
+        token_list_tmp = []
+        language_flag = None
+        for token in input_list:
+            if isEnglish(token) and language_flag == 'Chinese':
+                token_list_all.append(token_list_tmp)
+                language_list.append('Chinese')
+                token_list_tmp = []
+            elif not isEnglish(token) and language_flag == 'English':
+                token_list_all.append(token_list_tmp)
+                language_list.append('English')
+                token_list_tmp = []
+
+            token_list_tmp.append(token)
+
+            if isEnglish(token):
+                language_flag = 'English'
            else:
-            # This is a Chinese char.
-            if len(current_word) > 0:
-                words.append(current_word)
-                current_word = ""
-            words.append(c)
-        if len(current_word) > 0:
-            words.append(current_word)
-
-    return words
+                language_flag = 'Chinese'
+
+        if token_list_tmp:
+            token_list_all.append(token_list_tmp)
+            language_list.append(language_flag)
+
+        result_list = []
+        for token_list_tmp, language_flag in zip(token_list_all, language_list):
+            if language_flag == 'English':
+                result_list.extend(token_list_tmp)
+            else:
+                seg_list = jieba_usr_dict.cut(join_chinese_and_english(token_list_tmp), HMM=False)
+                result_list.extend(seg_list)
+
+        return result_list
+
+ else:
+ words = []
+ segs = text.split()
+ for seg in segs:
+ # There is no space in seg.
+ current_word = ""
+ for c in seg:
+ if len(c.encode()) == 1:
+ # This is an ASCII char.
+ current_word += c
+ else:
+ # This is a Chinese char.
+ if len(current_word) > 0:
+ words.append(current_word)
+ current_word = ""
+ words.append(c)
+ if len(current_word) > 0:
+ words.append(current_word)
+ return words
+
+def isEnglish(text:str):
+ if re.search('^[a-zA-Z\']+$', text):
+ return True
+ else:
+ return False
+
+def join_chinese_and_english(input_list):
+ line = ''
+ for token in input_list:
+ if isEnglish(token):
+ line = line + ' ' + token
+ else:
+ line = line + token
+
+ line = line.strip()
+ return line
--
Gitblit v1.9.1