From d80ac2fd2df4e7fb8a28acfa512bb11472b5cc99 Mon Sep 17 00:00:00 2001
From: liugz18 <57401541+liugz18@users.noreply.github.com>
Date: 星期四, 18 七月 2024 21:34:55 +0800
Subject: [PATCH] Rename 'res' in line 514 to avoid a naming conflict with line 365
---
funasr/models/ct_transformer/utils.py | 88 +++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 86 insertions(+), 2 deletions(-)
diff --git a/funasr/models/ct_transformer/utils.py b/funasr/models/ct_transformer/utils.py
index 0291dbc..b6e11e7 100644
--- a/funasr/models/ct_transformer/utils.py
+++ b/funasr/models/ct_transformer/utils.py
@@ -1,3 +1,9 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+# MIT License (https://opensource.org/licenses/MIT)
+
+import re
def split_to_mini_sentence(words: list, word_limit: int = 20):
@@ -8,7 +14,85 @@
length = len(words)
sentence_len = length // word_limit
for i in range(sentence_len):
- sentences.append(words[i * word_limit:(i + 1) * word_limit])
+ sentences.append(words[i * word_limit : (i + 1) * word_limit])
if length % word_limit > 0:
- sentences.append(words[sentence_len * word_limit:])
+ sentences.append(words[sentence_len * word_limit :])
return sentences
+
+
+def split_words(text: str, jieba_usr_dict=None, **kwargs):
+ if jieba_usr_dict:
+ input_list = text.split()
+ token_list_all = []
+ langauge_list = []
+ token_list_tmp = []
+ language_flag = None
+ for token in input_list:
+ if isEnglish(token) and language_flag == "Chinese":
+ token_list_all.append(token_list_tmp)
+ langauge_list.append("Chinese")
+ token_list_tmp = []
+ elif not isEnglish(token) and language_flag == "English":
+ token_list_all.append(token_list_tmp)
+ langauge_list.append("English")
+ token_list_tmp = []
+
+ token_list_tmp.append(token)
+
+ if isEnglish(token):
+ language_flag = "English"
+ else:
+ language_flag = "Chinese"
+
+ if token_list_tmp:
+ token_list_all.append(token_list_tmp)
+ langauge_list.append(language_flag)
+
+ result_list = []
+ for token_list_tmp, language_flag in zip(token_list_all, langauge_list):
+ if language_flag == "English":
+ result_list.extend(token_list_tmp)
+ else:
+ seg_list = jieba_usr_dict.cut(join_chinese_and_english(token_list_tmp), HMM=False)
+ result_list.extend(seg_list)
+
+ return result_list
+
+ else:
+ words = []
+ segs = text.split()
+ for seg in segs:
+ # There is no space in seg.
+ current_word = ""
+ for c in seg:
+ if len(c.encode()) == 1:
+ # This is an ASCII char.
+ current_word += c
+ else:
+ # This is a non-ASCII (e.g. Chinese) char.
+ if len(current_word) > 0:
+ words.append(current_word)
+ current_word = ""
+ words.append(c)
+ if len(current_word) > 0:
+ words.append(current_word)
+ return words
+
+
+def isEnglish(text: str):
+ if re.search("^[a-zA-Z']+$", text):
+ return True
+ else:
+ return False
+
+
+def join_chinese_and_english(input_list):
+ line = ""
+ for token in input_list:
+ if isEnglish(token):
+ line = line + " " + token
+ else:
+ line = line + token
+
+ line = line.strip()
+ return line
--
Gitblit v1.9.1