funasr/models/ct_transformer/utils.py
@@ -12,3 +12,25 @@ if length % word_limit > 0: sentences.append(words[sentence_len * word_limit:]) return sentences def split_words(text: str): words = [] segs = text.split() for seg in segs: # There is no space in seg. current_word = "" for c in seg: if len(c.encode()) == 1: # This is an ASCII char. current_word += c else: # This is a Chinese char. if len(current_word) > 0: words.append(current_word) current_word = "" words.append(c) if len(current_word) > 0: words.append(current_word) return words