def split_words(text: str):
    """Tokenize *text* into ASCII words and individual multi-byte characters.

    The text is first split on whitespace. Inside each whitespace-free
    segment, consecutive characters whose UTF-8 encoding is a single byte
    are grouped into one token, while every character that encodes to more
    than one byte (for example a Chinese character) becomes a token of its
    own.

    Args:
        text: The string to tokenize.

    Returns:
        A list of string tokens in their original order. Empty or
        whitespace-only input yields an empty list.
    """
    tokens = []
    for chunk in text.split():
        # A chunk never contains whitespace, so only the byte width of each
        # character decides where a token ends.
        ascii_run = []
        for ch in chunk:
            if len(ch.encode()) != 1:
                # Multi-byte character: flush the pending ASCII run first so
                # the token order follows the text, then emit the character
                # as a standalone token.
                if ascii_run:
                    tokens.append("".join(ascii_run))
                    ascii_run = []
                tokens.append(ch)
                continue
            # Single-byte (ASCII) character: extend the current run.
            ascii_run.append(ch)
        # Flush whatever ASCII run is left at the end of the chunk.
        if ascii_run:
            tokens.append("".join(ascii_run))
    return tokens
|
|
|
def split_to_mini_sentence(words: list, word_limit: int = 20):
    """Cut *words* into consecutive chunks of at most *word_limit* items.

    Args:
        words: The sequence of tokens to partition.
        word_limit: Maximum number of items per chunk; must be greater
            than 1 (checked with an ``assert``).

    Returns:
        A list of chunks. When ``words`` already fits within the limit the
        result is ``[words]``, holding the very same object rather than a
        copy. Otherwise each chunk is a slice of ``words``; all chunks have
        exactly ``word_limit`` items except possibly the last, which holds
        the remainder.
    """
    assert word_limit > 1
    total = len(words)
    if total <= word_limit:
        # Short input is passed through untouched (no slicing, no copy).
        return [words]
    # Stepping the start offset by word_limit produces every full chunk and,
    # when the length is not an exact multiple, one shorter trailing chunk.
    return [
        words[start:start + word_limit]
        for start in range(0, total, word_limit)
    ]
|