游雁
2024-02-20 d79287c37e4e7ae2694a992cbbfb03a5ca4f7670
funasr/models/ct_transformer/utils.py
@@ -1,4 +1,10 @@
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
#  MIT License  (https://opensource.org/licenses/MIT)
import re
def split_to_mini_sentence(words: list, word_limit: int = 20):
    assert word_limit > 1
@@ -12,28 +18,6 @@
    if length % word_limit > 0:
        sentences.append(words[sentence_len * word_limit:])
    return sentences
# def split_words(text: str, **kwargs):
#     words = []
#     segs = text.split()
#     for seg in segs:
#         # There is no space in seg.
#         current_word = ""
#         for c in seg:
#             if len(c.encode()) == 1:
#                 # This is an ASCII char.
#                 current_word += c
#             else:
#                 # This is a Chinese char.
#                 if len(current_word) > 0:
#                     words.append(current_word)
#                     current_word = ""
#                 words.append(c)
#         if len(current_word) > 0:
#             words.append(current_word)
#
#     return words
def split_words(text: str, jieba_usr_dict=None, **kwargs):
    if jieba_usr_dict: