北念
2023-10-10 a4de8b2a0a69ba42c58d6bacb9c9108539a1e280
update asr postprocess_utils
2个文件已修改
63 ■■■■■ 已修改文件
funasr/bin/asr_inference_launch.py 10 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/utils/postprocess_utils.py 53 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_launch.py
@@ -488,6 +488,7 @@
):
    ncpu = kwargs.get("ncpu", 1)
    torch.set_num_threads(ncpu)
    language = kwargs.get("model_lang", None)
    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
@@ -694,10 +695,13 @@
            text, token, token_int = result[0], result[1], result[2]
            time_stamp = result[4] if len(result[4]) > 0 else None
            if use_timestamp and time_stamp is not None and len(time_stamp):
                postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
            if language == "en-bpe":
                postprocessed_result = postprocess_utils.sentence_postprocess_sentencepiece(token)
            else:
                postprocessed_result = postprocess_utils.sentence_postprocess(token)
                if use_timestamp and time_stamp is not None and len(time_stamp):
                    postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
                else:
                    postprocessed_result = postprocess_utils.sentence_postprocess(token)
            text_postprocessed = ""
            time_stamp_postprocessed = ""
            text_postprocessed_punc = postprocessed_result
funasr/utils/postprocess_utils.py
@@ -242,4 +242,55 @@
            if ch != ' ':
                real_word_lists.append(ch)
        sentence = ''.join(word_lists).strip()
        return sentence, real_word_lists
        return sentence, real_word_lists
def sentence_postprocess_sentencepiece(words):
    """Merge SentencePiece-style sub-word pieces into words and a sentence.

    A piece containing the marker character U+2581 starts a new word (the
    marker itself is removed); any other piece is appended to the word
    currently being built. The special tokens ``<s>``, ``</s>``, ``<unk>``
    and ``<OOV>`` are dropped before merging.

    Args:
        words: Iterable of pieces. Each item is either a ``str`` or an
            object with ``decode('utf-8')`` (e.g. ``bytes``).

    Returns:
        A ``(sentence, real_word_lists)`` tuple. ``sentence`` is the merged
        words joined by single spaces, with their original casing.
        ``real_word_lists`` is the list of merged words in which the
        lowercase forms ``i``, ``i'm``, ``i've`` and ``i'll`` are
        capitalised.
    """
    special_tokens = ('<s>', '</s>', '<unk>', '<OOV>')

    # Normalise every piece to str and drop the special tokens.
    pieces = []
    for item in words:
        piece = item if isinstance(item, str) else item.decode('utf-8')
        if piece not in special_tokens:
            pieces.append(piece)

    # Glue pieces into words; a piece carrying the marker opens a new word.
    merged_words = []
    current = ''
    for piece in pieces:
        if '\u2581' in piece:
            # Only flush a non-empty word: a lone marker piece (or a marker
            # on the very first piece) must not produce an empty entry.
            if current:
                merged_words.append(current)
            current = piece.replace('\u2581', '')
        else:
            current += piece
    # Flush the trailing word, but never emit an empty one (the previous
    # `is not None` check was always true and yielded [''] for empty input).
    if current:
        merged_words.append(current)

    # Capitalise the first-person pronoun forms in the word list only; the
    # sentence string keeps the pieces' original casing.
    capitalised = {"i": "I", "i'm": "I'm", "i've": "I've", "i'll": "I'll"}
    real_word_lists = [capitalised.get(word, word) for word in merged_words]

    sentence = ' '.join(merged_words)
    return sentence, real_word_lists