北念
2024-07-10 5448e926a215066193f8c5a12e0c7dfe55c29579
add postprocess for sensevoice
2个文件已修改
117 ■■■■■ 已修改文件
examples/industrial_data_pretraining/sense_voice/demo.py 7 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/utils/postprocess_utils.py 110 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
examples/industrial_data_pretraining/sense_voice/demo.py
@@ -5,11 +5,14 @@
import sys
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
# Model identifier handed to AutoModel as its `model` argument.
model_dir = "iic/SenseVoiceSmall"

# Remote sample recording, addressed by HTTPS URL.
input_file = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"

# NOTE(review): this second assignment replaces the URL above, so the URL is
# never used. The path points into a developer-specific NFS mount and will
# presumably not exist on other machines -- confirm which input is intended.
input_file = "/nfs/beinian.lzr/workspace/models/modelscope_models/SenseVoice/example/rich_2.wav"
model = AutoModel(
    model=model_dir,
@@ -22,4 +25,6 @@
    use_itn=False,
)
print(res)
text = rich_transcription_postprocess(res[0]["text"])
print(text)
funasr/utils/postprocess_utils.py
@@ -299,3 +299,113 @@
            real_word_lists.append(ch)
    sentence = "".join(word_lists)
    return sentence, real_word_lists
# Emotion tag -> emoji appended to the end of a segment by format_str_v2.
# An empty string means the tag is recognised but renders nothing.
emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

# Audio-event tag -> emoji prepended to a segment by format_str_v2.
# NOTE(review): "<|Cough|>" maps to the same emoji as "<|Sneeze|>" here,
# whereas emoji_dict below maps it to "😷" -- confirm which one is intended.
event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}

# Every language tag (and the no-speech tag) collapses to one generic marker
# that rich_transcription_postprocess later splits on.
lang_dict = dict.fromkeys(
    (
        "<|zh|>",
        "<|en|>",
        "<|yue|>",
        "<|ja|>",
        "<|ko|>",
        "<|nospeech|>",
    ),
    "<|lang|>",
)

# All special tokens that format_str_v2 counts and strips from a segment.
# Insertion order matters: tokens are removed in this order, so the combined
# "<|nospeech|><|Event_UNK|>" entry is consumed before its two components.
emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

# Emoji that may terminate a segment: the non-empty values of emo_dict.
emo_set = {emoji for emoji in emo_dict.values() if emoji}

# Emoji that may open a segment (event markers).
event_set = {
    "🎼",
    "👏",
    "😀",
    "😭",
    "🤧",
    "😷",
}
def format_str_v2(s):
    """Replace the special tokens of one segment with emoji decorations.

    Every key of ``emoji_dict`` is counted and removed from ``s``. The
    emoji of each event tag that occurred is put in front of the remaining
    text, and the emoji of the most frequent emotion tag is put behind it.

    Args:
        s: One segment of tagged transcription text.

    Returns:
        The stripped segment text with event emoji in front and at most one
        emotion emoji at the end; no space is left next to an emoji.
    """
    tag_counts = {}
    # Counting and stripping stay interleaved, in emoji_dict order, so that
    # each token is counted on the text left over by the earlier removals.
    for tag in emoji_dict:
        tag_counts[tag] = s.count(tag)
        s = s.replace(tag, "")

    # max() returns the first maximal candidate, so "<|NEUTRAL|>" wins unless
    # another emotion occurs strictly more often, and ties between the other
    # emotions go to the one listed first in emo_dict.
    dominant = max(("<|NEUTRAL|>", *emo_dict), key=tag_counts.__getitem__)

    # Events are prepended one after another, so the event listed last in
    # event_dict ends up left-most.
    present = [event_dict[tag] for tag in event_dict if tag_counts[tag] > 0]
    s = "".join(reversed(present)) + s + emo_dict[dominant]

    # Remove one space on either side of every emoji.
    for emoji in emo_set | event_set:
        s = s.replace(" " + emoji, emoji).replace(emoji + " ", emoji)
    return s.strip()
def rich_transcription_postprocess(s):
    """Turn a tagged transcription string into plain text with emoji.

    The input is split into segments at every language tag (the keys of
    ``lang_dict``), each segment is rendered with ``format_str_v2``, and the
    rendered segments are concatenated. While concatenating, a segment's
    leading event emoji is dropped when it repeats the event tracked from
    the previous segment, and the trailing emotion emoji of the text built
    so far is dropped when the next segment ends with the same emotion.

    Args:
        s: Raw text containing ``<|...|>`` special tokens.

    Returns:
        The merged, stripped text.
    """

    def get_emo(s):
        # Trailing emotion emoji of `s`, or None. The emptiness guard matters:
        # a segment can become "" after its leading event emoji is removed.
        return s[-1] if s and s[-1] in emo_set else None

    def get_event(s):
        # Leading event emoji of `s`, or None (also for an empty string).
        return s[0] if s and s[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]

    # The leading space keeps new_s non-empty for the get_emo(new_s) calls
    # below; it is removed again by the final strip().
    new_s = " " + s_list[0]
    # NOTE(review): new_s starts with a space, so this is always None and an
    # event opening the first segment is never de-duplicated -- confirm that
    # this is intended.
    cur_ent_event = get_event(new_s)

    for segment in s_list[1:]:
        if not segment:
            continue
        event = get_event(segment)
        if event is not None and event == cur_ent_event:
            # Same event as tracked from the previous segment: drop the
            # repeated marker. This may leave `segment` empty.
            segment = segment[1:]
        # Re-read the event after the optional removal, as the original code
        # did; previously this raised IndexError when `segment` became "".
        cur_ent_event = get_event(segment)
        emo = get_emo(segment)
        if emo is not None and emo == get_emo(new_s):
            # Keep a repeated emotion only once, at the end of the run.
            new_s = new_s[:-1]
        new_s += segment.strip()

    # NOTE(review): the reason for blanking the literal "The." is not evident
    # from this file; presumably a workaround for a spurious output -- confirm.
    new_s = new_s.replace("The.", " ")
    return new_s.strip()