Merge branch 'main' of https://github.com/alibaba-damo-academy/FunASR into main
| | |
| | | if time_stamp is not None: |
| | | end = time_stamp[ts_nums[num]][1] |
| | | ts_lists.append([begin, end]) |
| | | else: |
| | | else: |
| | | word_lists.append(words[num]) |
| | | # The length of time_stamp may differ from the length of words because of the (somewhat improper) threshold set in timestamp_tools.py line 46; e.g., time_stamp can be empty while words is not. |
| | | # Therefore "word_lists.append(words[num])" is moved into the if clause, so that word_lists and ts_lists always have the same length. |
| | | if time_stamp is not None and ts_nums[num]<len(time_stamp) and words[num] != " ": |
| | | word_lists.append(words[num]) |
| | | if time_stamp is not None and ts_nums[num] < len(time_stamp) and words[num] != " ": |
| | | begin = time_stamp[ts_nums[num]][0] |
| | | end = time_stamp[ts_nums[num]][1] |
| | | ts_lists.append([begin, end]) |
| | |
| | | sentence = "".join(word_lists) |
| | | return sentence, real_word_lists |
| | | |
| | | |
# Emotion special tokens mapped to the emoji appended to a formatted segment.
# Insertion order matters: format_str_v2 walks this dict in order and only
# switches emotion on a strictly greater count, so earlier keys win ties.
emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}
| | | |
# Audio-event special tokens mapped to the emoji prepended to a formatted
# segment. Insertion order matters: format_str_v2 prepends in this order, so
# later keys end up further to the left in the output.
# NOTE(review): "<|Cough|>" maps to "🤧" here but to "😷" in emoji_dict, and
# event_set contains "😷" — confirm which emoji is intended before changing.
event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}
| | | |
# Language tags that rich_transcription_postprocess collapses into the single
# "<|lang|>" separator before splitting the transcript into segments.
# NOTE(review): only the three keys visible in this excerpt are listed; the
# excerpt has a gap after "<|yue|>", so verify against the complete file
# whether further tags (e.g. those present in emoji_dict) belong here.
lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
}
| | | |
# Every special token the formatter knows about, mapped to its emoji (or to ""
# when the token is simply dropped). format_str_v2 counts and removes these
# tokens in insertion order, so the compound "<|nospeech|><|Event_UNK|>" key
# must stay ahead of the shorter "<|nospeech|>" and "<|Event_UNK|>" keys.
emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}
| | | |
# Emoji that mark an emotion; rich_transcription_postprocess looks for one of
# these as the LAST character of a formatted segment.
emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}

# Emoji that mark an audio event; rich_transcription_postprocess looks for one
# of these as the FIRST character of a formatted segment.
event_set = {
    "🎼",
    "👏",
    "😀",
    "😭",
    "🤧",
    "😷",
}
| | | |
| | | |
def format_str_v2(s):
    """Replace the special tokens of one transcript segment with emoji.

    Every key of ``emoji_dict`` is counted in ``s`` and then removed from it.
    The emoji of each event token (``event_dict``) that occurred at least once
    is prepended, and the emoji of the most frequent emotion token
    (``emo_dict``) is appended. Finally, single spaces directly adjacent to
    any emotion/event emoji are removed.

    Args:
        s: Segment text that may contain ``<|...|>`` special tokens.

    Returns:
        The cleaned segment, stripped of leading/trailing whitespace.
    """
    # Count each special token, then strip it from the text. The dict order
    # is relied upon so that compound tokens are consumed before their parts.
    sptk_dict = {}
    for sptk in emoji_dict:
        sptk_dict[sptk] = s.count(sptk)
        s = s.replace(sptk, "")

    # Pick the dominant emotion; NEUTRAL is the baseline and only a strictly
    # higher count replaces the current choice.
    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        if sptk_dict[e] > sptk_dict[emo]:
            emo = e

    # Prepend one emoji per event type that was present in the segment.
    for e in event_dict:
        if sptk_dict[e] > 0:
            s = event_dict[e] + s
    s = s + emo_dict[emo]

    # Glue emoji to the neighbouring text by dropping the adjacent space.
    for emoji in emo_set.union(event_set):
        s = s.replace(" " + emoji, emoji)
        s = s.replace(emoji + " ", emoji)
    return s.strip()
| | | |
| | | |
def rich_transcription_postprocess(s):
    """Turn a raw rich transcription into display text with emoji.

    The text is split at language tags (the keys of ``lang_dict``), each
    segment is formatted with ``format_str_v2``, and the segments are joined
    back together. While joining, a leading event emoji that repeats the
    event of the previous non-empty segment is dropped, and a trailing
    emotion emoji of the accumulated text is dropped when the next segment
    ends with the same emotion.

    Args:
        s: Raw transcription containing ``<|...|>`` special tokens.

    Returns:
        The post-processed transcription, stripped of surrounding whitespace.
    """

    def get_emo(text):
        # Trailing emotion emoji of ``text``, or None (also for empty text).
        return text[-1] if text and text[-1] in emo_set else None

    def get_event(text):
        # Leading event emoji of ``text``, or None (also for empty text).
        return text[0] if text and text[0] in event_set else None

    # Must happen before the per-token handling, which would otherwise
    # consume the two halves of this compound marker separately.
    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]

    new_s = " " + s_list[0]
    # NOTE(review): new_s starts with a space, so this is always None here
    # and the first segment's event is never used for de-duplication.
    # Kept as-is to preserve the existing output.
    cur_ent_event = get_event(new_s)
    for segment in s_list[1:]:
        if not segment:
            continue
        event = get_event(segment)
        if event is not None and event == cur_ent_event:
            # Same event as the previous segment: drop the repeated emoji.
            segment = segment[1:]
            if not segment:
                # The segment held nothing but the repeated event emoji.
                # Previously this fell through and indexed into "" (raising
                # IndexError); skip it and keep tracking the current event.
                continue
        # else:
        cur_ent_event = get_event(segment)
        emo = get_emo(segment)
        if emo is not None and emo == get_emo(new_s):
            # Same emotion continues: keep only the final emoji.
            new_s = new_s[:-1]
        new_s += segment.strip()
    new_s = new_s.replace("The.", " ")
    return new_s.strip()