| | |
| | | |
| | | |
| | | def isChinese(ch: str): |
| | | if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039': |
| | | if "\u4e00" <= ch <= "\u9fff" or "\u0030" <= ch <= "\u0039" or ch == "@": |
| | | return True |
| | | return False |
| | | |
| | |
| | | def isAllChinese(word: Union[List[Any], str]): |
| | | word_lists = [] |
| | | for i in word: |
| | | cur = i.replace(' ', '') |
| | | cur = cur.replace('</s>', '') |
| | | cur = cur.replace('<s>', '') |
| | | cur = i.replace(" ", "") |
| | | cur = cur.replace("</s>", "") |
| | | cur = cur.replace("<s>", "") |
| | | cur = cur.replace("<unk>", "") |
| | | cur = cur.replace("<OOV>", "") |
| | | word_lists.append(cur) |
| | | |
| | | if len(word_lists) == 0: |
| | |
| | | def isAllAlpha(word: Union[List[Any], str]): |
| | | word_lists = [] |
| | | for i in word: |
| | | cur = i.replace(' ', '') |
| | | cur = cur.replace('</s>', '') |
| | | cur = cur.replace('<s>', '') |
| | | cur = i.replace(" ", "") |
| | | cur = cur.replace("</s>", "") |
| | | cur = cur.replace("<s>", "") |
| | | cur = cur.replace("<unk>", "") |
| | | cur = cur.replace("<OOV>", "") |
| | | word_lists.append(cur) |
| | | |
| | | if len(word_lists) == 0: |
| | |
| | | if num <= last_num: |
| | | continue |
| | | |
| | | if len(words[num]) == 1 and words[num].encode('utf-8').isalpha(): |
| | | if num + 1 < words_size and words[ |
| | | num + 1] == ' ' and num + 2 < words_size and len( |
| | | words[num + |
| | | 2]) == 1 and words[num + |
| | | 2].encode('utf-8').isalpha(): |
| | | if len(words[num]) == 1 and words[num].encode("utf-8").isalpha(): |
| | | if ( |
| | | num + 1 < words_size |
| | | and words[num + 1] == " " |
| | | and num + 2 < words_size |
| | | and len(words[num + 2]) == 1 |
| | | and words[num + 2].encode("utf-8").isalpha() |
| | | ): |
| | | # found the begin of abbr |
| | | abbr_begin.append(num) |
| | | num += 2 |
| | |
| | | # to find the end of abbr |
| | | while True: |
| | | num += 1 |
| | | if num < words_size and words[num] == ' ': |
| | | if num < words_size and words[num] == " ": |
| | | num += 1 |
| | | if num < words_size and len( |
| | | words[num]) == 1 and words[num].encode( |
| | | 'utf-8').isalpha(): |
| | | if ( |
| | | num < words_size |
| | | and len(words[num]) == 1 |
| | | and words[num].encode("utf-8").isalpha() |
| | | ): |
| | | abbr_end.pop() |
| | | abbr_end.append(num) |
| | | last_num = num |
| | |
| | | break |
| | | |
| | | for num in range(words_size): |
| | | if words[num] == ' ': |
| | | if words[num] == " ": |
| | | ts_nums.append(ts_index) |
| | | else: |
| | | ts_nums.append(ts_index) |
| | | ts_index += 1 |
| | | ts_index += 1 |
| | | last_num = -1 |
| | | for num in range(words_size): |
| | | if num <= last_num: |
| | |
| | | if num in abbr_begin: |
| | | if time_stamp is not None: |
| | | begin = time_stamp[ts_nums[num]][0] |
| | | word_lists.append(words[num].upper()) |
| | | abbr_word = words[num].upper() |
| | | num += 1 |
| | | while num < words_size: |
| | | if num in abbr_end: |
| | | word_lists.append(words[num].upper()) |
| | | abbr_word += words[num].upper() |
| | | last_num = num |
| | | break |
| | | else: |
| | | if words[num].encode('utf-8').isalpha(): |
| | | word_lists.append(words[num].upper()) |
| | | if words[num].encode("utf-8").isalpha(): |
| | | abbr_word += words[num].upper() |
| | | num += 1 |
| | | if time_stamp is not None: |
| | | word_lists.append(abbr_word) |
| | | if time_stamp is not None and ts_nums[num] < len(time_stamp): |
| | | end = time_stamp[ts_nums[num]][1] |
| | | ts_lists.append([begin, end]) |
| | | else: |
| | | word_lists.append(words[num]) |
| | | if time_stamp is not None and words[num] != ' ': |
| | | # length of time_stamp may not equal to length of words because of the (somehow improper) threshold set in timestamp_tools.py line 46, e.g., length of time_stamp can be zero but length of words is not. |
| | | # Moreover, move "word_lists.append(words[num])" into if clause, to keep length of word_lists and length of ts_lists equal. |
| | | if time_stamp is not None and ts_nums[num] < len(time_stamp) and words[num] != " ": |
| | | begin = time_stamp[ts_nums[num]][0] |
| | | end = time_stamp[ts_nums[num]][1] |
| | | ts_lists.append([begin, end]) |
| | |
| | | def sentence_postprocess(words: List[Any], time_stamp: List[List] = None): |
| | | middle_lists = [] |
| | | word_lists = [] |
| | | word_item = '' |
| | | word_item = "" |
| | | ts_lists = [] |
| | | |
| | | # wash words lists |
| | | for i in words: |
| | | word = '' |
| | | word = "" |
| | | if isinstance(i, str): |
| | | word = i |
| | | else: |
| | | word = i.decode('utf-8') |
| | | word = i.decode("utf-8") |
| | | |
| | | if word in ['<s>', '</s>', '<unk>']: |
| | | if word in ["<s>", "</s>", "<unk>", "<OOV>"]: |
| | | continue |
| | | else: |
| | | middle_lists.append(word) |
| | |
| | | # all chinese characters |
| | | if isAllChinese(middle_lists): |
| | | for i, ch in enumerate(middle_lists): |
| | | word_lists.append(ch.replace(' ', '')) |
| | | word_lists.append(ch.replace(" ", "")) |
| | | if time_stamp is not None: |
| | | ts_lists = time_stamp |
| | | |
| | |
| | | if ts_flag and time_stamp is not None: |
| | | begin = time_stamp[i][0] |
| | | end = time_stamp[i][1] |
| | | word = '' |
| | | if '@@' in ch: |
| | | word = ch.replace('@@', '') |
| | | word = "" |
| | | if "@@" in ch: |
| | | word = ch.replace("@@", "") |
| | | word_item += word |
| | | if time_stamp is not None: |
| | | ts_flag = False |
| | |
| | | else: |
| | | word_item += ch |
| | | word_lists.append(word_item) |
| | | word_lists.append(' ') |
| | | word_item = '' |
| | | word_lists.append(" ") |
| | | word_item = "" |
| | | if time_stamp is not None: |
| | | ts_flag = True |
| | | end = time_stamp[i][1] |
| | |
| | | if ts_flag and time_stamp is not None: |
| | | begin = time_stamp[i][0] |
| | | end = time_stamp[i][1] |
| | | word = '' |
| | | word = "" |
| | | if isAllChinese(ch): |
| | | if alpha_blank is True: |
| | | word_lists.pop() |
| | |
| | | ts_flag = True |
| | | ts_lists.append([begin, end]) |
| | | begin = end |
| | | elif '@@' in ch: |
| | | word = ch.replace('@@', '') |
| | | elif "@@" in ch: |
| | | word = ch.replace("@@", "") |
| | | word_item += word |
| | | alpha_blank = False |
| | | if time_stamp is not None: |
| | |
| | | elif isAllAlpha(ch): |
| | | word_item += ch |
| | | word_lists.append(word_item) |
| | | word_lists.append(' ') |
| | | word_item = '' |
| | | word_lists.append(" ") |
| | | word_item = "" |
| | | alpha_blank = True |
| | | if time_stamp is not None: |
| | | ts_flag = True |
| | | end = time_stamp[i][1] |
| | | end = time_stamp[i][1] |
| | | ts_lists.append([begin, end]) |
| | | begin = end |
| | | else: |
| | | raise ValueError('invalid character: {}'.format(ch)) |
| | | word_lists.append(ch) |
| | | |
| | | if time_stamp is not None: |
| | | if time_stamp is not None: |
| | | word_lists, ts_lists = abbr_dispose(word_lists, ts_lists) |
| | | real_word_lists = [] |
| | | for ch in word_lists: |
| | | if ch != ' ': |
| | | if ch != " ": |
| | | real_word_lists.append(ch) |
| | | sentence = ' '.join(real_word_lists).strip() |
| | | sentence = " ".join(real_word_lists).strip() |
| | | return sentence, ts_lists, real_word_lists |
| | | else: |
| | | word_lists = abbr_dispose(word_lists) |
| | | sentence = ''.join(word_lists).strip() |
| | | return sentence |
| | | real_word_lists = [] |
| | | for ch in word_lists: |
| | | if ch != " ": |
| | | real_word_lists.append(ch) |
| | | sentence = "".join(word_lists).strip() |
| | | return sentence, real_word_lists |
| | | |
| | | |
| | | def sentence_postprocess_sentencepiece(words): |
| | | middle_lists = [] |
| | | word_lists = [] |
| | | word_item = "" |
| | | |
| | | # wash words lists |
| | | for i in words: |
| | | word = "" |
| | | if isinstance(i, str): |
| | | word = i |
| | | else: |
| | | word = i.decode("utf-8") |
| | | |
| | | if word in ["<s>", "</s>", "<unk>", "<OOV>"]: |
| | | continue |
| | | else: |
| | | middle_lists.append(word) |
| | | |
| | | # all alpha characters |
| | | for i, ch in enumerate(middle_lists): |
| | | word = "" |
| | | if "\u2581" in ch and i == 0: |
| | | word_item = "" |
| | | word = ch.replace("\u2581", "") |
| | | word_item += word |
| | | elif "\u2581" in ch and i != 0: |
| | | word_lists.append(word_item) |
| | | word_lists.append(" ") |
| | | word_item = "" |
| | | word = ch.replace("\u2581", "") |
| | | word_item += word |
| | | else: |
| | | word_item += ch |
| | | if word_item is not None: |
| | | word_lists.append(word_item) |
| | | # word_lists = abbr_dispose(word_lists) |
| | | real_word_lists = [] |
| | | for ch in word_lists: |
| | | if ch != " ": |
| | | if ch == "i": |
| | | ch = ch.replace("i", "I") |
| | | elif ch == "i'm": |
| | | ch = ch.replace("i'm", "I'm") |
| | | elif ch == "i've": |
| | | ch = ch.replace("i've", "I've") |
| | | elif ch == "i'll": |
| | | ch = ch.replace("i'll", "I'll") |
| | | real_word_lists.append(ch) |
| | | sentence = "".join(word_lists) |
| | | return sentence, real_word_lists |
| | | |
| | | |
| | | emo_dict = { |
| | | "<|HAPPY|>": "😊", |
| | | "<|SAD|>": "😔", |
| | | "<|ANGRY|>": "😡", |
| | | "<|NEUTRAL|>": "", |
| | | "<|FEARFUL|>": "😰", |
| | | "<|DISGUSTED|>": "🤢", |
| | | "<|SURPRISED|>": "😮", |
| | | } |
| | | |
| | | event_dict = { |
| | | "<|BGM|>": "🎼", |
| | | "<|Speech|>": "", |
| | | "<|Applause|>": "👏", |
| | | "<|Laughter|>": "😀", |
| | | "<|Cry|>": "😭", |
| | | "<|Sneeze|>": "🤧", |
| | | "<|Breath|>": "", |
| | | "<|Cough|>": "🤧", |
| | | } |
| | | |
| | | lang_dict = { |
| | | "<|zh|>": "<|lang|>", |
| | | "<|en|>": "<|lang|>", |
| | | "<|yue|>": "<|lang|>", |
| | | "<|ja|>": "<|lang|>", |
| | | "<|ko|>": "<|lang|>", |
| | | "<|nospeech|>": "<|lang|>", |
| | | } |
| | | |
| | | emoji_dict = { |
| | | "<|nospeech|><|Event_UNK|>": "❓", |
| | | "<|zh|>": "", |
| | | "<|en|>": "", |
| | | "<|yue|>": "", |
| | | "<|ja|>": "", |
| | | "<|ko|>": "", |
| | | "<|nospeech|>": "", |
| | | "<|HAPPY|>": "😊", |
| | | "<|SAD|>": "😔", |
| | | "<|ANGRY|>": "😡", |
| | | "<|NEUTRAL|>": "", |
| | | "<|BGM|>": "🎼", |
| | | "<|Speech|>": "", |
| | | "<|Applause|>": "👏", |
| | | "<|Laughter|>": "😀", |
| | | "<|FEARFUL|>": "😰", |
| | | "<|DISGUSTED|>": "🤢", |
| | | "<|SURPRISED|>": "😮", |
| | | "<|Cry|>": "😭", |
| | | "<|EMO_UNKNOWN|>": "", |
| | | "<|Sneeze|>": "🤧", |
| | | "<|Breath|>": "", |
| | | "<|Cough|>": "😷", |
| | | "<|Sing|>": "", |
| | | "<|Speech_Noise|>": "", |
| | | "<|withitn|>": "", |
| | | "<|woitn|>": "", |
| | | "<|GBG|>": "", |
| | | "<|Event_UNK|>": "", |
| | | } |
| | | |
| | | emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"} |
| | | event_set = { |
| | | "🎼", |
| | | "👏", |
| | | "😀", |
| | | "😭", |
| | | "🤧", |
| | | "😷", |
| | | } |
| | | |
| | | |
| | | def format_str_v2(s): |
| | | sptk_dict = {} |
| | | for sptk in emoji_dict: |
| | | sptk_dict[sptk] = s.count(sptk) |
| | | s = s.replace(sptk, "") |
| | | emo = "<|NEUTRAL|>" |
| | | for e in emo_dict: |
| | | if sptk_dict[e] > sptk_dict[emo]: |
| | | emo = e |
| | | for e in event_dict: |
| | | if sptk_dict[e] > 0: |
| | | s = event_dict[e] + s |
| | | s = s + emo_dict[emo] |
| | | |
| | | for emoji in emo_set.union(event_set): |
| | | s = s.replace(" " + emoji, emoji) |
| | | s = s.replace(emoji + " ", emoji) |
| | | return s.strip() |
| | | |
| | | |
| | | def rich_transcription_postprocess(s): |
| | | def get_emo(s): |
| | | return s[-1] if s[-1] in emo_set else None |
| | | |
| | | def get_event(s): |
| | | return s[0] if s[0] in event_set else None |
| | | |
| | | s = s.replace("<|nospeech|><|Event_UNK|>", "❓") |
| | | for lang in lang_dict: |
| | | s = s.replace(lang, "<|lang|>") |
| | | s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")] |
| | | new_s = " " + s_list[0] |
| | | cur_ent_event = get_event(new_s) |
| | | for i in range(1, len(s_list)): |
| | | if len(s_list[i]) == 0: |
| | | continue |
| | | if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None: |
| | | s_list[i] = s_list[i][1:] |
| | | # else: |
| | | cur_ent_event = get_event(s_list[i]) |
| | | if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s): |
| | | new_s = new_s[:-1] |
| | | new_s += s_list[i].strip().lstrip() |
| | | new_s = new_s.replace("The.", " ") |
| | | return new_s.strip() |