| funasr/bin/asr_inference_launch.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| funasr/utils/postprocess_utils.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 |
funasr/bin/asr_inference_launch.py
@@ -498,6 +498,7 @@ ): ncpu = kwargs.get("ncpu", 1) torch.set_num_threads(ncpu) language = kwargs.get("model_lang", None) if word_lm_train_config is not None: raise NotImplementedError("Word LM is not implemented") @@ -704,10 +705,13 @@ text, token, token_int = result[0], result[1], result[2] time_stamp = result[4] if len(result[4]) > 0 else None if use_timestamp and time_stamp is not None and len(time_stamp): postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp) if language == "en-bpe": postprocessed_result = postprocess_utils.sentence_postprocess_sentencepiece(token) else: postprocessed_result = postprocess_utils.sentence_postprocess(token) if use_timestamp and time_stamp is not None and len(time_stamp): postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp) else: postprocessed_result = postprocess_utils.sentence_postprocess(token) text_postprocessed = "" time_stamp_postprocessed = "" text_postprocessed_punc = postprocessed_result funasr/utils/postprocess_utils.py
def sentence_postprocess_sentencepiece(words):
    """Rebuild a plain-text sentence from SentencePiece-style tokens.

    Tokens may be ``str`` or UTF-8 encoded ``bytes``. The special tokens
    ``<s>``, ``</s>``, ``<unk>`` and ``<OOV>`` are dropped. A token that
    contains the marker ``'\\u2581'`` starts a new word (the marker itself
    is removed); any other token is appended to the word being built.

    Args:
        words: Iterable of tokens (``str`` or ``bytes``).

    Returns:
        A ``(sentence, word_list)`` tuple. ``sentence`` is the words joined
        by single spaces. ``word_list`` holds the same words with the
        first-person forms ``i``, ``i'm``, ``i've`` and ``i'll`` capitalised.
        Empty input (or input made only of special tokens) yields
        ``('', [])``.
    """
    special_tokens = ('<s>', '</s>', '<unk>', '<OOV>')
    first_person = {"i": "I", "i'm": "I'm", "i've": "I've", "i'll": "I'll"}

    # Normalise every token to str and drop the special tokens.
    pieces = []
    for token in words:
        piece = token if isinstance(token, str) else token.decode('utf-8')
        if piece not in special_tokens:
            pieces.append(piece)

    # Merge sub-word pieces into whole words.
    merged = []
    current = ''
    for piece in pieces:
        if '\u2581' in piece:
            # Word boundary: flush the word built so far. Skipping empty
            # words avoids emitting '' for empty input or a bare marker.
            if current:
                merged.append(current)
            current = piece.replace('\u2581', '')
        else:
            current += piece
    if current:
        merged.append(current)

    # NOTE(review): only the word list is capitalised; the sentence keeps
    # the lowercase forms, as in the previous implementation -- confirm
    # whether callers expect the sentence to be capitalised as well.
    # A word that is exactly one space is left out of the list, as before.
    real_word_lists = [first_person.get(w, w) for w in merged if w != ' ']
    sentence = ' '.join(merged)
    return sentence, real_word_lists