From 35caed5dbc9eb83efab3051ed6b7504d42ae652b Mon Sep 17 00:00:00 2001
From: Lizerui9926 <110582652+Lizerui9926@users.noreply.github.com>
Date: Tue, 10 Oct 2023 16:00:50 +0800
Subject: [PATCH] Merge pull request #996 from alibaba-damo-academy/dev_lzr_en
---
funasr/bin/asr_inference_launch.py | 10 +++-
funasr/utils/postprocess_utils.py | 53 ++++++++++++++++++++++++++
2 files changed, 59 insertions(+), 4 deletions(-)
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index f4140e1..1288777 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -498,6 +498,7 @@
):
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
+ language = kwargs.get("model_lang", None)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
@@ -704,10 +705,13 @@
text, token, token_int = result[0], result[1], result[2]
time_stamp = result[4] if len(result[4]) > 0 else None
- if use_timestamp and time_stamp is not None and len(time_stamp):
- postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
+ if language == "en-bpe":
+ postprocessed_result = postprocess_utils.sentence_postprocess_sentencepiece(token)
else:
- postprocessed_result = postprocess_utils.sentence_postprocess(token)
+ if use_timestamp and time_stamp is not None and len(time_stamp):
+ postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
+ else:
+ postprocessed_result = postprocess_utils.sentence_postprocess(token)
text_postprocessed = ""
time_stamp_postprocessed = ""
text_postprocessed_punc = postprocessed_result
diff --git a/funasr/utils/postprocess_utils.py b/funasr/utils/postprocess_utils.py
index f4efea6..efba755 100644
--- a/funasr/utils/postprocess_utils.py
+++ b/funasr/utils/postprocess_utils.py
@@ -242,4 +242,55 @@
if ch != ' ':
real_word_lists.append(ch)
sentence = ''.join(word_lists).strip()
- return sentence, real_word_lists
\ No newline at end of file
+ return sentence, real_word_lists
+
+def sentence_postprocess_sentencepiece(words):
+    """Join SentencePiece tokens into a sentence and a list of words.
+
+    The special tokens ``<s>``, ``</s>``, ``<unk>`` and ``<OOV>`` are dropped.
+    A token containing U+2581 starts a new word (the marker is removed);
+    any other token is appended to the word currently being built.
+
+    Args:
+        words: iterable of tokens, each a ``str`` or UTF-8 encoded ``bytes``.
+
+    Returns:
+        A ``(sentence, real_word_lists)`` tuple. ``sentence`` is the words
+        joined by single spaces. ``real_word_lists`` holds the same words,
+        with "i", "i'm", "i've" and "i'll" capitalized.
+
+    Example:
+        >>> sentence_postprocess_sentencepiece(
+        ...     ['<s>', '\u2581i', "'m", '\u2581hap', 'py', '</s>'])
+        ("i'm happy", ["I'm", 'happy'])
+    """
+    special_tokens = ('<s>', '</s>', '<unk>', '<OOV>')
+    capitalized = {
+        "i": "I",
+        "i'm": "I'm",
+        "i've": "I've",
+        "i'll": "I'll",
+    }
+
+    # Decode byte tokens and drop the special markers.
+    pieces = []
+    for token in words:
+        piece = token if isinstance(token, str) else token.decode('utf-8')
+        if piece not in special_tokens:
+            pieces.append(piece)
+
+    # Merge sub-word pieces: U+2581 marks the first piece of a word.
+    merged = []
+    for piece in pieces:
+        if '\u2581' in piece or not merged:
+            merged.append(piece.replace('\u2581', ''))
+        else:
+            merged[-1] += piece
+
+    # Drop empty words (empty input, or a token that is only U+2581) so the
+    # result never contains a phantom '' entry or a stray space.
+    word_lists = [word for word in merged if word]
+    real_word_lists = [capitalized.get(word, word) for word in word_lists]
+    # NOTE(review): sentence keeps the un-capitalized words, as before.
+    sentence = ' '.join(word_lists)
+    return sentence, real_word_lists
\ No newline at end of file
--
Gitblit v1.9.1