From db149dd89790040d19c10c0f0ebf52753f851dcf Mon Sep 17 00:00:00 2001
From: 北念 <lzr265946@alibaba-inc.com>
Date: 星期四, 09 十一月 2023 11:49:56 +0800
Subject: [PATCH] fix paraformer-en model python onnx postprocess

---
 runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py |   51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 51 insertions(+), 0 deletions(-)

diff --git a/runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py b/runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py
index c005fc9..14d6c76 100644
--- a/runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py
+++ b/runtime/python/onnxruntime/funasr_onnx/utils/postprocess_utils.py
@@ -240,3 +240,54 @@
                 real_word_lists.append(ch)
         sentence = ''.join(word_lists).strip()
         return sentence, real_word_lists
+
+def sentence_postprocess_sentencepiece(words):
+    """Merge SentencePiece tokens into words and a space-joined sentence.
+
+    A token containing U+2581 starts a new word; any other token is appended
+    to the current word. Special tokens are dropped, and empty words (from a
+    bare U+2581 token or from empty input) are skipped.
+
+    Args:
+        words: iterable of tokens, each a ``str`` or UTF-8 encoded ``bytes``.
+
+    Returns:
+        ``(sentence, real_word_lists)``: the words joined by single spaces,
+        and the word list with standalone "i" forms capitalized. Only the
+        list is capitalized; ``sentence`` keeps the decoded casing.
+    """
+    # Markers that carry no text and are removed before merging.
+    special_tokens = ('<s>', '</s>', '<unk>', '<OOV>')
+    # Lower-case first-person forms and their capitalized spelling.
+    capitalized = {
+        "i": "I",
+        "i'm": "I'm",
+        "i've": "I've",
+        "i'll": "I'll",
+    }
+
+    # Decode bytes tokens and drop the special markers.
+    pieces = []
+    for token in words:
+        piece = token if isinstance(token, str) else token.decode('utf-8')
+        if piece not in special_tokens:
+            pieces.append(piece)
+
+    # Group pieces into words, flushing at every U+2581 boundary marker.
+    merged = []
+    current = ''
+    for piece in pieces:
+        if '\u2581' in piece:
+            if current:
+                merged.append(current)
+            current = piece.replace('\u2581', '')
+        else:
+            current += piece
+    # A str is never None, so test emptiness to avoid a trailing '' word.
+    if current:
+        merged.append(current)
+
+    # Capitalization applies to the returned word list only.
+    real_word_lists = [capitalized.get(word, word) for word in merged]
+    sentence = ' '.join(merged)
+    return sentence, real_word_lists
\ No newline at end of file

--
Gitblit v1.9.1