From 129cfcd9f283dea0d64f2e20b77662febc2d802c Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期四, 23 三月 2023 10:01:32 +0800
Subject: [PATCH] cer tool

---
 funasr/utils/postprocess_utils.py |   21 +++++++++++++++------
 1 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/funasr/utils/postprocess_utils.py b/funasr/utils/postprocess_utils.py
index 4da0d59..40756d8 100644
--- a/funasr/utils/postprocess_utils.py
+++ b/funasr/utils/postprocess_utils.py
@@ -6,7 +6,7 @@
 
 
 def isChinese(ch: str):
-    if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039':
+    if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039' or ch == '@':
         return True
     return False
 
@@ -17,6 +17,8 @@
         cur = i.replace(' ', '')
         cur = cur.replace('</s>', '')
         cur = cur.replace('<s>', '')
+        cur = cur.replace('<unk>', '')
+        cur = cur.replace('<OOV>', '')
         word_lists.append(cur)
 
     if len(word_lists) == 0:
@@ -34,6 +36,8 @@
         cur = i.replace(' ', '')
         cur = cur.replace('</s>', '')
         cur = cur.replace('<s>', '')
+        cur = cur.replace('<unk>', '')
+        cur = cur.replace('<OOV>', '')
         word_lists.append(cur)
 
     if len(word_lists) == 0:
@@ -102,17 +106,18 @@
         if num in abbr_begin:
             if time_stamp is not None:
                 begin = time_stamp[ts_nums[num]][0]
-            word_lists.append(words[num].upper())
+            abbr_word = words[num].upper()
             num += 1
             while num < words_size:
                 if num in abbr_end:
-                    word_lists.append(words[num].upper())
+                    abbr_word += words[num].upper()
                     last_num = num
                     break
                 else:
                     if words[num].encode('utf-8').isalpha():
-                        word_lists.append(words[num].upper())
+                        abbr_word += words[num].upper()
                 num += 1
+            word_lists.append(abbr_word)
             if time_stamp is not None:
                 end = time_stamp[ts_nums[num]][1]
                 ts_lists.append([begin, end])
@@ -144,7 +149,7 @@
         else:
             word = i.decode('utf-8')
 
-        if word in ['<s>', '</s>', '<unk>']:
+        if word in ['<s>', '</s>', '<unk>', '<OOV>']:
             continue
         else:
             middle_lists.append(word)
@@ -232,5 +237,9 @@
         return sentence, ts_lists, real_word_lists
     else:
         word_lists = abbr_dispose(word_lists)
+        real_word_lists = []
+        for ch in word_lists:
+            if ch != ' ':
+                real_word_lists.append(ch)
         sentence = ''.join(word_lists).strip()
-        return sentence
+        return sentence, real_word_lists

--
Gitblit v1.9.1