From 94de39dde2e616a01683c518023d0fab72b4e103 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Mon, 19 Feb 2024 22:21:50 +0800
Subject: [PATCH] aishell example
---
funasr/utils/postprocess_utils.py | 74 +++++++++++++++++++++++++++++++++---
1 file changed, 67 insertions(+), 7 deletions(-)
diff --git a/funasr/utils/postprocess_utils.py b/funasr/utils/postprocess_utils.py
index 4da0d59..efba755 100644
--- a/funasr/utils/postprocess_utils.py
+++ b/funasr/utils/postprocess_utils.py
@@ -6,7 +6,7 @@
def isChinese(ch: str):
- if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039':
+ if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039' or ch == '@':
return True
return False
@@ -17,6 +17,8 @@
cur = i.replace(' ', '')
cur = cur.replace('</s>', '')
cur = cur.replace('<s>', '')
+ cur = cur.replace('<unk>', '')
+ cur = cur.replace('<OOV>', '')
word_lists.append(cur)
if len(word_lists) == 0:
@@ -34,6 +36,8 @@
cur = i.replace(' ', '')
cur = cur.replace('</s>', '')
cur = cur.replace('<s>', '')
+ cur = cur.replace('<unk>', '')
+ cur = cur.replace('<OOV>', '')
word_lists.append(cur)
if len(word_lists) == 0:
@@ -102,17 +106,18 @@
if num in abbr_begin:
if time_stamp is not None:
begin = time_stamp[ts_nums[num]][0]
- word_lists.append(words[num].upper())
+ abbr_word = words[num].upper()
num += 1
while num < words_size:
if num in abbr_end:
- word_lists.append(words[num].upper())
+ abbr_word += words[num].upper()
last_num = num
break
else:
if words[num].encode('utf-8').isalpha():
- word_lists.append(words[num].upper())
+ abbr_word += words[num].upper()
num += 1
+ word_lists.append(abbr_word)
if time_stamp is not None:
end = time_stamp[ts_nums[num]][1]
ts_lists.append([begin, end])
@@ -144,7 +149,7 @@
else:
word = i.decode('utf-8')
- if word in ['<s>', '</s>', '<unk>']:
+ if word in ['<s>', '</s>', '<unk>', '<OOV>']:
continue
else:
middle_lists.append(word)
@@ -220,7 +225,7 @@
ts_lists.append([begin, end])
begin = end
else:
- raise ValueError('invalid character: {}'.format(ch))
+ word_lists.append(ch)
if time_stamp is not None:
word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
@@ -232,5 +237,60 @@
return sentence, ts_lists, real_word_lists
else:
word_lists = abbr_dispose(word_lists)
+ real_word_lists = []
+ for ch in word_lists:
+ if ch != ' ':
+ real_word_lists.append(ch)
sentence = ''.join(word_lists).strip()
- return sentence
+ return sentence, real_word_lists
+
+def sentence_postprocess_sentencepiece(words):
+    """Merge SentencePiece pieces into words and a space-joined sentence.
+
+    Special tokens (``<s>``, ``</s>``, ``<unk>``, ``<OOV>``) are dropped. A
+    piece containing U+2581 starts a new word (the marker itself is removed);
+    any other piece is appended to the word currently being built.
+
+    Args:
+        words: iterable of pieces, each a ``str`` or UTF-8 encoded ``bytes``.
+
+    Returns:
+        A ``(sentence, real_word_lists)`` tuple. ``sentence`` is the merged
+        words joined by single spaces. ``real_word_lists`` is the list of
+        merged words, in which a lone ``i`` and the contractions ``i'm``,
+        ``i've`` and ``i'll`` are spelled with a capital ``I``.
+    """
+    # Tokens that are skipped before merging.
+    special_tokens = ('<s>', '</s>', '<unk>', '<OOV>')
+    capitalized = {
+        "i": "I",
+        "i'm": "I'm",
+        "i've": "I've",
+        "i'll": "I'll",
+    }
+
+    merged_words = []
+    current = ''
+    for item in words:
+        piece = item if isinstance(item, str) else item.decode('utf-8')
+        if piece in special_tokens:
+            continue
+        if '\u2581' in piece:
+            # Word boundary: flush the finished word. Empty words are never
+            # emitted, so a bare U+2581 piece cannot produce a '' token or a
+            # doubled space.
+            if current:
+                merged_words.append(current)
+            current = piece.replace('\u2581', '')
+        else:
+            current += piece
+    # Flush the last word. Testing for emptiness (``is not None`` is always
+    # true for a str) keeps an empty input from yielding [''].
+    if current:
+        merged_words.append(current)
+
+    # Only the word list is capitalized; the sentence keeps the casing of the
+    # pieces.
+    sentence = ' '.join(merged_words)
+    real_word_lists = [capitalized.get(word, word) for word in merged_words]
+    return sentence, real_word_lists
\ No newline at end of file
--
Gitblit v1.9.1