From a73123bcfc14370b74b17084bc124f00c48613e4 Mon Sep 17 00:00:00 2001
From: smohan-speech <smohan@mail.ustc.edu.cn>
Date: Sat, 06 May 2023 16:17:48 +0800
Subject: [PATCH] add speaker-attributed ASR task for alimeeting
---
funasr/utils/postprocess_utils.py | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/funasr/utils/postprocess_utils.py b/funasr/utils/postprocess_utils.py
index 575fb90..014a79f 100644
--- a/funasr/utils/postprocess_utils.py
+++ b/funasr/utils/postprocess_utils.py
@@ -6,7 +6,7 @@
def isChinese(ch: str):
- if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039':
+ if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039' or ch == '@':
return True
return False
@@ -17,6 +17,8 @@
cur = i.replace(' ', '')
cur = cur.replace('</s>', '')
cur = cur.replace('<s>', '')
+ cur = cur.replace('<unk>', '')
+ cur = cur.replace('<OOV>', '')
word_lists.append(cur)
if len(word_lists) == 0:
@@ -34,6 +36,8 @@
cur = i.replace(' ', '')
cur = cur.replace('</s>', '')
cur = cur.replace('<s>', '')
+ cur = cur.replace('<unk>', '')
+ cur = cur.replace('<OOV>', '')
word_lists.append(cur)
if len(word_lists) == 0:
@@ -144,7 +148,7 @@
else:
word = i.decode('utf-8')
- if word in ['<s>', '</s>', '<unk>']:
+ if word in ['<s>', '</s>', '<unk>', '<OOV>']:
continue
else:
middle_lists.append(word)
@@ -220,7 +224,7 @@
ts_lists.append([begin, end])
begin = end
else:
- raise ValueError('invalid character: {}'.format(ch))
+ word_lists.append(ch)
if time_stamp is not None:
word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
--
Gitblit v1.9.1