From 6be52a387938e40961194dfb79d079ab24137b32 Mon Sep 17 00:00:00 2001
From: chong.zhang <chong.zhang@alibaba-inc.com>
Date: 星期五, 05 五月 2023 13:01:25 +0800
Subject: [PATCH] update
---
/dev/null | 245 -------------------------------------------------
1 files changed, 0 insertions(+), 245 deletions(-)
diff --git a/funasr/utils/postprocess_utils.py b/funasr/utils/postprocess_utils.py
deleted file mode 100644
index b607e1d..0000000
--- a/funasr/utils/postprocess_utils.py
+++ /dev/null
@@ -1,245 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import string
-import logging
-from typing import Any, List, Union
-
-
-def isChinese(ch: str):
- if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039' or ch == '@':
- return True
- return False
-
-
-def isAllChinese(word: Union[List[Any], str]):
- word_lists = []
- for i in word:
- cur = i.replace(' ', '')
- cur = cur.replace('</s>', '')
- cur = cur.replace('<s>', '')
- cur = cur.replace('<unk>', '')
- cur = cur.replace('<OOV>', '')
- word_lists.append(cur)
-
- if len(word_lists) == 0:
- return False
-
- for ch in word_lists:
- if isChinese(ch) is False:
- return False
- return True
-
-
-def isAllAlpha(word: Union[List[Any], str]):
- word_lists = []
- for i in word:
- cur = i.replace(' ', '')
- cur = cur.replace('</s>', '')
- cur = cur.replace('<s>', '')
- cur = cur.replace('<unk>', '')
- cur = cur.replace('<OOV>', '')
- word_lists.append(cur)
-
- if len(word_lists) == 0:
- return False
-
- for ch in word_lists:
- if ch.isalpha() is False and ch != "'":
- return False
- elif ch.isalpha() is True and isChinese(ch) is True:
- return False
-
- return True
-
-
-# def abbr_dispose(words: List[Any]) -> List[Any]:
-def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
- words_size = len(words)
- word_lists = []
- abbr_begin = []
- abbr_end = []
- last_num = -1
- ts_lists = []
- ts_nums = []
- ts_index = 0
- for num in range(words_size):
- if num <= last_num:
- continue
-
- if len(words[num]) == 1 and words[num].encode('utf-8').isalpha():
- if num + 1 < words_size and words[
- num + 1] == ' ' and num + 2 < words_size and len(
- words[num +
- 2]) == 1 and words[num +
- 2].encode('utf-8').isalpha():
- # found the begin of abbr
- abbr_begin.append(num)
- num += 2
- abbr_end.append(num)
- # to find the end of abbr
- while True:
- num += 1
- if num < words_size and words[num] == ' ':
- num += 1
- if num < words_size and len(
- words[num]) == 1 and words[num].encode(
- 'utf-8').isalpha():
- abbr_end.pop()
- abbr_end.append(num)
- last_num = num
- else:
- break
- else:
- break
-
- for num in range(words_size):
- if words[num] == ' ':
- ts_nums.append(ts_index)
- else:
- ts_nums.append(ts_index)
- ts_index += 1
- last_num = -1
- for num in range(words_size):
- if num <= last_num:
- continue
-
- if num in abbr_begin:
- if time_stamp is not None:
- begin = time_stamp[ts_nums[num]][0]
- abbr_word = words[num].upper()
- num += 1
- while num < words_size:
- if num in abbr_end:
- abbr_word += words[num].upper()
- last_num = num
- break
- else:
- if words[num].encode('utf-8').isalpha():
- abbr_word += words[num].upper()
- num += 1
- word_lists.append(abbr_word)
- if time_stamp is not None:
- end = time_stamp[ts_nums[num]][1]
- ts_lists.append([begin, end])
- else:
- word_lists.append(words[num])
- if time_stamp is not None and words[num] != ' ':
- begin = time_stamp[ts_nums[num]][0]
- end = time_stamp[ts_nums[num]][1]
- ts_lists.append([begin, end])
- begin = end
-
- if time_stamp is not None:
- return word_lists, ts_lists
- else:
- return word_lists
-
-
-def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
- middle_lists = []
- word_lists = []
- word_item = ''
- ts_lists = []
-
- # wash words lists
- for i in words:
- word = ''
- if isinstance(i, str):
- word = i
- else:
- word = i.decode('utf-8')
-
- if word in ['<s>', '</s>', '<unk>', '<OOV>']:
- continue
- else:
- middle_lists.append(word)
-
- # all chinese characters
- if isAllChinese(middle_lists):
- for i, ch in enumerate(middle_lists):
- word_lists.append(ch.replace(' ', ''))
- if time_stamp is not None:
- ts_lists = time_stamp
-
- # all alpha characters
- elif isAllAlpha(middle_lists):
- ts_flag = True
- for i, ch in enumerate(middle_lists):
- if ts_flag and time_stamp is not None:
- begin = time_stamp[i][0]
- end = time_stamp[i][1]
- word = ''
- if '@@' in ch:
- word = ch.replace('@@', '')
- word_item += word
- if time_stamp is not None:
- ts_flag = False
- end = time_stamp[i][1]
- else:
- word_item += ch
- word_lists.append(word_item)
- word_lists.append(' ')
- word_item = ''
- if time_stamp is not None:
- ts_flag = True
- end = time_stamp[i][1]
- ts_lists.append([begin, end])
- begin = end
-
- # mix characters
- else:
- alpha_blank = False
- ts_flag = True
- begin = -1
- end = -1
- for i, ch in enumerate(middle_lists):
- if ts_flag and time_stamp is not None:
- begin = time_stamp[i][0]
- end = time_stamp[i][1]
- word = ''
- if isAllChinese(ch):
- if alpha_blank is True:
- word_lists.pop()
- word_lists.append(ch)
- alpha_blank = False
- if time_stamp is not None:
- ts_flag = True
- ts_lists.append([begin, end])
- begin = end
- elif '@@' in ch:
- word = ch.replace('@@', '')
- word_item += word
- alpha_blank = False
- if time_stamp is not None:
- ts_flag = False
- end = time_stamp[i][1]
- elif isAllAlpha(ch):
- word_item += ch
- word_lists.append(word_item)
- word_lists.append(' ')
- word_item = ''
- alpha_blank = True
- if time_stamp is not None:
- ts_flag = True
- end = time_stamp[i][1]
- ts_lists.append([begin, end])
- begin = end
- else:
- word_lists.append(ch)
-
- if time_stamp is not None:
- word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
- real_word_lists = []
- for ch in word_lists:
- if ch != ' ':
- real_word_lists.append(ch)
- sentence = ' '.join(real_word_lists).strip()
- return sentence, ts_lists, real_word_lists
- else:
- word_lists = abbr_dispose(word_lists)
- real_word_lists = []
- for ch in word_lists:
- if ch != ' ':
- real_word_lists.append(ch)
- sentence = ''.join(word_lists).strip()
- return sentence, real_word_lists
--
Gitblit v1.9.1