python/FunASR-XL.git

			@@ -1,23 +1,24 @@
			# Copyright (c) Alibaba, Inc. and its affiliates.

			import string
			import logging
			from typing import Any, List, Union


			def isChinese(ch: str):
			    """Return True if *ch* is treated as a Chinese-side character.

			    Three kinds of input are accepted: a CJK Unified Ideograph
			    (U+4E00 through U+9FFF), an ASCII digit "0"-"9", or the "@" sign.

			    Args:
			        ch: The string to classify. The range checks are ordinary string
			            comparisons, so the result is only meaningful for a single
			            character.

			    Returns:
			        bool: True when *ch* falls in one of the accepted ranges.
			    """
			    return (
			        "\u4e00" <= ch <= "\u9fff"
			        or "\u0030" <= ch <= "\u0039"
			        or ch == "@"
			    )


			def isAllChinese(word: Union[List[Any], str]):
			word_lists = []
			table = str.maketrans('', '', string.punctuation)
			for i in word:
			cur = i.translate(table)
			cur = cur.replace(' ', '')
			cur = cur.replace('</s>', '')
			cur = cur.replace('<s>', '')
			cur = i.replace(" ", "")
			cur = cur.replace("</s>", "")
			cur = cur.replace("<s>", "")
			cur = cur.replace("<unk>", "")
			cur = cur.replace("<OOV>", "")
			word_lists.append(cur)

			if len(word_lists) == 0:
			@@ -31,19 +32,19 @@

			def isAllAlpha(word: Union[List[Any], str]):
			    """Return True if every element of *word* is a non-Chinese alphabetic token.

			    Each element has spaces and the special markers "</s>", "<s>", "<unk>"
			    and "<OOV>" removed before it is checked. An element passes when it is
			    a lone apostrophe, or when ``str.isalpha()`` is true and ``isChinese``
			    rejects it.

			    Args:
			        word: A list of string tokens, or a single string (which is
			            iterated character by character).

			    Returns:
			        bool: False for an empty *word* or as soon as one element fails,
			        True otherwise.
			    """
			    word_lists = []
			    for i in word:
			        cur = i.replace(" ", "")
			        for marker in ("</s>", "<s>", "<unk>", "<OOV>"):
			            cur = cur.replace(marker, "")
			        word_lists.append(cur)

			    if not word_lists:
			        return False

			    for ch in word_lists:
			        if not ch.isalpha():
			            # The apostrophe is the only non-alphabetic element tolerated.
			            if ch != "'":
			                return False
			        elif isChinese(ch):
			            # str.isalpha() is also true for CJK ideographs, so they are
			            # excluded explicitly.
			            return False

			    return True


			def _is_single_letter(token):
			    """Return True if *token* is exactly one ASCII letter."""
			    # bytes.isalpha() only recognises ASCII letters, so a one-character
			    # non-ASCII token (e.g. a CJK ideograph) is rejected here.
			    return len(token) == 1 and token.encode("utf-8").isalpha()


			def _continues_abbr(words, pos):
			    """Return True if words[pos] is followed by " " and another single letter."""
			    return (
			        pos + 2 < len(words)
			        and words[pos + 1] == " "
			        and _is_single_letter(words[pos + 2])
			    )


			def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
			    """Collapse runs of space-separated single letters into one abbreviation.

			    A run such as ``"a", " ", "b", " ", "c"`` is replaced by the single
			    upper-cased token ``"ABC"``. All other tokens, including spaces, are
			    copied through unchanged.

			    NOTE(review): part of the original scan loop was elided from the
			    reviewed excerpt. The run detection below is reconstructed from the
			    visible lines; confirm it against the complete file.

			    Args:
			        words: Token list in which words are separated by " " tokens.
			        time_stamp: Optional list of ``[begin, end]`` pairs, one per
			            non-space token of *words*, in order.

			    Returns:
			        The processed token list when *time_stamp* is None. Otherwise a
			        tuple ``(tokens, stamps)``, where *stamps* has one ``[begin, end]``
			        pair per non-space output token. A merged abbreviation spans from
			        the begin of its first letter to the end of its last letter.
			    """
			    words_size = len(words)

			    # Map the index of each abbreviation's first letter to its last letter.
			    abbr_spans = {}
			    num = 0
			    while num < words_size:
			        if _is_single_letter(words[num]) and _continues_abbr(words, num):
			            stop = num + 2
			            while _continues_abbr(words, stop):
			                stop += 2
			            abbr_spans[num] = stop
			            num = stop + 1
			        else:
			            num += 1

			    # ts_nums[i] is the time_stamp index for words[i]; spaces do not
			    # consume a timestamp.
			    ts_nums = []
			    ts_index = 0
			    for token in words:
			        ts_nums.append(ts_index)
			        if token != " ":
			            ts_index += 1

			    word_lists = []
			    ts_lists = []
			    num = 0
			    while num < words_size:
			        if num in abbr_spans:
			            stop = abbr_spans[num]
			            # Letters sit at every second position between num and stop.
			            word_lists.append("".join(words[num:stop + 1:2]).upper())
			            if time_stamp is not None:
			                begin = time_stamp[ts_nums[num]][0]
			                end = time_stamp[ts_nums[stop]][1]
			                ts_lists.append([begin, end])
			            num = stop + 1
			        else:
			            word_lists.append(words[num])
			            if time_stamp is not None and words[num] != " ":
			                begin = time_stamp[ts_nums[num]][0]
			                end = time_stamp[ts_nums[num]][1]
			                ts_lists.append([begin, end])
			            num += 1

			    if time_stamp is not None:
			        return word_lists, ts_lists
			    return word_lists


			def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
			    """Turn a decoded token sequence into a sentence and a word list.

			    Tokens may be ``str`` or UTF-8 ``bytes``. The special tokens "<s>",
			    "</s>", "<unk>" and "<OOV>" are dropped first. The remaining tokens
			    are handled in one of three modes:

			    * all Chinese (per ``isAllChinese``): each token is kept, with spaces
			      removed;
			    * all alphabetic (per ``isAllAlpha``): each token becomes a word
			      followed by a " " separator;
			    * mixed: Chinese tokens are appended directly, dropping the blank
			      left by a preceding alphabetic word; "@@" continuation pieces are
			      glued to the next piece; alphabetic tokens close a word; anything
			      else is passed through unchanged.

			    Spelled-out letter runs are then merged by ``abbr_dispose``.

			    Args:
			        words: Decoded tokens.
			        time_stamp: Optional list of ``[begin, end]`` pairs. It is indexed
			            by position in the token list after special tokens have been
			            dropped (assumes the caller supplies it aligned that way;
			            TODO confirm).

			    Returns:
			        ``(sentence, word_list)`` when *time_stamp* is None, where the
			        sentence is the tokens concatenated and stripped. Otherwise
			        ``(sentence, ts_list, word_list)``, where the sentence is the
			        words joined by single spaces.
			    """
			    middle_lists = []
			    word_lists = []
			    word_item = ""
			    ts_lists = []

			    # wash words lists
			    for i in words:
			        word = i if isinstance(i, str) else i.decode("utf-8")
			        if word in ["<s>", "</s>", "<unk>", "<OOV>"]:
			            continue
			        middle_lists.append(word)

			    # all chinese characters
			    if isAllChinese(middle_lists):
			        for ch in middle_lists:
			            word_lists.append(ch.replace(" ", ""))
			        if time_stamp is not None:
			            ts_lists = time_stamp

			    # all alpha characters
			    elif isAllAlpha(middle_lists):
			        # ts_flag is True when the next token starts a new word, so its
			        # begin time must be captured.
			        ts_flag = True
			        for i, ch in enumerate(middle_lists):
			            if ts_flag and time_stamp is not None:
			                begin = time_stamp[i][0]
			                end = time_stamp[i][1]
			            if "@@" in ch:
			                # Continuation piece: keep accumulating the current word.
			                word_item += ch.replace("@@", "")
			                if time_stamp is not None:
			                    ts_flag = False
			                    end = time_stamp[i][1]
			            else:
			                word_item += ch
			                word_lists.append(word_item)
			                word_lists.append(" ")
			                word_item = ""
			                if time_stamp is not None:
			                    ts_flag = True
			                    end = time_stamp[i][1]
			                    ts_lists.append([begin, end])
			                    begin = end

			    # mix characters
			    else:
			        # alpha_blank is True while the last element of word_lists is the
			        # blank that follows an alphabetic word.
			        alpha_blank = False
			        ts_flag = True
			        begin = -1
			        end = -1
			        for i, ch in enumerate(middle_lists):
			            if ts_flag and time_stamp is not None:
			                begin = time_stamp[i][0]
			                end = time_stamp[i][1]
			            if isAllChinese(ch):
			                if alpha_blank is True:
			                    # No blank between an alphabetic word and Chinese text.
			                    word_lists.pop()
			                word_lists.append(ch)
			                alpha_blank = False
			                if time_stamp is not None:
			                    ts_flag = True
			                    ts_lists.append([begin, end])
			                    begin = end
			            elif "@@" in ch:
			                word_item += ch.replace("@@", "")
			                alpha_blank = False
			                if time_stamp is not None:
			                    ts_flag = False
			                    end = time_stamp[i][1]
			            elif isAllAlpha(ch):
			                word_item += ch
			                word_lists.append(word_item)
			                word_lists.append(" ")
			                word_item = ""
			                alpha_blank = True
			                if time_stamp is not None:
			                    ts_flag = True
			                    end = time_stamp[i][1]
			                    ts_lists.append([begin, end])
			                    begin = end
			            else:
			                word_lists.append(ch)
			                # The last element is now this token, not a blank, so a
			                # following Chinese character must not pop it.
			                alpha_blank = False
			                if time_stamp is not None:
			                    # Keep one timestamp per emitted word so abbr_dispose
			                    # indexes ts_lists correctly.
			                    ts_lists.append([time_stamp[i][0], time_stamp[i][1]])

			    if time_stamp is not None:
			        word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
			        real_word_lists = [ch for ch in word_lists if ch != " "]
			        sentence = " ".join(real_word_lists).strip()
			        return sentence, ts_lists, real_word_lists

			    word_lists = abbr_dispose(word_lists)
			    real_word_lists = [ch for ch in word_lists if ch != " "]
			    sentence = "".join(word_lists).strip()
			    return sentence, real_word_lists


			def sentence_postprocess_sentencepiece(words):
			    """Rebuild words from SentencePiece-style pieces.

			    Tokens may be ``str`` or UTF-8 ``bytes``. The special tokens "<s>",
			    "</s>", "<unk>" and "<OOV>" are dropped. A piece containing
			    "\u2581" (U+2581) starts a new word; any other piece is appended to
			    the current word.

			    Args:
			        words: Decoded pieces.

			    Returns:
			        ``(sentence, word_list)``. *sentence* is the words joined by single
			        spaces exactly as decoded. *word_list* holds the same words with
			        "i", "i'm", "i've" and "i'll" capitalised.
			    """
			    special_tokens = ("<s>", "</s>", "<unk>", "<OOV>")
			    capitalized = {"i": "I", "i'm": "I'm", "i've": "I've", "i'll": "I'll"}

			    # wash words lists
			    middle_lists = []
			    for i in words:
			        word = i if isinstance(i, str) else i.decode("utf-8")
			        if word not in special_tokens:
			            middle_lists.append(word)

			    word_lists = []
			    word_item = ""
			    for i, ch in enumerate(middle_lists):
			        if "\u2581" in ch:
			            if i != 0:
			                # A new word begins: close the one collected so far.
			                word_lists.append(word_item)
			                word_lists.append(" ")
			            word_item = ch.replace("\u2581", "")
			        else:
			            word_item += ch
			    # Only flush a non-empty word, so empty input or a trailing lone
			    # "\u2581" does not add an empty entry.
			    if word_item:
			        word_lists.append(word_item)

			    real_word_lists = [
			        capitalized.get(ch, ch) for ch in word_lists if ch != " "
			    ]
			    # NOTE(review): the sentence is built from the un-capitalised tokens,
			    # so it can differ in case from real_word_lists - confirm this is
			    # intended.
			    sentence = "".join(word_lists)
			    return sentence, real_word_lists