python/FunASR-XL.git

			@@ -1,23 +1,24 @@
			# Copyright (c) Alibaba, Inc. and its affiliates.

			import string
			import logging
			from typing import Any, List, Union


			def isChinese(ch: str):
			    """Return True when ``ch`` is treated as a Chinese-side token.

			    A token qualifies when it compares inside the CJK Unified Ideographs
			    range (U+4E00..U+9FFF), inside the ASCII digit range ('0'..'9'), or
			    when it is exactly '@'.

			    The range checks are plain string comparisons, so a multi-character
			    string is compared lexicographically rather than per character.
			    An empty string yields False.
			    """
			    return (
			        '\u4e00' <= ch <= '\u9fff'
			        or '\u0030' <= ch <= '\u0039'
			        or ch == '@'
			    )


			def isAllChinese(word: Union[List[Any], str]):
			# NOTE(review): flattened diff view - lines from before and after the change appear
			# together, indentation is lost, and the function continues past what is visible here.
			word_lists = []
			# Deletion table for ASCII punctuation; only the translate() line below uses it.
			table = str.maketrans('', '', string.punctuation)
			for i in word:
			# Variant 1: strip punctuation via the table, then drop spaces.
			cur = i.translate(table)
			cur = cur.replace(' ', '')
			# Variant 2: restarts from `i` and drops spaces only. As listed it overwrites the
			# result of variant 1 - presumably the post-change replacement for the two lines above.
			cur = i.replace(' ', '')
			# Remove special-token markers from the cleaned element.
			cur = cur.replace('</s>', '')
			cur = cur.replace('<s>', '')
			cur = cur.replace('<unk>', '')
			cur = cur.replace('<OOV>', '')
			word_lists.append(cur)

			# Empty-input guard; its body lies beyond the visible part of the diff.
			if len(word_lists) == 0:
			@@ -31,19 +32,19 @@

			def isAllAlpha(word: Union[List[Any], str]):
			    """Return True when every element of ``word`` is alphabetic and not Chinese.

			    Each element is cleaned first: spaces and the special tokens ``</s>``,
			    ``<s>``, ``<unk>`` and ``<OOV>`` are removed. Punctuation is kept, so
			    an element containing punctuation is rejected, with one exception: an
			    element that is exactly a single apostrophe is tolerated.

			    Args:
			        word: A list of string tokens, or a single string (iterated
			            character by character).

			    Returns:
			        False for empty input, for any cleaned element that is not purely
			        alphabetic (other than a lone apostrophe), or for any alphabetic
			        element that ``isChinese`` accepts; True otherwise.
			    """
			    word_lists = []
			    for i in word:
			        cur = i.replace(' ', '')
			        for marker in ('</s>', '<s>', '<unk>', '<OOV>'):
			            cur = cur.replace(marker, '')
			        word_lists.append(cur)

			    if not word_lists:
			        return False

			    for ch in word_lists:
			        if not ch.isalpha():
			            # A lone apostrophe is the only non-alphabetic element allowed.
			            if ch != "'":
			                return False
			        elif isChinese(ch):
			            # str.isalpha() is also true for CJK ideographs; exclude them.
			            return False

			    return True


			# NOTE(review): flattened diff view of abbr_dispose - lines from before and after the
			# change appear together, indentation is lost, and the hunk marker further down hides
			# part of the body (presumably where abbr_begin / abbr_end get filled - not visible here).
			# The next line is the pre-change signature; the post-change one follows the commented copy.
			def abbr_dispose(words: List[Any]) -> List[Any]:
			# def abbr_dispose(words: List[Any]) -> List[Any]:
			def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
			words_size = len(words)
			# Output tokens.
			word_lists = []
			# Index lists consulted below through `num in abbr_begin` / `num in abbr_end`.
			abbr_begin = []
			abbr_end = []
			last_num = -1
			# Output time stamps, and a per-word index into time_stamp (filled further down).
			ts_lists = []
			ts_nums = []
			ts_index = 0
			for num in range(words_size):
			if num <= last_num:
			continue
			@@ -87,30 +92,159 @@
			else:
			break

			# Build ts_nums with one entry per element of words: a ' ' element records the current
			# ts_index without advancing it, so blanks do not consume a time-stamp slot.
			for num in range(words_size):
			if words[num] == ' ':
			ts_nums.append(ts_index)
			else:
			ts_nums.append(ts_index)
			ts_index += 1
			last_num = -1
			# Second pass: emit words, merging each abbr_begin..abbr_end run into one upper-cased word.
			for num in range(words_size):
			# Skip positions already consumed by a merged abbreviation.
			if num <= last_num:
			continue

			if num in abbr_begin:
			# Presumably a pre-change line: appended each upper-cased piece separately.
			word_lists.append(words[num].upper())
			if time_stamp is not None:
			begin = time_stamp[ts_nums[num]][0]
			abbr_word = words[num].upper()
			num += 1
			# Scan forward until the matching abbr_end index, accumulating upper-cased pieces.
			while num < words_size:
			if num in abbr_end:
			# Presumably a pre-change line (see note above).
			word_lists.append(words[num].upper())
			abbr_word += words[num].upper()
			last_num = num
			break
			else:
			# bytes.isalpha() is true only for ASCII letters, so blanks and non-ASCII text
			# between the two ends are left out of the merged word.
			if words[num].encode('utf-8').isalpha():
			# Presumably a pre-change line (see note above).
			word_lists.append(words[num].upper())
			abbr_word += words[num].upper()
			num += 1
			word_lists.append(abbr_word)
			if time_stamp is not None:
			# NOTE(review): if the scan above ends without reaching an abbr_end index, num equals
			# words_size here and ts_nums[num] is out of range - confirm every begin has an end.
			end = time_stamp[ts_nums[num]][1]
			ts_lists.append([begin, end])
			else:
			# Ordinary word: copied through; blanks get no time-stamp entry.
			word_lists.append(words[num])
			if time_stamp is not None and words[num] != ' ':
			begin = time_stamp[ts_nums[num]][0]
			end = time_stamp[ts_nums[num]][1]
			ts_lists.append([begin, end])
			begin = end

			# Presumably the pre-change return, superseded by the conditional return below.
			return word_lists
			# With time stamps the result is a (word_lists, ts_lists) tuple, otherwise only word_lists.
			# NOTE(review): the `-> List[Any]` annotation does not describe the tuple form.
			if time_stamp is not None:
			return word_lists, ts_lists
			else:
			return word_lists


			def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
			    """Merge decoded tokens into a sentence, optionally carrying time stamps.

			    Tokens may be ``str`` or UTF-8 encoded ``bytes``. The special tokens
			    ``<s>``, ``</s>``, ``<unk>`` and ``<OOV>`` are dropped. A token that
			    contains ``@@`` is a word fragment continued by the next token: it is
			    buffered (with ``@@`` removed) until a token without ``@@`` closes the
			    word. A trailing fragment that is never closed is not emitted.

			    Args:
			        words: Decoded tokens.
			        time_stamp: Optional list of ``[begin, end]`` pairs, indexed by
			            token position after the special tokens have been removed.

			    Returns:
			        Without ``time_stamp``: ``(sentence, real_word_lists)`` where
			        ``sentence`` is the concatenation of the processed word list
			        (including its blank separators), stripped.
			        With ``time_stamp``: ``(sentence, ts_lists, real_word_lists)``
			        where ``sentence`` is ``real_word_lists`` joined by single spaces.
			        ``real_word_lists`` is the processed word list without the blank
			        separator entries.
			    """
			    special_tokens = ('<s>', '</s>', '<unk>', '<OOV>')
			    middle_lists = []
			    word_lists = []
			    word_item = ''
			    ts_lists = []

			    # Wash the token list: decode bytes and drop special tokens.
			    for i in words:
			        word = i if isinstance(i, str) else i.decode('utf-8')
			        if word not in special_tokens:
			            middle_lists.append(word)

			    # All Chinese characters: one output word per token, time stamps unchanged.
			    if isAllChinese(middle_lists):
			        word_lists.extend(ch.replace(' ', '') for ch in middle_lists)
			        if time_stamp is not None:
			            ts_lists = time_stamp

			    # All alphabetic tokens: join '@@' fragments, separate words by ' '.
			    elif isAllAlpha(middle_lists):
			        # ts_flag is True when the next token starts a new word, i.e. when
			        # `begin` must be re-read from that token's time stamp.
			        ts_flag = True
			        for i, ch in enumerate(middle_lists):
			            if ts_flag and time_stamp is not None:
			                begin = time_stamp[i][0]
			                end = time_stamp[i][1]
			            if '@@' in ch:
			                word_item += ch.replace('@@', '')
			                if time_stamp is not None:
			                    ts_flag = False
			                    end = time_stamp[i][1]
			            else:
			                word_item += ch
			                word_lists.append(word_item)
			                word_lists.append(' ')
			                word_item = ''
			                if time_stamp is not None:
			                    ts_flag = True
			                    end = time_stamp[i][1]
			                    ts_lists.append([begin, end])
			                    begin = end

			    # Mixed Chinese / alphabetic tokens.
			    else:
			        # alpha_blank is True when the last appended entry is the ' ' that
			        # follows an alphabetic word; a Chinese token right after it
			        # removes that blank again.
			        alpha_blank = False
			        ts_flag = True
			        begin = -1
			        end = -1
			        for i, ch in enumerate(middle_lists):
			            if ts_flag and time_stamp is not None:
			                begin = time_stamp[i][0]
			                end = time_stamp[i][1]
			            if isAllChinese(ch):
			                if alpha_blank is True:
			                    word_lists.pop()
			                word_lists.append(ch)
			                alpha_blank = False
			                if time_stamp is not None:
			                    ts_flag = True
			                    ts_lists.append([begin, end])
			                    begin = end
			            elif '@@' in ch:
			                word_item += ch.replace('@@', '')
			                alpha_blank = False
			                if time_stamp is not None:
			                    ts_flag = False
			                    end = time_stamp[i][1]
			            elif isAllAlpha(ch):
			                word_item += ch
			                word_lists.append(word_item)
			                word_lists.append(' ')
			                word_item = ''
			                alpha_blank = True
			                if time_stamp is not None:
			                    ts_flag = True
			                    end = time_stamp[i][1]
			                    ts_lists.append([begin, end])
			                    begin = end
			            else:
			                # Anything else is passed through verbatim; no time stamp
			                # entry is recorded for it.
			                word_lists.append(ch)

			    if time_stamp is not None:
			        word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
			        real_word_lists = [ch for ch in word_lists if ch != ' ']
			        sentence = ' '.join(real_word_lists).strip()
			        return sentence, ts_lists, real_word_lists

			    word_lists = abbr_dispose(word_lists)
			    real_word_lists = [ch for ch in word_lists if ch != ' ']
			    sentence = ''.join(word_lists).strip()
			    return sentence, real_word_lists

			# NOTE(review): flattened diff view - lines from before and after the change appear
			# together, indentation is lost, and the hunk marker below hides part of the body
			# (presumably the start of the token-washing loop - not visible here).
			def sentence_postprocess_sentencepiece(words):
			middle_lists = []
			word_lists = []
			word_item = ''
			@@ -123,52 +257,40 @@
			else:
			word = i.decode('utf-8')

			# Special-token filter: the first test looks like the pre-change form, the second
			# (which also drops '<OOV>') like its post-change replacement.
			if word in ['<s>', '</s>', '<unk>']:
			if word in ['<s>', '</s>', '<unk>', '<OOV>']:
			continue
			else:
			middle_lists.append(word)

			# NOTE(review): from here down to `return sentence` is '@@'-fragment handling that
			# presumably belongs to the pre-change side of the diff; the '\u2581'-based loop after
			# it looks like the post-change body of this function. Confirm against the real file.
			# all chinese characters
			if isAllChinese(middle_lists):
			for ch in middle_lists:
			word_lists.append(ch.replace(' ', ''))

			# all alpha characters
			elif isAllAlpha(middle_lists):
			for ch in middle_lists:
			word = ''
			if '@@' in ch:
			word = ch.replace('@@', '')
			word_item += word
			else:
			word_item += ch
			word_lists.append(word_item)
			word_lists.append(' ')
			word_item = ''

			# mix characters
			else:
			alpha_blank = False
			for ch in middle_lists:
			word = ''
			if isAllChinese(ch):
			if alpha_blank is True:
			word_lists.pop()
			word_lists.append(ch)
			alpha_blank = False
			elif '@@' in ch:
			word = ch.replace('@@', '')
			word_item += word
			alpha_blank = False
			elif isAllAlpha(ch):
			word_item += ch
			word_lists.append(word_item)
			word_lists.append(' ')
			word_item = ''
			alpha_blank = True
			else:
			raise ValueError('invalid character: {}'.format(ch))

			word_lists = abbr_dispose(word_lists)
			sentence = ''.join(word_lists).strip()
			return sentence
			# Word assembly keyed on '\u2581': a token containing it starts a new word, any other
			# token is appended to the word being built.
			for i, ch in enumerate(middle_lists):
			word = ''
			# First token: nothing to flush yet, just start the buffer.
			if '\u2581' in ch and i == 0:
			word_item = ''
			word = ch.replace('\u2581', '')
			word_item += word
			# Later word start: flush the buffered word plus a ' ' separator, then restart.
			elif '\u2581' in ch and i != 0:
			word_lists.append(word_item)
			word_lists.append(' ')
			word_item = ''
			word = ch.replace('\u2581', '')
			word_item += word
			else:
			word_item += ch
			# NOTE(review): in the visible code word_item is always a str, so this test is always
			# true and an empty buffer is appended as '' as well.
			if word_item is not None:
			word_lists.append(word_item)
			#word_lists = abbr_dispose(word_lists)
			# Collect the non-blank entries, capitalising the pronoun forms listed below.
			real_word_lists = []
			for ch in word_lists:
			if ch != ' ':
			if ch == "i":
			ch = ch.replace("i", "I")
			elif ch == "i'm":
			ch = ch.replace("i'm", "I'm")
			elif ch == "i've":
			ch = ch.replace("i've", "I've")
			elif ch == "i'll":
			ch = ch.replace("i'll", "I'll")
			real_word_lists.append(ch)
			# NOTE(review): sentence is joined from word_lists, so the capitalisation applied above
			# reaches real_word_lists only, not the returned sentence.
			sentence = ''.join(word_lists)
			return sentence, real_word_lists