python/FunASR-XL.git

New file
			@@ -0,0 +1,245 @@
			# Copyright (c) Alibaba, Inc. and its affiliates.

			import string
			import logging
			from typing import Any, List, Union


			def isChinese(ch: str):
			    """Return True if ``ch`` counts as a "Chinese" symbol for post-processing.

			    A symbol qualifies when it lies in the CJK Unified Ideographs range
			    (U+4E00..U+9FFF), is an ASCII digit ('0'..'9'), or is exactly '@'.

			    The range checks are ordinary string comparisons, so ``ch`` is meant to
			    be a single character. A longer string is compared lexicographically as
			    a whole rather than character by character, and the empty string yields
			    False.
			    """
			    return (
			        '\u4e00' <= ch <= '\u9fff'
			        or '\u0030' <= ch <= '\u0039'
			        or ch == '@'
			    )


			def isAllChinese(word: Union[List[Any], str]):
			    """Return True if every element of ``word`` passes ``isChinese``.

			    ``word`` is iterated element by element: a list yields its tokens, a
			    string yields its characters. Before the check, spaces and the special
			    symbols '</s>', '<s>', '<unk>' and '<OOV>' are removed from each element.
			    An element that becomes empty after this cleaning makes the result
			    False, because ``isChinese('')`` is False.

			    Returns False for an empty ``word``.
			    """
			    cleaned = []
			    for item in word:
			        # Removal order matters: spaces go first so that e.g. '< s>' also
			        # collapses into a removable marker.
			        for marker in (' ', '</s>', '<s>', '<unk>', '<OOV>'):
			            item = item.replace(marker, '')
			        cleaned.append(item)

			    if not cleaned:
			        return False

			    return all(isChinese(ch) for ch in cleaned)


			def isAllAlpha(word: Union[List[Any], str]):
			    """Return True if every element of ``word`` is non-Chinese alphabetic text.

			    ``word`` is iterated element by element: a list yields its tokens, a
			    string yields its characters. Before the check, spaces and the special
			    symbols '</s>', '<s>', '<unk>' and '<OOV>' are removed from each element.

			    An element is accepted when it is a lone apostrophe, or when
			    ``str.isalpha()`` is True and ``isChinese`` is False (``str.isalpha()``
			    is also True for CJK ideographs, hence the second test).

			    Returns False for an empty ``word``.
			    """
			    cleaned = []
			    for item in word:
			        for marker in (' ', '</s>', '<s>', '<unk>', '<OOV>'):
			            item = item.replace(marker, '')
			        cleaned.append(item)

			    if not cleaned:
			        return False

			    for ch in cleaned:
			        if not ch.isalpha():
			            # The apostrophe is the only non-alphabetic element tolerated.
			            if ch != "'":
			                return False
			        elif isChinese(ch):
			            return False

			    return True


			# def abbr_dispose(words: List[Any]) -> List[Any]:
			def abbr_dispose(
			        words: List[Any],
			        time_stamp: Union[List[List], None] = None,
			) -> Union[List[Any], tuple]:
			    """Merge spelled-out abbreviations such as ``a b c`` into ``ABC``.

			    An abbreviation is a run of two or more single ASCII letters that are
			    separated from each other by exactly one ``' '`` entry of ``words``.
			    Each run is replaced by a single upper-cased word; every other entry,
			    including ``' '`` separators, is copied through unchanged.

			    Args:
			        words: Token list in which ``' '`` entries act as separators.
			        time_stamp: Optional ``[begin, end]`` pairs, one for each entry of
			            ``words`` that is not ``' '``, in the same order.

			    Returns:
			        The processed word list when ``time_stamp`` is None. Otherwise a
			        ``(word_lists, ts_lists)`` tuple; a merged abbreviation gets the
			        begin of its first letter and the end of its last letter.

			    Raises:
			        IndexError: If ``time_stamp`` has fewer entries than ``words`` has
			            non-``' '`` entries.
			    """
			    words_size = len(words)

			    # Pass 1: map the index of each abbreviation's first letter to the
			    # index of its last letter.
			    abbr_spans = {}
			    last_num = -1
			    for num in range(words_size):
			        if num <= last_num:
			            continue
			        if not (_is_single_ascii_letter(words[num])
			                and num + 2 < words_size
			                and words[num + 1] == ' '
			                and _is_single_ascii_letter(words[num + 2])):
			            continue
			        end_num = num + 2
			        # Keep extending while the pattern "' ' + single letter" repeats.
			        while (end_num + 2 < words_size
			               and words[end_num + 1] == ' '
			               and _is_single_ascii_letter(words[end_num + 2])):
			            end_num += 2
			        abbr_spans[num] = end_num
			        last_num = end_num

			    # Pass 2: timestamp index of every entry. A ' ' separator does not
			    # consume a timestamp; it shares the index of the entry that follows.
			    ts_nums = []
			    if time_stamp is not None:
			        ts_index = 0
			        for token in words:
			            ts_nums.append(ts_index)
			            if token != ' ':
			                ts_index += 1

			    # Pass 3: emit the output, collapsing each recorded span into one word.
			    word_lists = []
			    ts_lists = []
			    last_num = -1
			    for num in range(words_size):
			        if num <= last_num:
			            continue

			        end_num = abbr_spans.get(num)
			        if end_num is not None:
			            # Inside a span the letters sit on every second index.
			            letters = words[num:end_num + 1:2]
			            word_lists.append(''.join(letter.upper() for letter in letters))
			            if time_stamp is not None:
			                begin = time_stamp[ts_nums[num]][0]
			                end = time_stamp[ts_nums[end_num]][1]
			                ts_lists.append([begin, end])
			            last_num = end_num
			        else:
			            word_lists.append(words[num])
			            if time_stamp is not None and words[num] != ' ':
			                stamp = time_stamp[ts_nums[num]]
			                ts_lists.append([stamp[0], stamp[1]])

			    if time_stamp is not None:
			        return word_lists, ts_lists
			    return word_lists


			def _is_single_ascii_letter(token) -> bool:
			    """Return True if ``token`` is exactly one ASCII letter."""
			    # bytes.isalpha() only accepts ASCII letters, so any non-ASCII
			    # character (e.g. a CJK ideograph) is rejected after UTF-8 encoding.
			    return len(token) == 1 and token.encode('utf-8').isalpha()


			def sentence_postprocess(words: List[Any], time_stamp: Union[List[List], None] = None):
			    """Turn raw recognizer tokens into a sentence and a word list.

			    Processing steps:

			    1. Decode ``bytes`` tokens as UTF-8 and drop the special symbols
			       '<s>', '</s>', '<unk>' and '<OOV>'.
			    2. Depending on the remaining tokens:
			       * all Chinese (per ``isAllChinese``): keep the tokens with spaces
			         removed and use ``time_stamp`` as-is;
			       * all alphabetic (per ``isAllAlpha``): glue '@@' continuation pieces
			         onto the following token and follow every finished word with a
			         ``' '`` separator;
			       * mixed: apply the matching rule per token, dropping the ``' '``
			         separator between an alphabetic word and a following Chinese token.
			    3. Merge spelled-out abbreviations with ``abbr_dispose``.

			    Args:
			        words: Tokens as ``str`` or UTF-8 encoded ``bytes``.
			        time_stamp: Optional ``[begin, end]`` pairs indexed like the tokens
			            that remain after step 1.

			    Returns:
			        Without ``time_stamp``: ``(sentence, real_word_lists)`` where
			        ``sentence`` is the stripped concatenation of the words including
			        their ``' '`` separators.
			        With ``time_stamp``: ``(sentence, ts_lists, real_word_lists)`` where
			        ``sentence`` is the words joined by single spaces.
			        ``real_word_lists`` never contains the ``' '`` separators.
			    """
			    has_ts = time_stamp is not None
			    middle_lists = []
			    word_lists = []
			    word_item = ''  # accumulates '@@' continuation pieces of one word
			    ts_lists = []

			    # Wash the token list: decode bytes and drop special symbols.
			    for raw in words:
			        token = raw if isinstance(raw, str) else raw.decode('utf-8')
			        if token not in ('<s>', '</s>', '<unk>', '<OOV>'):
			            middle_lists.append(token)

			    if isAllChinese(middle_lists):
			        # All Chinese characters: one word and one timestamp per token.
			        word_lists = [ch.replace(' ', '') for ch in middle_lists]
			        if has_ts:
			            ts_lists = time_stamp

			    elif isAllAlpha(middle_lists):
			        # All alphabetic tokens. ts_flag is True when the next token
			        # starts a new word and therefore a new timestamp span.
			        ts_flag = True
			        for i, ch in enumerate(middle_lists):
			            if ts_flag and has_ts:
			                begin = time_stamp[i][0]
			                end = time_stamp[i][1]
			            if '@@' in ch:
			                word_item += ch.replace('@@', '')
			                if has_ts:
			                    ts_flag = False
			                    end = time_stamp[i][1]
			            else:
			                word_item += ch
			                word_lists.append(word_item)
			                word_lists.append(' ')
			                word_item = ''
			                if has_ts:
			                    ts_flag = True
			                    end = time_stamp[i][1]
			                    ts_lists.append([begin, end])

			    else:
			        # Mixed characters. alpha_blank is True right after an alphabetic
			        # word was emitted together with its trailing ' ' separator.
			        alpha_blank = False
			        ts_flag = True
			        begin = -1
			        end = -1
			        for i, ch in enumerate(middle_lists):
			            if ts_flag and has_ts:
			                begin = time_stamp[i][0]
			                end = time_stamp[i][1]
			            if isAllChinese(ch):
			                # Drop the separator between an alphabetic word and this
			                # token, but only if it is still the last element;
			                # otherwise a real word would be removed.
			                if alpha_blank and word_lists[-1] == ' ':
			                    word_lists.pop()
			                word_lists.append(ch)
			                alpha_blank = False
			                if has_ts:
			                    ts_flag = True
			                    ts_lists.append([begin, end])
			            elif '@@' in ch:
			                word_item += ch.replace('@@', '')
			                alpha_blank = False
			                if has_ts:
			                    ts_flag = False
			                    end = time_stamp[i][1]
			            elif isAllAlpha(ch):
			                word_item += ch
			                word_lists.append(word_item)
			                word_lists.append(' ')
			                word_item = ''
			                alpha_blank = True
			                if has_ts:
			                    ts_flag = True
			                    end = time_stamp[i][1]
			                    ts_lists.append([begin, end])
			            else:
			                # Any other token, e.g. 'a1' or punctuation.
			                word_lists.append(ch)
			                # abbr_dispose expects one timestamp per non-' ' word, so
			                # this word needs its own entry to keep both lists aligned.
			                if has_ts and ch != ' ':
			                    ts_lists.append([time_stamp[i][0], time_stamp[i][1]])

			    if has_ts:
			        word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
			    else:
			        word_lists = abbr_dispose(word_lists)

			    real_word_lists = [item for item in word_lists if item != ' ']

			    if has_ts:
			        sentence = ' '.join(real_word_lists).strip()
			        return sentence, ts_lists, real_word_lists

			    sentence = ''.join(word_lists).strip()
			    return sentence, real_word_lists