python/FunASR-XL.git

			@@ -6,7 +6,7 @@


			def isChinese(ch: str):
			    """Return True when *ch* is treated as a Chinese-side character.

			    A character qualifies when it is one of:

			    * a code point in the range U+4E00..U+9FFF,
			    * an ASCII digit ``0``-``9``,
			    * the literal ``'@'``.

			    Args:
			        ch: The string to classify. The range checks are ordinary string
			            comparisons, so a single character is the meaningful input.

			    Returns:
			        bool: True if any of the three conditions holds, otherwise False.
			    """
			    # Only the newer condition (which also accepts '@') is kept; the older
			    # condition without '@' was a strict subset of it.
			    return (
			        '\u4e00' <= ch <= '\u9fff'
			        or '0' <= ch <= '9'
			        or ch == '@'
			    )

			@@ -17,6 +17,8 @@
			cur = i.replace(' ', '')
			cur = cur.replace('</s>', '')
			cur = cur.replace('<s>', '')
			cur = cur.replace('<unk>', '')
			cur = cur.replace('<OOV>', '')
			word_lists.append(cur)

			if len(word_lists) == 0:
			@@ -34,6 +36,8 @@
			cur = i.replace(' ', '')
			cur = cur.replace('</s>', '')
			cur = cur.replace('<s>', '')
			cur = cur.replace('<unk>', '')
			cur = cur.replace('<OOV>', '')
			word_lists.append(cur)

			if len(word_lists) == 0:
			@@ -102,17 +106,18 @@
			if num in abbr_begin:
			if time_stamp is not None:
			begin = time_stamp[ts_nums[num]][0]
			word_lists.append(words[num].upper())
			abbr_word = words[num].upper()
			num += 1
			while num < words_size:
			if num in abbr_end:
			word_lists.append(words[num].upper())
			abbr_word += words[num].upper()
			last_num = num
			break
			else:
			if words[num].encode('utf-8').isalpha():
			word_lists.append(words[num].upper())
			abbr_word += words[num].upper()
			num += 1
			word_lists.append(abbr_word)
			if time_stamp is not None:
			end = time_stamp[ts_nums[num]][1]
			ts_lists.append([begin, end])
			@@ -144,7 +149,7 @@
			else:
			word = i.decode('utf-8')

			if word in ['<s>', '</s>', '<unk>']:
			if word in ['<s>', '</s>', '<unk>', '<OOV>']:
			continue
			else:
			middle_lists.append(word)
			@@ -220,7 +225,7 @@
			ts_lists.append([begin, end])
			begin = end
			else:
			raise ValueError('invalid character: {}'.format(ch))
			word_lists.append(ch)

			if time_stamp is not None:
			word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
			@@ -232,5 +237,60 @@
			return sentence, ts_lists, real_word_lists
			else:
			word_lists = abbr_dispose(word_lists)
			real_word_lists = []
			for ch in word_lists:
			if ch != ' ':
			real_word_lists.append(ch)
			sentence = ''.join(word_lists).strip()
			return sentence
			return sentence, real_word_lists

			def sentence_postprocess_sentencepiece(words):
			    """Join sub-word pieces into a sentence and a list of whole words.

			    Pieces containing U+2581 (the SentencePiece word-boundary marker) start
			    a new word; every other piece is appended to the word currently being
			    built. Words are separated by a single space in the returned sentence.

			    Args:
			        words: Iterable of pieces. Each piece is either ``str`` or a
			            UTF-8 encoded bytes-like object exposing ``decode``.

			    Returns:
			        tuple: ``(sentence, real_word_lists)`` where ``sentence`` is the
			        joined text and ``real_word_lists`` holds the individual words
			        (no separators, no empty strings), with a standalone ``i``,
			        ``i'm``, ``i've`` and ``i'll`` capitalised.
			    """
			    special_tokens = ('<s>', '</s>', '<unk>', '<OOV>')
			    marker = '\u2581'
			    capitalised = {"i": "I", "i'm": "I'm", "i've": "I've", "i'll": "I'll"}

			    # Normalise every piece to str and drop the special tokens.
			    pieces = []
			    for raw in words:
			        piece = raw if isinstance(raw, str) else raw.decode('utf-8')
			        if piece not in special_tokens:
			            pieces.append(piece)

			    # Merge pieces into words; a marker piece closes the previous word.
			    word_lists = []
			    word_item = ''
			    for index, piece in enumerate(pieces):
			        if marker in piece:
			            # The very first piece has no previous word to flush.
			            if index != 0:
			                word_lists.append(word_item)
			                word_lists.append(' ')
			            word_item = piece.replace(marker, '')
			        else:
			            word_item += piece
			    # Flush the last word only if there is one, so empty input does not
			    # produce a spurious empty word.
			    if word_item:
			        word_lists.append(word_item)

			    # Separators and empty fragments are not words.
			    real_word_lists = [
			        capitalised.get(item, item)
			        for item in word_lists
			        if item and item != ' '
			    ]

			    # NOTE(review): the sentence is joined from the un-capitalised list,
			    # so "i" stays lowercase here while real_word_lists carries "I".
			    # Confirm with callers before unifying the two.
			    sentence = ''.join(word_lists)
			    return sentence, real_word_lists