python/FunASR-XL.git

			@@ -498,6 +498,7 @@
			):
			ncpu = kwargs.get("ncpu", 1)
			torch.set_num_threads(ncpu)
			language = kwargs.get("model_lang", None)

			if word_lm_train_config is not None:
			raise NotImplementedError("Word LM is not implemented")
			@@ -704,10 +705,13 @@
			text, token, token_int = result[0], result[1], result[2]
			time_stamp = result[4] if len(result[4]) > 0 else None

			if use_timestamp and time_stamp is not None and len(time_stamp):
			postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
			if language == "en-bpe":
			postprocessed_result = postprocess_utils.sentence_postprocess_sentencepiece(token)
			else:
			postprocessed_result = postprocess_utils.sentence_postprocess(token)
			if use_timestamp and time_stamp is not None and len(time_stamp):
			postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
			else:
			postprocessed_result = postprocess_utils.sentence_postprocess(token)
			text_postprocessed = ""
			time_stamp_postprocessed = ""
			text_postprocessed_punc = postprocessed_result

			@@ -242,4 +242,55 @@
			if ch != ' ':
			real_word_lists.append(ch)
			sentence = ''.join(word_lists).strip()
			return sentence, real_word_lists
			return sentence, real_word_lists

			def sentence_postprocess_sentencepiece(words):
			    """Join SentencePiece-style tokens into a sentence and a word list.

			    Tokens containing the '\u2581' marker start a new word; any other
			    token is appended to the word currently being built.

			    Args:
			        words: Iterable of tokens. Each token is either a ``str`` or an
			            object with ``decode('utf-8')`` (e.g. ``bytes``).

			    Returns:
			        A ``(sentence, real_word_lists)`` tuple:

			        * ``sentence`` -- the words joined with single spaces, exactly
			          as assembled from the tokens (no capitalisation applied).
			        * ``real_word_lists`` -- the non-empty words, with the pronoun
			          forms "i", "i'm", "i've" and "i'll" capitalised.
			    """
			    special_tokens = ('<s>', '</s>', '<unk>', '<OOV>')
			    # Only these exact lower-case forms are capitalised.
			    capitalised = {
			        "i": "I",
			        "i'm": "I'm",
			        "i've": "I've",
			        "i'll": "I'll",
			    }

			    # Normalise every token to str and drop the special symbols.
			    pieces = []
			    for item in words:
			        piece = item if isinstance(item, str) else item.decode('utf-8')
			        if piece not in special_tokens:
			            pieces.append(piece)

			    # Merge sub-word pieces into words, separated by ' ' entries.
			    word_lists = []
			    word_item = ''
			    for index, piece in enumerate(pieces):
			        if '\u2581' in piece:
			            if index != 0:
			                # A marker after the first piece closes the previous word.
			                word_lists.append(word_item)
			                word_lists.append(' ')
			            word_item = piece.replace('\u2581', '')
			        else:
			            word_item += piece
			    # Flush the last word only if there is one. The previous guard
			    # (`is not None`) was always true and produced an empty "word" for
			    # empty input or a trailing bare marker.
			    if word_item:
			        word_lists.append(word_item)

			    # Empty strings (e.g. from a bare marker piece) are not words; they
			    # do not affect the joined sentence, so only the word list skips them.
			    real_word_lists = [
			        capitalised.get(entry, entry)
			        for entry in word_lists
			        if entry and entry != ' '
			    ]

			    # NOTE(review): the sentence is built from the uncapitalised pieces,
			    # so it keeps "i" while real_word_lists has "I" -- this matches the
			    # previous behaviour; confirm whether the sentence should agree.
			    sentence = ''.join(word_lists)
			    return sentence, real_word_lists

	funasr/bin/asr_inference_launch.py	10 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/utils/postprocess_utils.py	53 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史