python/FunASR-XL.git

			@@ -179,7 +179,7 @@
			langs = tuple(LANGUAGES.keys())[: self.num_languages]
			sot_sequence = [sot]
			if self.language is not None:
			if self.language == 'nospeech':
			if self.language == "nospeech":
			sot_sequence.append(self.no_speech)
			else:
			sot_sequence.append(sot + 1 + langs.index(self.language))
			@@ -291,9 +291,7 @@
			keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
			"""
			symbols = list('"#()*+/:;<=>@[\\]^_`{\|}~「」『』')
			symbols += (
			"<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
			)
			symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()

			# symbols that may be a single token or multiple tokens depending on the tokenizer.
			# In case they're multiple tokens, suppress the first token, which is safe because:
			@@ -437,7 +435,7 @@
			if language not in LANGUAGES:
			if language in TO_LANGUAGE_CODE:
			language = TO_LANGUAGE_CODE[language]
			elif language == 'nospeech':
			elif language == "nospeech":
			pass
			else:
			raise ValueError(f"Unsupported language: {language}")
			@@ -453,10 +451,6 @@
			if encoding_path is not None:
			encoding_name = encoding_path


			encoding = get_encoding(name=encoding_name, num_languages=num_languages, vocab_path=vocab_path)


			return Tokenizer(
			encoding=encoding, num_languages=num_languages, language=language, task=task
			)
			return Tokenizer(encoding=encoding, num_languages=num_languages, language=language, task=task)