游雁
2024-06-24 1596f6f414f6f41da66506debb1dff19fffeb3ec
funasr/models/sense_voice/whisper_lib/tokenizer.py
@@ -179,7 +179,7 @@
        langs = tuple(LANGUAGES.keys())[: self.num_languages]
        sot_sequence = [sot]
        if self.language is not None:
            if self.language == 'nospeech':
            if self.language == "nospeech":
                sot_sequence.append(self.no_speech)
            else:
                sot_sequence.append(sot + 1 + langs.index(self.language))
@@ -204,7 +204,7 @@
        This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
        """
        return self.encoding.decode(token_ids, **kwargs)
    def get_vocab_size(self) -> int:
        return self.encoding.n_vocab
@@ -291,9 +291,7 @@
        keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
        """
        symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
        symbols += (
            "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
        )
        symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
        # symbols that may be a single token or multiple tokens depending on the tokenizer.
        # In case they're multiple tokens, suppress the first token, which is safe because:
@@ -368,7 +366,7 @@
@lru_cache(maxsize=None)
def get_encoding(name: str = "gpt2", num_languages: int = 99, vocab_path:str=None):
def get_encoding(name: str = "gpt2", num_languages: int = 99, vocab_path: str = None):
    if vocab_path is None:
        vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
@@ -379,7 +377,7 @@
    n_vocab = len(ranks)
    special_tokens = {}
    if False: #name == "gpt2" or name == "multilingual":
    if False:  # name == "gpt2" or name == "multilingual":
        specials = [
            "<|endoftext|>",
            "<|startoftranscript|>",
@@ -437,7 +435,7 @@
        if language not in LANGUAGES:
            if language in TO_LANGUAGE_CODE:
                language = TO_LANGUAGE_CODE[language]
            elif language == 'nospeech':
            elif language == "nospeech":
                pass
            else:
                raise ValueError(f"Unsupported language: {language}")
@@ -453,10 +451,6 @@
    if encoding_path is not None:
        encoding_name = encoding_path
    encoding = get_encoding(name=encoding_name, num_languages=num_languages, vocab_path=vocab_path)
    return Tokenizer(
        encoding=encoding, num_languages=num_languages, language=language, task=task
    )
    return Tokenizer(encoding=encoding, num_languages=num_languages, language=language, task=task)