| | |
langs = tuple(LANGUAGES.keys())[: self.num_languages]
sot_sequence = [sot]
if self.language is not None:
    # "nospeech" is not a LANGUAGES key, so it must not go through
    # langs.index() (that would raise ValueError); it gets its dedicated
    # no-speech token instead. Every other language maps to the token
    # right after <|startoftranscript|> offset by its LANGUAGES position.
    if self.language == "nospeech":
        sot_sequence.append(self.no_speech)
    else:
        sot_sequence.append(sot + 1 + langs.index(self.language))
if self.task is not None:
    task_token: int = transcribe if self.task == "transcribe" else translate
    sot_sequence.append(task_token)
| | |
| | | This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>". |
| | | """ |
| | | return self.encoding.decode(token_ids, **kwargs) |
| | | |
| | | |
def get_vocab_size(self) -> int:
    """Total number of tokens in the wrapped encoding's vocabulary."""
    enc = self.encoding
    return enc.n_vocab
| | | |
| | |
| | | keeping basic punctuations like commas, periods, question marks, exclamation points, etc. |
| | | """ |
# Punctuation-like symbols to suppress: individual characters first, then
# multi-character sequences. NOTE: the original appended the split() result
# twice (an unmerged duplicate), so every multi-char symbol appeared twice
# in the list; the += is now applied exactly once.
symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
symbols += (
    "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
)
| | | |
| | | # symbols that may be a single token or multiple tokens depending on the tokenizer. |
| | | # In case they're multiple tokens, suppress the first token, which is safe because: |
| | |
| | | |
| | | |
@lru_cache(maxsize=None)
def get_encoding(name: str = "gpt2", num_languages: int = 99, vocab_path: str = None):
    """Build (and cache) the tiktoken encoding for the given vocab name.

    The original had two consecutive ``def`` lines for this function (an
    unmerged duplicate, which is a syntax error); the PEP8-formatted
    signature is kept. When *vocab_path* is not given, the vocabulary file
    is resolved relative to this module under ``assets/<name>.tiktoken``.
    """
    if vocab_path is None:
        vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
| | | |
| | |
| | | n_vocab = len(ranks) |
| | | special_tokens = {} |
| | | |
| | | if False: #name == "gpt2" or name == "multilingual": |
| | | if False: # name == "gpt2" or name == "multilingual": |
| | | specials = [ |
| | | "<|endoftext|>", |
| | | "<|startoftranscript|>", |
| | |
# Normalize the requested language: full names (e.g. "english") are mapped
# to their language codes; anything else that is not a LANGUAGES key is
# rejected.
if language not in LANGUAGES:
    if language in TO_LANGUAGE_CODE:
        # Full language name -> code (e.g. "english" -> "en").
        language = TO_LANGUAGE_CODE[language]
    elif language == "nospeech":
        # "nospeech" is deliberately accepted even though it is not a
        # LANGUAGES key; it is handled specially downstream.
        pass
    else:
        raise ValueError(f"Unsupported language: {language}")
| | | |
| | |
# NOTE(review): an explicit encoding_path overrides whatever encoding_name
# was derived above — presumably a custom vocab name/path accepted by
# get_encoding; verify against callers.
if encoding_path is not None:
    encoding_name = encoding_path

encoding = get_encoding(name=encoding_name, num_languages=num_languages, vocab_path=vocab_path)
| | | |
| | | |
| | | return Tokenizer( |
| | | encoding=encoding, num_languages=num_languages, language=language, task=task |
| | | ) |
| | | return Tokenizer(encoding=encoding, num_languages=num_languages, language=language, task=task) |