From d19f48e17478be273584853568ac101c994c37e5 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Mon, 08 Apr 2024 18:51:53 +0800
Subject: [PATCH] Dev gzf exp (#1593)
---
funasr/models/sense_voice/whisper_lib/tokenizer.py | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/funasr/models/sense_voice/whisper_lib/tokenizer.py b/funasr/models/sense_voice/whisper_lib/tokenizer.py
index e941fb2..463ce83 100644
--- a/funasr/models/sense_voice/whisper_lib/tokenizer.py
+++ b/funasr/models/sense_voice/whisper_lib/tokenizer.py
@@ -363,8 +363,10 @@
@lru_cache(maxsize=None)
-def get_encoding(name: str = "gpt2", num_languages: int = 99):
- vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
+def get_encoding(name: str = "gpt2", num_languages: int = 99, vocab_path:str=None):
+ if vocab_path is None:
+ vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
+
ranks = {
base64.b64decode(token): int(rank)
for token, rank in (line.split() for line in open(vocab_path) if line)
@@ -423,6 +425,7 @@
language: Optional[str] = None,
task: Optional[str] = None, # Literal["transcribe", "translate", None]
encoding_path: Optional[str] = None,
+ vocab_path: Optional[str] = None,
) -> Tokenizer:
if language is not None:
language = language.lower()
@@ -443,7 +446,9 @@
if encoding_path is not None:
encoding_name = encoding_path
- encoding = get_encoding(name=encoding_name, num_languages=num_languages)
+
+ encoding = get_encoding(name=encoding_name, num_languages=num_languages, vocab_path=vocab_path)
+
return Tokenizer(
encoding=encoding, num_languages=num_languages, language=language, task=task
--
Gitblit v1.9.1