From a0f03bd2a87d97d47a1636bbe6f0855a43160331 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: 星期三, 15 五月 2024 19:48:50 +0800
Subject: [PATCH] Dev gzf deepspeed (#1732)
---
funasr/tokenizer/sentencepiece_tokenizer.py | 8 ++++++--
1 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/funasr/tokenizer/sentencepiece_tokenizer.py b/funasr/tokenizer/sentencepiece_tokenizer.py
index ff4b3a2..1be1b81 100644
--- a/funasr/tokenizer/sentencepiece_tokenizer.py
+++ b/funasr/tokenizer/sentencepiece_tokenizer.py
@@ -20,6 +20,7 @@
# "TypeError: can't pickle SwigPyObject objects",
# when giving it as argument of "multiprocessing.Process()".
self.sp = None
+ self._build_sentence_piece_processor()
def __repr__(self):
return f'{self.__class__.__name__}(model="{self.bpemodel}")'
@@ -38,10 +39,13 @@
self._build_sentence_piece_processor()
return self.sp.DecodePieces(list(tokens))
- def encode(self, line: str) -> List[int]:
+ def encode(self, line: str, **kwargs) -> List[int]:
self._build_sentence_piece_processor()
return self.sp.EncodeAsIds(line)
- def decode(self, line: List[int]):
+ def decode(self, line: List[int], **kwargs):
self._build_sentence_piece_processor()
return self.sp.DecodeIds(line)
+
+ def get_vocab_size(self):
+ return self.sp.GetPieceSize()
--
Gitblit v1.9.1