From f43da18b5e097515c12438c70fc7918c29e5d0c8 Mon Sep 17 00:00:00 2001
From: lingji-yidong <75744976+lingji-yidong@users.noreply.github.com>
Date: Mon, 19 Aug 2024 13:36:59 +0800
Subject: [PATCH] fix start timestamp in sentence_info (#2024)
---
funasr/tokenizer/sentencepiece_tokenizer.py | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/funasr/tokenizer/sentencepiece_tokenizer.py b/funasr/tokenizer/sentencepiece_tokenizer.py
index ff4b3a2..0b47a9f 100644
--- a/funasr/tokenizer/sentencepiece_tokenizer.py
+++ b/funasr/tokenizer/sentencepiece_tokenizer.py
@@ -20,6 +20,7 @@
# "TypeError: can't pickle SwigPyObject objects",
# when giving it as argument of "multiprocessing.Process()".
self.sp = None
+ self._build_sentence_piece_processor()
def __repr__(self):
return f'{self.__class__.__name__}(model="{self.bpemodel}")'
@@ -38,10 +39,19 @@
self._build_sentence_piece_processor()
return self.sp.DecodePieces(list(tokens))
- def encode(self, line: str) -> List[int]:
+ def encode(self, line: str, **kwargs) -> List[int]:
self._build_sentence_piece_processor()
return self.sp.EncodeAsIds(line)
- def decode(self, line: List[int]):
+ def decode(self, line: List[int], **kwargs):
self._build_sentence_piece_processor()
return self.sp.DecodeIds(line)
+
+ def get_vocab_size(self):
+ return self.sp.GetPieceSize()
+
+ def ids2tokens(self, *args, **kwargs):
+ return self.decode(*args, **kwargs)
+
+ def tokens2ids(self, *args, **kwargs):
+ return self.encode(*args, **kwargs)
--
Gitblit v1.9.1