From dfcc5d47587d3e793cbfec2e9509c0e9a9e1732c Mon Sep 17 00:00:00 2001
From: Dogvane Huang <dogvane@gmail.com>
Date: Tue, 02 Jul 2024 12:24:13 +0800
Subject: [PATCH] fix c# demo project to new onnx model files (#1689)
---
funasr/tokenizer/sentencepiece_tokenizer.py | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/funasr/tokenizer/sentencepiece_tokenizer.py b/funasr/tokenizer/sentencepiece_tokenizer.py
index ff4b3a2..0b47a9f 100644
--- a/funasr/tokenizer/sentencepiece_tokenizer.py
+++ b/funasr/tokenizer/sentencepiece_tokenizer.py
@@ -20,6 +20,7 @@
# "TypeError: can't pickle SwigPyObject objects",
# when giving it as argument of "multiprocessing.Process()".
self.sp = None
+ self._build_sentence_piece_processor()
def __repr__(self):
return f'{self.__class__.__name__}(model="{self.bpemodel}")'
@@ -38,10 +39,19 @@
self._build_sentence_piece_processor()
return self.sp.DecodePieces(list(tokens))
- def encode(self, line: str) -> List[int]:
+ def encode(self, line: str, **kwargs) -> List[int]:
self._build_sentence_piece_processor()
return self.sp.EncodeAsIds(line)
- def decode(self, line: List[int]):
+ def decode(self, line: List[int], **kwargs):
self._build_sentence_piece_processor()
return self.sp.DecodeIds(line)
+
+ def get_vocab_size(self):
+ return self.sp.GetPieceSize()
+
+ def ids2tokens(self, *args, **kwargs):
+ return self.decode(*args, **kwargs)
+
+ def tokens2ids(self, *args, **kwargs):
+ return self.encode(*args, **kwargs)
--
Gitblit v1.9.1