From 7ab2e5cf22bbb31808bcacf84c054c710e4e6a93 Mon Sep 17 00:00:00 2001
From: Yabin Li <wucong.lyb@alibaba-inc.com>
Date: Mon, 24 Apr 2023 16:19:17 +0800
Subject: [PATCH] Merge pull request #400 from alibaba-damo-academy/dev_knf
---
funasr/runtime/onnxruntime/src/tokenizer.h | 27 +++++++++++++++++++++++++++
1 files changed, 27 insertions(+), 0 deletions(-)
diff --git a/funasr/runtime/onnxruntime/src/tokenizer.h b/funasr/runtime/onnxruntime/src/tokenizer.h
new file mode 100644
index 0000000..319975a
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/tokenizer.h
@@ -0,0 +1,27 @@
+#pragma once
+#include "yaml-cpp/yaml.h"
+
+class CTokenizer { // Declares conversions between strings and integer ids for tokens and punctuation; member functions are only declared here, not defined.
+private:
+
+ bool m_ready = false; // starts false; presumably set once a YAML vocabulary has been loaded -- TODO confirm in the implementation
+ vector<string> m_id2token,m_id2punc; // string tables, presumably indexed by token id / punctuation id -- confirm. NOTE(review): vector/string/map are used unqualified and this header includes only yaml-cpp, so it relies on the including file.
+ map<string, int> m_token2id,m_punc2id; // string -> int lookups, presumably the inverse of m_id2token / m_id2punc -- confirm
+
+public:
+
+ CTokenizer(const char* sz_yamlfile); // construct from a YAML file path; presumably loads it via OpenYaml -- confirm. NOTE(review): not explicit, so a const char* converts implicitly.
+ CTokenizer(); // default construction; m_ready keeps its in-class initializer (false) unless the definition changes it
+ bool OpenYaml(const char* sz_yamlfile); // takes a YAML file path; the bool result is presumably success/failure -- confirm
+ void ReadYaml(const YAML::Node& node); // takes an already-parsed yaml-cpp node; presumably fills the lookup tables -- confirm
+ vector<string> Id2String(vector<int> input); // ids -> strings; NOTE(review): input is taken by value (copied); handling of out-of-range ids is not visible here
+ vector<int> String2Ids(vector<string> input); // strings -> ids; input is taken by value; handling of unknown strings is not visible here
+ int String2Id(string input); // single string -> id; the value returned for an unknown string is not visible here
+ vector<string> Id2Punc(vector<int> input); // batch overload: punctuation ids -> strings
+ string Id2Punc(int n_punc_id); // single-id overload: one punctuation id -> its string
+ vector<int> Punc2Ids(vector<string> input); // punctuation strings -> ids; input is taken by value
+ vector<string> SplitChineseString(const string& str_info); // splits str_info into pieces; presumably one piece per (multi-byte) character -- TODO confirm the encoding assumption
+ void StrSplit(const string& str, const char split, vector<string>& res); // presumably splits str on the single character `split` and writes the pieces to res -- confirm whether res is cleared first
+ void Tokenize(const char* str_info, vector<string>& str_out, vector<int>& id_out); // str_out and id_out are non-const references, presumably outputs holding the token strings and their ids -- confirm
+
+};
--
Gitblit v1.9.1