From b9bcf1f093c3053fdc4e2cf4a1d38e27bbf429fb Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Thu, 19 Oct 2023 14:03:48 +0800
Subject: [PATCH] docs

---
 funasr/runtime/onnxruntime/src/tokenizer.cpp |   73 +++++++++++++++++++++++++++++++++++-
 1 file changed, 70 insertions(+), 3 deletions(-)

diff --git a/funasr/runtime/onnxruntime/src/tokenizer.cpp b/funasr/runtime/onnxruntime/src/tokenizer.cpp
index a8f6301..a111b91 100644
--- a/funasr/runtime/onnxruntime/src/tokenizer.cpp
+++ b/funasr/runtime/onnxruntime/src/tokenizer.cpp
@@ -17,6 +17,41 @@
 
 CTokenizer::~CTokenizer()
 {
+	delete jieba_dict_trie_;
+    delete jieba_model_;
+}
+
+void CTokenizer::SetJiebaRes(cppjieba::DictTrie *dict, cppjieba::HMMModel *hmm) {
+	jieba_processor_.SetJiebaRes(dict, hmm);
+}
+
+void CTokenizer::JiebaInit(std::string punc_config){
+    if (seg_jieba){
+        std::string model_path = punc_config.substr(0, punc_config.length() - (sizeof(PUNC_CONFIG_NAME)-1));
+        std::string jieba_dict_file = PathAppend(model_path, JIEBA_DICT);
+        std::string jieba_hmm_file = PathAppend(model_path, JIEBA_HMM_MODEL);
+        std::string jieba_userdict_file = PathAppend(model_path, JIEBA_USERDICT);
+		try{
+        	jieba_dict_trie_ = new cppjieba::DictTrie(jieba_dict_file, jieba_userdict_file);
+			LOG(INFO) << "Successfully load file from " << jieba_dict_file << ", " << jieba_userdict_file;
+		}catch(exception const &e){
+			LOG(ERROR) << "Error loading file, Jieba dict file error or not exist.";
+			exit(-1);
+		}
+
+		try{
+        	jieba_model_ = new cppjieba::HMMModel(jieba_hmm_file);
+			LOG(INFO) << "Successfully load model from " << jieba_hmm_file;
+		}catch(exception const &e){
+			LOG(ERROR) << "Error loading file, Jieba hmm file error or not exist.";
+			exit(-1);
+		}
+
+        SetJiebaRes(jieba_dict_trie_, jieba_model_);
+    }else {
+        jieba_dict_trie_ = NULL;
+        jieba_model_ = NULL;
+    }
 }
 
 void CTokenizer::ReadYaml(const YAML::Node& node) 
@@ -50,6 +85,11 @@
 
 	try
 	{
+		YAML::Node conf_seg_jieba = m_Config["seg_jieba"];
+        if (conf_seg_jieba.IsDefined()){
+            seg_jieba = conf_seg_jieba.as<bool>();
+        }
+
 		auto Tokens = m_Config["token_list"];
 		if (Tokens.IsSequence())
 		{
@@ -142,6 +182,14 @@
 	return result;
 }
 
+bool CTokenizer::IsPunc(string& Punc)
+{
+	if (m_punc2id.find(Punc) != m_punc2id.end())
+		return true;
+	else
+		return false;
+}
+
 vector<string> CTokenizer::SplitChineseString(const string & str_info)
 {
 	vector<string> list;
@@ -156,6 +204,14 @@
 		list.push_back(str_info.substr(i, len));
 		i += len;
 	}
+	return list;
+}
+
+vector<string> CTokenizer::SplitChineseJieba(const string & str_info)
+{
+	vector<string> list;
+	jieba_processor_.Cut(str_info, list, false);
+
 	return list;
 }
 
@@ -176,7 +232,7 @@
 	}
 }
 
- void CTokenizer::Tokenize(const char* str_info, vector<string> & str_out, vector<int> & id_out)
+void CTokenizer::Tokenize(const char* str_info, vector<string> & str_out, vector<int> & id_out)
 {
 	vector<string>  strList;
 	StrSplit(str_info,' ', strList);
@@ -192,7 +248,12 @@
 				if (current_chinese.size() > 0)
 				{
 					// for utf-8 chinese
-					auto chineseList = SplitChineseString(current_chinese);
+					vector<string> chineseList;
+					if(seg_jieba){
+						chineseList = SplitChineseJieba(current_chinese);
+					}else{
+						chineseList = SplitChineseString(current_chinese);
+					}
 					str_out.insert(str_out.end(), chineseList.begin(),chineseList.end());
 					current_chinese = "";
 				}
@@ -210,7 +271,13 @@
 		}
 		if (current_chinese.size() > 0)
 		{
-			auto chineseList = SplitChineseString(current_chinese);
+			// for utf-8 chinese
+			vector<string> chineseList;
+			if(seg_jieba){
+				chineseList = SplitChineseJieba(current_chinese);
+			}else{
+				chineseList = SplitChineseString(current_chinese);
+			}
 			str_out.insert(str_out.end(), chineseList.begin(), chineseList.end());
 			current_chinese = "";
 		}

--
Gitblit v1.9.1