From 91231a03f5c16fff0d9d54f859c7a9aa02fd239c Mon Sep 17 00:00:00 2001
From: 雾聪 <wucong.lyb@alibaba-inc.com>
Date: 星期一, 16 十月 2023 14:47:17 +0800
Subject: [PATCH] add jieba for ct-transformer
---
funasr/runtime/onnxruntime/src/ct-transformer.cpp | 1
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/DictTrie.hpp | 274 +++
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PosTagger.hpp | 77 +
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Jieba.hpp | 141 +
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StringUtil.hpp | 405 +++++
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PreFilter.hpp | 54
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentBase.hpp | 46
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BlockingQueue.hpp | 49
funasr/runtime/onnxruntime/src/tokenizer.cpp | 65
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/QuerySegment.hpp | 95 +
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MPSegment.hpp | 144 +
funasr/runtime/onnxruntime/src/tokenizer.h | 11
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedQueue.hpp | 65
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentTagged.hpp | 23
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Trie.hpp | 200 ++
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Colors.hpp | 31
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/FileLock.hpp | 74 +
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/NonCopyable.hpp | 21
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Unicode.hpp | 227 +++
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/KeywordExtractor.hpp | 154 ++
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StdExtension.hpp | 157 ++
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedBlockingQueue.hpp | 67
funasr/runtime/onnxruntime/CMakeLists.txt | 2
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Thread.hpp | 44
funasr/runtime/onnxruntime/include/com-define.h | 4
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/LocalVector.hpp | 139 +
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Closure.hpp | 206 ++
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Config.hpp | 103 +
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/MutexLock.hpp | 51
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMSegment.hpp | 197 ++
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Logging.hpp | 90 +
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ThreadPool.hpp | 86 +
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ArgvContext.hpp | 70
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMModel.hpp | 129 +
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/TextRankExtractor.hpp | 190 ++
funasr/runtime/websocket/CMakeLists.txt | 2
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Condition.hpp | 38
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MixSegment.hpp | 113 +
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Md5.hpp | 411 +++++
funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/FullSegment.hpp | 102 +
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ForcePublic.hpp | 7
41 files changed, 4,362 insertions(+), 3 deletions(-)
diff --git a/funasr/runtime/onnxruntime/CMakeLists.txt b/funasr/runtime/onnxruntime/CMakeLists.txt
index b9a26b2..64d1c82 100644
--- a/funasr/runtime/onnxruntime/CMakeLists.txt
+++ b/funasr/runtime/onnxruntime/CMakeLists.txt
@@ -32,6 +32,8 @@
include_directories(${PROJECT_SOURCE_DIR}/third_party/kaldi-native-fbank)
include_directories(${PROJECT_SOURCE_DIR}/third_party/yaml-cpp/include)
+include_directories(${PROJECT_SOURCE_DIR}/third_party/jieba/include)
+include_directories(${PROJECT_SOURCE_DIR}/third_party/jieba/include/limonp/include)
if(ENABLE_GLOG)
include_directories(${PROJECT_SOURCE_DIR}/third_party/glog/src)
diff --git a/funasr/runtime/onnxruntime/include/com-define.h b/funasr/runtime/onnxruntime/include/com-define.h
index 9f28e15..0ecb9c2 100644
--- a/funasr/runtime/onnxruntime/include/com-define.h
+++ b/funasr/runtime/onnxruntime/include/com-define.h
@@ -107,4 +107,8 @@
#define DUN_INDEX 5
#define CACHE_POP_TRIGGER_LIMIT 200
+#define JIEBA_DICT "jieba.c.dict"
+#define JIEBA_USERDICT "jieba_usr_dict"
+#define JIEBA_HMM_MODEL "jieba.hmm"
+
} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/ct-transformer.cpp b/funasr/runtime/onnxruntime/src/ct-transformer.cpp
index 71a8847..8f8d953 100644
--- a/funasr/runtime/onnxruntime/src/ct-transformer.cpp
+++ b/funasr/runtime/onnxruntime/src/ct-transformer.cpp
@@ -40,6 +40,7 @@
m_szOutputNames.push_back(item.c_str());
m_tokenizer.OpenYaml(punc_config.c_str());
+ m_tokenizer.JiebaInit(punc_config);
}
CTTransformer::~CTTransformer()
diff --git a/funasr/runtime/onnxruntime/src/tokenizer.cpp b/funasr/runtime/onnxruntime/src/tokenizer.cpp
index cd3f027..a111b91 100644
--- a/funasr/runtime/onnxruntime/src/tokenizer.cpp
+++ b/funasr/runtime/onnxruntime/src/tokenizer.cpp
@@ -17,6 +17,41 @@
+// Releases the jieba dictionary trie and HMM model held through
+// jieba_dict_trie_ / jieba_model_.
+// NOTE(review): in the code visible here both pointers are only assigned by
+// JiebaInit(); if a tokenizer can be destroyed without JiebaInit() having run,
+// these deletes act on unset pointers unless they are initialised elsewhere
+// -- confirm.
CTokenizer::~CTokenizer()
{
+ delete jieba_dict_trie_;
+ delete jieba_model_;
+}
+
+// Passes the dictionary trie and the HMM model on to the embedded jieba
+// processor. Ownership of both objects stays with the tokenizer.
+void CTokenizer::SetJiebaRes(cppjieba::DictTrie *dict, cppjieba::HMMModel *hmm) {
+    this->jieba_processor_.SetJiebaRes(dict, hmm);
+}
+
+// Loads the jieba dictionary and HMM model that sit in the same directory as
+// the punctuation config and hands them to the jieba processor.
+//
+// punc_config: path of the punctuation config file; a suffix of the length of
+//              PUNC_CONFIG_NAME is cut off to obtain the model directory.
+//
+// When seg_jieba is false nothing is loaded and both resource pointers are set
+// to NULL. A std::exception raised while loading is logged together with its
+// what() text and the process exits with status -1.
+void CTokenizer::JiebaInit(std::string punc_config){
+    if (!seg_jieba){
+        jieba_dict_trie_ = NULL;
+        jieba_model_ = NULL;
+        return;
+    }
+
+    std::string model_path = punc_config.substr(0, punc_config.length() - (sizeof(PUNC_CONFIG_NAME)-1));
+    std::string jieba_dict_file = PathAppend(model_path, JIEBA_DICT);
+    std::string jieba_hmm_file = PathAppend(model_path, JIEBA_HMM_MODEL);
+    std::string jieba_userdict_file = PathAppend(model_path, JIEBA_USERDICT);
+
+    try{
+        jieba_dict_trie_ = new cppjieba::DictTrie(jieba_dict_file, jieba_userdict_file);
+        LOG(INFO) << "Successfully load file from " << jieba_dict_file << ", " << jieba_userdict_file;
+    }catch(exception const &e){
+        // Keep the original message and append the reason carried by the exception.
+        LOG(ERROR) << "Error loading file, Jieba dict file error or not exist." << " (" << e.what() << ")";
+        exit(-1);
+    }
+
+    try{
+        jieba_model_ = new cppjieba::HMMModel(jieba_hmm_file);
+        LOG(INFO) << "Successfully load model from " << jieba_hmm_file;
+    }catch(exception const &e){
+        LOG(ERROR) << "Error loading file, Jieba hmm file error or not exist." << " (" << e.what() << ")";
+        exit(-1);
+    }
+
+    SetJiebaRes(jieba_dict_trie_, jieba_model_);
}
void CTokenizer::ReadYaml(const YAML::Node& node)
@@ -50,6 +85,11 @@
try
{
+ YAML::Node conf_seg_jieba = m_Config["seg_jieba"];
+ if (conf_seg_jieba.IsDefined()){
+ seg_jieba = conf_seg_jieba.as<bool>();
+ }
+
auto Tokens = m_Config["token_list"];
if (Tokens.IsSequence())
{
@@ -167,6 +207,14 @@
return list;
}
+// Cuts str_info into pieces with the jieba processor (HMM step switched off)
+// and returns the resulting pieces.
+vector<string> CTokenizer::SplitChineseJieba(const string & str_info)
+{
+    const bool use_hmm = false;
+    vector<string> pieces;
+    jieba_processor_.Cut(str_info, pieces, use_hmm);
+    return pieces;
+}
+
void CTokenizer::StrSplit(const string& str, const char split, vector<string>& res)
{
if (str == "")
@@ -184,7 +232,7 @@
}
}
- void CTokenizer::Tokenize(const char* str_info, vector<string> & str_out, vector<int> & id_out)
+void CTokenizer::Tokenize(const char* str_info, vector<string> & str_out, vector<int> & id_out)
{
vector<string> strList;
StrSplit(str_info,' ', strList);
@@ -200,7 +248,12 @@
if (current_chinese.size() > 0)
{
// for utf-8 chinese
- auto chineseList = SplitChineseString(current_chinese);
+ vector<string> chineseList;
+ if(seg_jieba){
+ chineseList = SplitChineseJieba(current_chinese);
+ }else{
+ chineseList = SplitChineseString(current_chinese);
+ }
str_out.insert(str_out.end(), chineseList.begin(),chineseList.end());
current_chinese = "";
}
@@ -218,7 +271,13 @@
}
if (current_chinese.size() > 0)
{
- auto chineseList = SplitChineseString(current_chinese);
+ // for utf-8 chinese
+ vector<string> chineseList;
+ if(seg_jieba){
+ chineseList = SplitChineseJieba(current_chinese);
+ }else{
+ chineseList = SplitChineseString(current_chinese);
+ }
str_out.insert(str_out.end(), chineseList.begin(), chineseList.end());
current_chinese = "";
}
diff --git a/funasr/runtime/onnxruntime/src/tokenizer.h b/funasr/runtime/onnxruntime/src/tokenizer.h
index 3b1d1c5..149161b 100644
--- a/funasr/runtime/onnxruntime/src/tokenizer.h
+++ b/funasr/runtime/onnxruntime/src/tokenizer.h
@@ -5,6 +5,9 @@
#pragma once
#include <yaml-cpp/yaml.h>
+#include "cppjieba/DictTrie.hpp"
+#include "cppjieba/HMMModel.hpp"
+#include "cppjieba/Jieba.hpp"
namespace funasr {
class CTokenizer {
@@ -13,6 +16,10 @@
bool m_ready = false;
vector<string> m_id2token,m_id2punc;
map<string, int> m_token2id,m_punc2id;
+
+    // Jieba resources. The two pointers are assigned in JiebaInit() and deleted
+    // in the destructor; they start out as nullptr so that destroying a
+    // tokenizer on which JiebaInit() never ran does not delete an indeterminate
+    // pointer.
+    cppjieba::DictTrie *jieba_dict_trie_ = nullptr;
+    cppjieba::HMMModel *jieba_model_ = nullptr;
+    cppjieba::Jieba jieba_processor_;
public:
@@ -28,9 +35,13 @@
string Id2Punc(int n_punc_id);
vector<int> Punc2Ids(vector<string> input);
vector<string> SplitChineseString(const string& str_info);
+ vector<string> SplitChineseJieba(const string& str_info);
void StrSplit(const string& str, const char split, vector<string>& res);
void Tokenize(const char* str_info, vector<string>& str_out, vector<int>& id_out);
bool IsPunc(string& Punc);
+ bool seg_jieba = false;
+ void SetJiebaRes(cppjieba::DictTrie *dict, cppjieba::HMMModel *hmm);
+ void JiebaInit(std::string punc_config);
};
} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/DictTrie.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/DictTrie.hpp
new file mode 100644
index 0000000..c219a01
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/DictTrie.hpp
@@ -0,0 +1,274 @@
+#ifndef CPPJIEBA_DICT_TRIE_HPP
+#define CPPJIEBA_DICT_TRIE_HPP
+
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <string>
+#include <cstring>
+#include <cstdlib>
+#include <stdint.h>
+#include <cmath>
+#include <limits>
+#include "limonp/StringUtil.hpp"
+#include "limonp/Logging.hpp"
+#include "Unicode.hpp"
+#include "Trie.hpp"
+
+namespace cppjieba {
+
+using namespace limonp;
+
+const double MIN_DOUBLE = -3.14e+100;
+const double MAX_DOUBLE = 3.14e+100;
+const size_t DICT_COLUMN_NUM = 3;
+const char* const UNKNOWN_TAG = "";
+
+// Word dictionary used by the segmenters. Entries ("word weight tag", one per
+// line, space separated) are read from a dictionary file into
+// static_node_infos_ and indexed by a Trie; words inserted at run time are kept
+// in active_node_infos_.
+class DictTrie {
+ public:
+  // Selects which statistic of the loaded weights becomes the default weight
+  // of a user word that comes without a frequency. Any value other than
+  // WordWeightMin / WordWeightMedian selects the maximum.
+  enum UserWordWeightOption {
+    WordWeightMin,
+    WordWeightMedian,
+    WordWeightMax,
+  }; // enum UserWordWeightOption
+
+  // Loads dict_path and builds the trie. user_dict_paths is accepted but is
+  // not read during construction; user dictionaries are loaded through
+  // LoadUserDict().
+  DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) {
+    Init(dict_path, user_dict_paths, user_word_weight_opt);
+  }
+  ~DictTrie() {
+    delete trie_;
+  }
+
+  // Adds word (with the default user-word weight) to the trie.
+  // Returns false when word cannot be decoded into runes.
+  bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+    DictUnit node_info;
+    if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
+      return false;
+    }
+    active_node_infos_.push_back(node_info);
+    trie_->InsertNode(node_info.word, &active_node_infos_.back());
+    return true;
+  }
+
+  // Adds word with weight log(freq / freq_sum_); a freq of 0 falls back to the
+  // default user-word weight. Returns false when word cannot be decoded.
+  // NOTE(review): freq_sum_ is not assigned anywhere in this class -- confirm
+  // where it is expected to come from before relying on explicit frequencies.
+  bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
+    DictUnit node_info;
+    double weight = freq ? log(1.0 * freq / freq_sum_) : user_word_default_weight_ ;
+    if (!MakeNodeInfo(node_info, word, weight , tag)) {
+      return false;
+    }
+    active_node_infos_.push_back(node_info);
+    trie_->InsertNode(node_info.word, &active_node_infos_.back());
+    return true;
+  }
+
+  // Asks the trie to delete the node for word.
+  // Returns false when word cannot be decoded.
+  bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+    DictUnit node_info;
+    if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) {
+      return false;
+    }
+    trie_->DeleteNode(node_info.word, &node_info);
+    return true;
+  }
+
+  // Looks up the rune range [begin, end) in the trie.
+  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+    return trie_->Find(begin, end);
+  }
+
+  // Fills res with the DAG the trie produces for [begin, end), considering
+  // words of at most max_word_len runes.
+  void Find(RuneStrArray::const_iterator begin,
+        RuneStrArray::const_iterator end,
+        vector<struct Dag>&res,
+        size_t max_word_len = MAX_WORD_LENGTH) const {
+    trie_->Find(begin, end, res, max_word_len);
+  }
+
+  // Returns true when word is present in the trie. A word that cannot be
+  // decoded is reported as absent instead of being looked up anyway.
+  bool Find(const string& word)
+  {
+    RuneStrArray runes;
+    if (!DecodeRunesInString(word, runes))
+    {
+      XLOG(ERROR) << "Decode failed.";
+      return false;
+    }
+    return Find(runes.begin(), runes.end()) != NULL;
+  }
+
+  // True when word was recorded as a single-rune user dictionary entry.
+  bool IsUserDictSingleChineseWord(const Rune& word) const {
+    return IsIn(user_dict_single_chinese_word_, word);
+  }
+
+  // Smallest weight among the dictionary entries loaded at construction.
+  double GetMinWeight() const {
+    return min_weight_;
+  }
+
+  // Parses one user dictionary line and appends it to static_node_infos_.
+  // Accepted layouts (space separated): "word", "word tag", "word freq tag".
+  // Lines with any other column count, or whose word cannot be decoded, are
+  // logged and skipped so that no half-filled entry is stored.
+  // NOTE(review): the entry is only appended to static_node_infos_; it is not
+  // inserted into trie_ here, and appending may reallocate the vector whose
+  // element addresses CreateTrie() handed to the Trie -- confirm this is safe
+  // when called after construction.
+  void InserUserDictNode(const string& line) {
+    vector<string> buf;
+    DictUnit node_info;
+    Split(line, buf, " ");
+    bool parsed = false;
+    if (buf.size() == 1) {
+      parsed = MakeNodeInfo(node_info,
+            buf[0],
+            user_word_default_weight_,
+            UNKNOWN_TAG);
+    } else if (buf.size() == 2) {
+      parsed = MakeNodeInfo(node_info,
+            buf[0],
+            user_word_default_weight_,
+            buf[1]);
+    } else if (buf.size() == 3) {
+      int freq = atoi(buf[1].c_str());
+      assert(freq_sum_ > 0.0);
+      double weight = log(1.0 * freq / freq_sum_);
+      parsed = MakeNodeInfo(node_info, buf[0], weight, buf[2]);
+    } else {
+      XLOG(ERROR) << "user dict line illegal, line:" << line;
+    }
+    if (!parsed) {
+      return;
+    }
+    static_node_infos_.push_back(node_info);
+    if (node_info.word.size() == 1) {
+      user_dict_single_chinese_word_.insert(node_info.word[0]);
+    }
+  }
+
+  // Feeds every element of buf to InserUserDictNode().
+  void LoadUserDict(const vector<string>& buf) {
+    for (size_t i = 0; i < buf.size(); i++) {
+      InserUserDictNode(buf[i]);
+    }
+  }
+
+  // Feeds every element of buf to InserUserDictNode().
+  void LoadUserDict(const set<string>& buf) {
+    std::set<string>::const_iterator iter;
+    for (iter = buf.begin(); iter != buf.end(); iter++){
+      InserUserDictNode(*iter);
+    }
+  }
+
+  // Reads the files named in filePaths (separated by '|' or ';') and feeds
+  // every non-empty line to InserUserDictNode().
+  void LoadUserDict(const string& filePaths) {
+    vector<string> files = limonp::Split(filePaths, "|;");
+    for (size_t i = 0; i < files.size(); i++) {
+      ifstream ifs(files[i].c_str());
+      XCHECK(ifs.is_open()) << "open " << files[i] << " failed";
+      string line;
+
+      while (getline(ifs, line)) {
+        if (line.size() == 0) {
+          continue;
+        }
+        InserUserDictNode(line);
+      }
+    }
+  }
+
+
+ private:
+  // Loads the dictionary, derives the weight statistics from it and builds the
+  // trie. user_dict_paths is not used here.
+  void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) {
+    LoadDict(dict_path);
+    // Without this call min_weight_, max_weight_, median_weight_ and
+    // user_word_default_weight_ would never be written although
+    // GetMinWeight(), InsertUserWord() and DeleteUserWord() read them.
+    // The weights themselves are left exactly as loaded.
+    if (!static_node_infos_.empty()) {
+      SetStaticWordWeights(user_word_weight_opt);
+    }
+    Shrink(static_node_infos_);
+    CreateTrie(static_node_infos_);
+  }
+
+  // Builds trie_ from dictUnits; the trie receives the address of every
+  // element, so dictUnits must outlive it.
+  void CreateTrie(const vector<DictUnit>& dictUnits) {
+    assert(dictUnits.size());
+    vector<Unicode> words;
+    vector<const DictUnit*> valuePointers;
+    for (size_t i = 0 ; i < dictUnits.size(); i ++) {
+      words.push_back(dictUnits[i].word);
+      valuePointers.push_back(&dictUnits[i]);
+    }
+    trie_ = new Trie(words, valuePointers);
+  }
+
+  // Fills node_info from word / weight / tag.
+  // Returns false (after logging) when word cannot be decoded into runes.
+  bool MakeNodeInfo(DictUnit& node_info,
+        const string& word,
+        double weight,
+        const string& tag) {
+    if (!DecodeRunesInString(word, node_info.word)) {
+      XLOG(ERROR) << "Decode " << word << " failed.";
+      return false;
+    }
+    node_info.weight = weight;
+    node_info.tag = tag;
+    return true;
+  }
+
+  // Reads filePath line by line; each line must split into exactly
+  // DICT_COLUMN_NUM space separated columns: word, weight, tag.
+  void LoadDict(const string& filePath) {
+    ifstream ifs(filePath.c_str());
+    XCHECK(ifs.is_open()) << "open " << filePath << " failed.";
+    string line;
+    vector<string> buf;
+
+    DictUnit node_info;
+    while (getline(ifs, line)) {
+      Split(line, buf, " ");
+      XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line;
+      MakeNodeInfo(node_info,
+            buf[0],
+            atof(buf[1].c_str()),
+            buf[2]);
+      static_node_infos_.push_back(node_info);
+    }
+  }
+
+  // Orders dictionary units by ascending weight.
+  static bool WeightCompare(const DictUnit& lhs, const DictUnit& rhs) {
+    return lhs.weight < rhs.weight;
+  }
+
+  // Computes min / max / median of the loaded weights (on a sorted copy) and
+  // picks the default user-word weight according to option.
+  void SetStaticWordWeights(UserWordWeightOption option) {
+    XCHECK(!static_node_infos_.empty());
+    vector<DictUnit> x = static_node_infos_;
+    sort(x.begin(), x.end(), WeightCompare);
+    min_weight_ = x[0].weight;
+    max_weight_ = x[x.size() - 1].weight;
+    median_weight_ = x[x.size() / 2].weight;
+    switch (option) {
+     case WordWeightMin:
+       user_word_default_weight_ = min_weight_;
+       break;
+     case WordWeightMedian:
+       user_word_default_weight_ = median_weight_;
+       break;
+     default:
+       user_word_default_weight_ = max_weight_;
+       break;
+    }
+  }
+
+  // Sum of the weight fields of node_infos.
+  double CalcFreqSum(const vector<DictUnit>& node_infos) const {
+    double sum = 0.0;
+    for (size_t i = 0; i < node_infos.size(); i++) {
+      sum += node_infos[i].weight;
+    }
+    return sum;
+  }
+
+  // Replaces every weight w in node_infos by log(w / sum).
+  void CalculateWeight(vector<DictUnit>& node_infos, double sum) const {
+    assert(sum > 0.0);
+    for (size_t i = 0; i < node_infos.size(); i++) {
+      DictUnit& node_info = node_infos[i];
+      assert(node_info.weight > 0.0);
+      node_info.weight = log(double(node_info.weight)/sum);
+    }
+  }
+
+  // Copy-and-swap to drop the vector's spare capacity.
+  void Shrink(vector<DictUnit>& units) const {
+    vector<DictUnit>(units.begin(), units.end()).swap(units);
+  }
+
+  vector<DictUnit> static_node_infos_;
+  // Addresses of the elements are handed to the trie on insertion, and
+  // std::deque::push_back keeps references to existing elements valid.
+  deque<DictUnit> active_node_infos_; // must not be vector
+  Trie * trie_;
+
+  double freq_sum_;
+  double min_weight_;
+  double max_weight_;
+  double median_weight_;
+  double user_word_default_weight_;
+  unordered_set<Rune> user_dict_single_chinese_word_;
+};
+}
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/FullSegment.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/FullSegment.hpp
new file mode 100644
index 0000000..2295ceb
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/FullSegment.hpp
@@ -0,0 +1,102 @@
+#ifndef CPPJIEBA_FULLSEGMENT_H
+#define CPPJIEBA_FULLSEGMENT_H
+
+#include <algorithm>
+#include <set>
+#include <cassert>
+#include "limonp/Logging.hpp"
+#include "DictTrie.hpp"
+#include "SegmentBase.hpp"
+#include "Unicode.hpp"
+
+namespace cppjieba {
+// Segmenter that reports the dictionary words found at every position of the
+// input: each DAG edge of two or more runes is emitted, single-rune edges only
+// when they are the sole option at a position not already covered by an
+// earlier dictionary word.
+class FullSegment: public SegmentBase {
+ public:
+  // Builds and owns a DictTrie loaded from dictPath.
+  FullSegment(const string& dictPath) {
+    dictTrie_ = new DictTrie(dictPath);
+    isNeedDestroy_ = true;
+  }
+  // Borrows dictTrie; the caller keeps ownership.
+  FullSegment(const DictTrie* dictTrie)
+    : dictTrie_(dictTrie), isNeedDestroy_(false) {
+    assert(dictTrie_);
+  }
+  // Creates a segmenter without dictionary; setRes() must be called before
+  // Cut(). isNeedDestroy_ is set explicitly because the destructor reads it.
+  FullSegment()
+    : dictTrie_(NULL), isNeedDestroy_(false) {
+  }
+  ~FullSegment() {
+    if (isNeedDestroy_) {
+      delete dictTrie_;
+    }
+  }
+
+  // Switches to a borrowed dictionary (not owned by this object).
+  void setRes(DictTrie *&dictTrie) {
+    dictTrie_ = dictTrie;
+    isNeedDestroy_ = false;
+    assert(dictTrie_);
+  }
+  // Cuts sentence and returns the words as plain strings.
+  void Cut(const string& sentence,
+        vector<string>& words) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp);
+    GetStringsFromWords(tmp, words);
+  }
+  // Cuts sentence range by range (as delivered by PreFilter) and converts the
+  // collected word ranges into words; words is cleared first.
+  void Cut(const string& sentence,
+        vector<Word>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size()/2);
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      Cut(range.begin, range.end, wrs);
+    }
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
+  }
+  // Appends to res the word ranges found in [begin, end).
+  void Cut(RuneStrArray::const_iterator begin,
+        RuneStrArray::const_iterator end,
+        vector<WordRange>& res) const {
+    // max index of res's words
+    size_t maxIdx = 0;
+
+    // always equals to (uItr - begin)
+    size_t uIdx = 0;
+
+    // length of the most recent dictionary hit; it keeps its previous value
+    // when the current edge has no dictionary unit
+    size_t wordLen = 0;
+    assert(dictTrie_);
+    vector<struct Dag> dags;
+    dictTrie_->Find(begin, end, dags);
+    for (size_t i = 0; i < dags.size(); i++) {
+      for (size_t j = 0; j < dags[i].nexts.size(); j++) {
+        size_t nextoffset = dags[i].nexts[j].first;
+        assert(nextoffset < dags.size());
+        const DictUnit* du = dags[i].nexts[j].second;
+        if (du == NULL) {
+          if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) {
+            WordRange wr(begin + i, begin + nextoffset);
+            res.push_back(wr);
+          }
+        } else {
+          wordLen = du->word.size();
+          if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) {
+            WordRange wr(begin + i, begin + nextoffset);
+            res.push_back(wr);
+          }
+        }
+        maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx;
+      }
+      uIdx++;
+    }
+  }
+ private:
+  const DictTrie* dictTrie_;   // dictionary in use; owned only if isNeedDestroy_
+  bool isNeedDestroy_;         // true when the destructor must delete dictTrie_
+};
+}
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMModel.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMModel.hpp
new file mode 100644
index 0000000..27e6b66
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMModel.hpp
@@ -0,0 +1,129 @@
+#ifndef CPPJIEBA_HMMMODEL_H
+#define CPPJIEBA_HMMMODEL_H
+
+#include "limonp/StringUtil.hpp"
+#include "Trie.hpp"
+
+namespace cppjieba {
+
+using namespace limonp;
+typedef unordered_map<Rune, double> EmitProbMap;
+
+// Parameters of the four-status (B/E/M/S) model used by HMMSegment: start
+// probabilities, a status transition matrix and one emission table per status.
+// Everything is read from a text file by LoadModel().
+struct HMMModel {
+ /*
+ * STATUS:
+ * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S
+ * */
+ enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4};
+
+ // Zeroes the probability arrays, registers the emission tables in
+ // emitProbVec in status order (B, E, M, S) and loads modelPath.
+ HMMModel(const string& modelPath) {
+ memset(startProb, 0, sizeof(startProb));
+ memset(transProb, 0, sizeof(transProb));
+ statMap[0] = 'B';
+ statMap[1] = 'E';
+ statMap[2] = 'M';
+ statMap[3] = 'S';
+ emitProbVec.push_back(&emitProbB);
+ emitProbVec.push_back(&emitProbE);
+ emitProbVec.push_back(&emitProbM);
+ emitProbVec.push_back(&emitProbS);
+ LoadModel(modelPath);
+ }
+ ~HMMModel() {
+ }
+ // Reads the model file. Expected order of the significant lines (blank
+ // lines and lines starting with '#' are skipped by GetLine):
+ // 1 line with STATUS_SUM space separated start probabilities,
+ // STATUS_SUM lines with STATUS_SUM space separated transition values each,
+ // then one emission line each for B, E, M and S.
+ // Every step is guarded by XCHECK.
+ void LoadModel(const string& filePath) {
+ ifstream ifile(filePath.c_str());
+ XCHECK(ifile.is_open()) << "open " << filePath << " failed";
+ string line;
+ vector<string> tmp;
+ vector<string> tmp2;
+ //Load startProb
+ XCHECK(GetLine(ifile, line));
+ Split(line, tmp, " ");
+ XCHECK(tmp.size() == STATUS_SUM);
+ for (size_t j = 0; j< tmp.size(); j++) {
+ startProb[j] = atof(tmp[j].c_str());
+ }
+
+ //Load transProb
+ for (size_t i = 0; i < STATUS_SUM; i++) {
+ XCHECK(GetLine(ifile, line));
+ Split(line, tmp, " ");
+ XCHECK(tmp.size() == STATUS_SUM);
+ for (size_t j =0; j < STATUS_SUM; j++) {
+ transProb[i][j] = atof(tmp[j].c_str());
+ }
+ }
+
+ //Load emitProbB
+ XCHECK(GetLine(ifile, line));
+ XCHECK(LoadEmitProb(line, emitProbB));
+
+ //Load emitProbE
+ XCHECK(GetLine(ifile, line));
+ XCHECK(LoadEmitProb(line, emitProbE));
+
+ //Load emitProbM
+ XCHECK(GetLine(ifile, line));
+ XCHECK(LoadEmitProb(line, emitProbM));
+
+ //Load emitProbS
+ XCHECK(GetLine(ifile, line));
+ XCHECK(LoadEmitProb(line, emitProbS));
+ }
+ // Returns the value stored for key in *ptMp, or defVal when key is absent.
+ double GetEmitProb(const EmitProbMap* ptMp, Rune key,
+ double defVal)const {
+ EmitProbMap::const_iterator cit = ptMp->find(key);
+ if (cit == ptMp->end()) {
+ return defVal;
+ }
+ return cit->second;
+ }
+ // Reads the next significant line into line (trimmed); skips empty lines
+ // and lines starting with "#". Returns false when the stream is exhausted.
+ bool GetLine(ifstream& ifile, string& line) {
+ while (getline(ifile, line)) {
+ Trim(line);
+ if (line.empty()) {
+ continue;
+ }
+ if (StartsWith(line, "#")) {
+ continue;
+ }
+ return true;
+ }
+ return false;
+ }
+ // Parses one emission line of the form "rune:value,rune:value,..." into mp.
+ // Returns false for an empty line, for an item that does not split into
+ // exactly two parts at ':', or when the key does not decode to one rune.
+ bool LoadEmitProb(const string& line, EmitProbMap& mp) {
+ if (line.empty()) {
+ return false;
+ }
+ vector<string> tmp, tmp2;
+ Unicode unicode;
+ Split(line, tmp, ",");
+ for (size_t i = 0; i < tmp.size(); i++) {
+ Split(tmp[i], tmp2, ":");
+ if (2 != tmp2.size()) {
+ XLOG(ERROR) << "emitProb illegal.";
+ return false;
+ }
+ if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) {
+ XLOG(ERROR) << "TransCode failed.";
+ return false;
+ }
+ mp[unicode[0]] = atof(tmp2[1].c_str());
+ }
+ return true;
+ }
+
+ // Status index -> letter ('B', 'E', 'M', 'S').
+ char statMap[STATUS_SUM];
+ // Start value per status.
+ double startProb[STATUS_SUM];
+ // transProb[from][to], filled row by row from the model file.
+ double transProb[STATUS_SUM][STATUS_SUM];
+ // Emission tables, one per status.
+ EmitProbMap emitProbB;
+ EmitProbMap emitProbE;
+ EmitProbMap emitProbM;
+ EmitProbMap emitProbS;
+ // Pointers to the four tables above, indexed by status (B, E, M, S).
+ vector<EmitProbMap* > emitProbVec;
+}; // struct HMMModel
+
+} // namespace cppjieba
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMSegment.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMSegment.hpp
new file mode 100644
index 0000000..91a311d
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMSegment.hpp
@@ -0,0 +1,197 @@
+#ifndef CPPJIBEA_HMMSEGMENT_H
+#define CPPJIBEA_HMMSEGMENT_H
+
+#include <iostream>
+#include <fstream>
+#include <memory.h>
+#include <cassert>
+#include "HMMModel.hpp"
+#include "SegmentBase.hpp"
+
+namespace cppjieba {
+// Segmenter driven by an HMMModel: runs of non-ASCII runes are labelled with
+// the B/E/M/S statuses by Viterbi decoding and cut after every E or S, while
+// ASCII runes are grouped by simple letter / number rules.
+class HMMSegment: public SegmentBase {
+ public:
+  // Builds and owns an HMMModel loaded from filePath.
+  HMMSegment(const string& filePath)
+    : model_(new HMMModel(filePath)), isNeedDestroy_(true) {
+  }
+  // Borrows model; the caller keeps ownership.
+  HMMSegment(const HMMModel* model)
+    : model_(model), isNeedDestroy_(false) {
+  }
+  // Creates a segmenter without model; setRes() must be called before Cut().
+  // isNeedDestroy_ is set explicitly because the destructor reads it.
+  HMMSegment()
+    : model_(NULL), isNeedDestroy_(false) {
+  }
+
+  ~HMMSegment() {
+    if (isNeedDestroy_) {
+      delete model_;
+    }
+  }
+  // Switches to a borrowed model (not owned by this object).
+  void setRes(HMMModel *&model) {
+    model_ = model;
+    isNeedDestroy_ = false;
+  }
+  // Cuts sentence and returns the words as plain strings.
+  void Cut(const string& sentence,
+        vector<string>& words) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp);
+    GetStringsFromWords(tmp, words);
+  }
+  // Cuts sentence range by range (as delivered by PreFilter) and converts the
+  // collected word ranges into words; words is cleared first.
+  void Cut(const string& sentence,
+        vector<Word>& words) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size()/2);
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      Cut(range.begin, range.end, wrs);
+    }
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
+  }
+  // Appends to res the word ranges found in [begin, end). Runes below 0x80
+  // split the input: the non-ASCII run before them goes through InternalCut(),
+  // the ASCII part becomes one range per letter sequence, number or single rune.
+  void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
+    RuneStrArray::const_iterator left = begin;
+    RuneStrArray::const_iterator right = begin;
+    while (right != end) {
+      if (right->rune < 0x80) {
+        if (left != right) {
+          InternalCut(left, right, res);
+        }
+        left = right;
+        // try the letter rule, then the number rule, else take a single rune
+        do {
+          right = SequentialLetterRule(left, end);
+          if (right != left) {
+            break;
+          }
+          right = NumbersRule(left, end);
+          if (right != left) {
+            break;
+          }
+          right ++;
+        } while (false);
+        WordRange wr(left, right - 1);
+        res.push_back(wr);
+        left = right;
+      } else {
+        right++;
+      }
+    }
+    if (left != right) {
+      InternalCut(left, right, res);
+    }
+  }
+ private:
+  // sequential letters rule: a letter followed by any number of letters or
+  // digits. Returns the end of the match, or begin when *begin is no letter.
+  RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+    Rune x = begin->rune;
+    if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) {
+      begin ++;
+    } else {
+      return begin;
+    }
+    while (begin != end) {
+      x = begin->rune;
+      if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) {
+        begin ++;
+      } else {
+        break;
+      }
+    }
+    return begin;
+  }
+  // numbers rule: a digit followed by any number of digits or '.'.
+  // Returns the end of the match, or begin when *begin is no digit.
+  RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+    Rune x = begin->rune;
+    if ('0' <= x && x <= '9') {
+      begin ++;
+    } else {
+      return begin;
+    }
+    while (begin != end) {
+      x = begin->rune;
+      if ( ('0' <= x && x <= '9') || x == '.') {
+        begin++;
+      } else {
+        break;
+      }
+    }
+    return begin;
+  }
+  // Labels [begin, end) with Viterbi() and closes a word range after every
+  // rune whose status is E or S.
+  void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res) const {
+    vector<size_t> status;
+    Viterbi(begin, end, status);
+
+    RuneStrArray::const_iterator left = begin;
+    RuneStrArray::const_iterator right;
+    for (size_t i = 0; i < status.size(); i++) {
+      if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i])
+        right = begin + i + 1;
+        WordRange wr(left, right - 1);
+        res.push_back(wr);
+        left = right;
+      }
+    }
+  }
+
+  // Computes the best status sequence for [begin, end) and stores one status
+  // per rune in status. weight and path are X*Y tables indexed as
+  // [x + y * X] (x: rune position, y: status). The sequence is forced to end
+  // in E or S.
+  void Viterbi(RuneStrArray::const_iterator begin,
+        RuneStrArray::const_iterator end,
+        vector<size_t>& status) const {
+    size_t Y = HMMModel::STATUS_SUM;
+    size_t X = end - begin;
+
+    // Nothing to label; the code below would index empty tables.
+    if (X == 0) {
+      status.clear();
+      return;
+    }
+
+    size_t XYSize = X * Y;
+    size_t now, old, stat;
+    double tmp, endE, endS;
+
+    vector<int> path(XYSize);
+    vector<double> weight(XYSize);
+
+    //start
+    for (size_t y = 0; y < Y; y++) {
+      weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE);
+      path[0 + y * X] = -1;
+    }
+
+    double emitProb;
+
+    for (size_t x = 1; x < X; x++) {
+      for (size_t y = 0; y < Y; y++) {
+        now = x + y*X;
+        weight[now] = MIN_DOUBLE;
+        path[now] = HMMModel::E; // warning
+        emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE);
+        for (size_t preY = 0; preY < Y; preY++) {
+          old = x - 1 + preY * X;
+          tmp = weight[old] + model_->transProb[preY][y] + emitProb;
+          if (tmp > weight[now]) {
+            weight[now] = tmp;
+            path[now] = preY;
+          }
+        }
+      }
+    }
+
+    endE = weight[X-1+HMMModel::E*X];
+    endS = weight[X-1+HMMModel::S*X];
+    stat = 0;
+    if (endE >= endS) {
+      stat = HMMModel::E;
+    } else {
+      stat = HMMModel::S;
+    }
+
+    // walk the back-pointers from the last rune to the first
+    status.resize(X);
+    for (int x = X -1 ; x >= 0; x--) {
+      status[x] = stat;
+      stat = path[x + stat*X];
+    }
+  }
+
+  const HMMModel* model_;   // model in use; owned only if isNeedDestroy_
+  bool isNeedDestroy_;      // true when the destructor must delete model_
+}; // class HMMSegment
+
+} // namespace cppjieba
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Jieba.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Jieba.hpp
new file mode 100644
index 0000000..0e778f9
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Jieba.hpp
@@ -0,0 +1,141 @@
+#ifndef CPPJIEAB_JIEBA_H
+#define CPPJIEAB_JIEBA_H
+
+#include "QuerySegment.hpp"
+#include "KeywordExtractor.hpp"
+
+namespace cppjieba {
+
+// Facade bundling the five segmenters (MP, HMM, Mix, Full, Query) around one
+// shared dictionary trie and one shared HMM model. The class stores the two
+// pointers it is given; it does not delete them.
+class Jieba {
+ public:
+ // Wires every segmenter to the given dictionary and model. The segmenters
+ // are initialised from the members dict_trie_ / model_, which works because
+ // those two members are declared before the segmenters.
+ Jieba(DictTrie *jieba_dict_trie,
+ HMMModel *jieba_model)
+ : dict_trie_(jieba_dict_trie),
+ model_(jieba_model),
+ mp_seg_(dict_trie_),
+ hmm_seg_(model_),
+ mix_seg_(dict_trie_, model_),
+ full_seg_(dict_trie_),
+ query_seg_(dict_trie_, model_) {
+ }
+ // Creates an unconfigured instance; SetJiebaRes() has to be called before
+ // any method that uses dict_trie_ or the segmenters.
+ Jieba() {
+ dict_trie_ = NULL;
+ model_ = NULL;
+ }
+ ~Jieba() {
+ }
+
+ struct LocWord {
+ string word;
+ size_t begin;
+ size_t end;
+ }; // struct LocWord
+ // Replaces the dictionary and model used by this object and by all five
+ // segmenters.
+ void SetJiebaRes(cppjieba::DictTrie *&dict, cppjieba::HMMModel *&hmm) {
+ dict_trie_ = dict;
+ model_ = hmm;
+ mp_seg_.setRes(dict);
+ hmm_seg_.setRes(hmm);
+ mix_seg_.setRes(dict, hmm);
+ full_seg_.setRes(dict);
+ query_seg_.setRes(dict, hmm);
+ }
+ // Cut: forwards to the Mix segmenter; hmm is passed through unchanged.
+ void Cut(const string& sentence, vector<string>& words, bool hmm = true) const {
+ mix_seg_.Cut(sentence, words, hmm);
+ }
+ void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
+ mix_seg_.Cut(sentence, words, hmm);
+ }
+ // CutAll: forwards to the Full segmenter.
+ void CutAll(const string& sentence, vector<string>& words) const {
+ full_seg_.Cut(sentence, words);
+ }
+ void CutAll(const string& sentence, vector<Word>& words) const {
+ full_seg_.Cut(sentence, words);
+ }
+ // CutForSearch: forwards to the Query segmenter.
+ void CutForSearch(const string& sentence, vector<string>& words, bool hmm = true) const {
+ query_seg_.Cut(sentence, words, hmm);
+ }
+ void CutForSearch(const string& sentence, vector<Word>& words, bool hmm = true) const {
+ query_seg_.Cut(sentence, words, hmm);
+ }
+ // CutHMM: forwards to the HMM segmenter.
+ void CutHMM(const string& sentence, vector<string>& words) const {
+ hmm_seg_.Cut(sentence, words);
+ }
+ void CutHMM(const string& sentence, vector<Word>& words) const {
+ hmm_seg_.Cut(sentence, words);
+ }
+ // CutSmall: forwards to the MP segmenter with an upper bound on word length.
+ void CutSmall(const string& sentence, vector<string>& words, size_t max_word_len) const {
+ mp_seg_.Cut(sentence, words, max_word_len);
+ }
+ void CutSmall(const string& sentence, vector<Word>& words, size_t max_word_len) const {
+ mp_seg_.Cut(sentence, words, max_word_len);
+ }
+
+ // Tag / LookupTag: forwarded to the Mix segmenter.
+ void Tag(const string& sentence, vector<pair<string, string> >& words) const {
+ mix_seg_.Tag(sentence, words);
+ }
+ string LookupTag(const string &str) const {
+ return mix_seg_.LookupTag(str);
+ }
+ // The dictionary operations below dereference dict_trie_ without a null
+ // check, so they require a dictionary to have been set.
+ bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+ return dict_trie_->InsertUserWord(word, tag);
+ }
+
+ bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) {
+ return dict_trie_->InsertUserWord(word,freq, tag);
+ }
+
+ bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) {
+ return dict_trie_->DeleteUserWord(word, tag);
+ }
+
+ bool Find(const string& word)
+ {
+ return dict_trie_->Find(word);
+ }
+
+ // Applies the separator set s to all five segmenters.
+ void ResetSeparators(const string& s) {
+ //TODO
+ mp_seg_.ResetSeparators(s);
+ hmm_seg_.ResetSeparators(s);
+ mix_seg_.ResetSeparators(s);
+ full_seg_.ResetSeparators(s);
+ query_seg_.ResetSeparators(s);
+ }
+
+ // Read-only access to the shared dictionary (may be NULL).
+ const DictTrie* GetDictTrie() const {
+ return dict_trie_;
+ }
+
+ // Read-only access to the shared model (may be NULL).
+ const HMMModel* GetHMMModel() const {
+ return model_;
+ }
+
+ // LoadUserDict: forwarded to the dictionary; requires dict_trie_ to be set.
+ void LoadUserDict(const vector<string>& buf) {
+ dict_trie_->LoadUserDict(buf);
+ }
+
+ void LoadUserDict(const set<string>& buf) {
+ dict_trie_->LoadUserDict(buf);
+ }
+
+ void LoadUserDict(const string& path) {
+ dict_trie_->LoadUserDict(path);
+ }
+
+ private:
+ // Declared first: the constructor initialises the segmenters from them.
+ DictTrie *dict_trie_;
+ HMMModel *model_;
+
+ // They share the same dict trie and model
+ MPSegment mp_seg_;
+ HMMSegment hmm_seg_;
+ MixSegment mix_seg_;
+ FullSegment full_seg_;
+ QuerySegment query_seg_;
+
+ public:
+}; // class Jieba
+
+} // namespace cppjieba
+
+#endif // CPPJIEAB_JIEBA_H
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/KeywordExtractor.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/KeywordExtractor.hpp
new file mode 100644
index 0000000..15b50b9
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/KeywordExtractor.hpp
@@ -0,0 +1,154 @@
+#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H
+#define CPPJIEBA_KEYWORD_EXTRACTOR_H
+
+#include <cmath>
+#include <set>
+#include "MixSegment.hpp"
+
+namespace cppjieba {
+
+using namespace limonp;
+using namespace std;
+
+/*utf8*/
+class KeywordExtractor {
+ public:
+  // One keyword candidate produced by Extract().
+  struct Word {
+    string word;             // keyword text
+    vector<size_t> offsets;  // byte offset of every occurrence inside the sentence
+    double weight;           // occurrence count multiplied by the word's IDF
+  }; // struct Word
+
+  // Builds a private segmenter from the dictionary files, then loads the IDF
+  // table (idfPath) and the stop-word list (stopWordPath).
+  KeywordExtractor(const string& dictPath,
+        const string& hmmFilePath,
+        const string& idfPath,
+        const string& stopWordPath,
+        const string& userDict = "")
+    : segment_(dictPath, hmmFilePath, userDict), idfAverage_(0.0) {
+    LoadIdfDict(idfPath);
+    LoadStopWordDict(stopWordPath);
+  }
+  // Same as above but hands an existing dictionary trie and HMM model to the
+  // segmenter instead of file paths.
+  KeywordExtractor(const DictTrie* dictTrie,
+        const HMMModel* model,
+        const string& idfPath,
+        const string& stopWordPath)
+    : segment_(dictTrie, model), idfAverage_(0.0) {
+    LoadIdfDict(idfPath);
+    LoadStopWordDict(stopWordPath);
+  }
+  // Creates an extractor with no IDF table and no stop words loaded.
+  // idfAverage_ is zeroed so it never holds an indeterminate value.
+  KeywordExtractor() : idfAverage_(0.0) {}
+  ~KeywordExtractor() {
+  }
+
+  // Appends the text of the top `topN` keywords of `sentence` to `keywords`
+  // (existing content of `keywords` is kept).
+  void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
+    vector<Word> topWords;
+    Extract(sentence, topWords, topN);
+    for (size_t i = 0; i < topWords.size(); i++) {
+      keywords.push_back(topWords[i].word);
+    }
+  }
+
+  // Appends (keyword, weight) pairs for the top `topN` keywords to `keywords`
+  // (existing content of `keywords` is kept).
+  void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
+    vector<Word> topWords;
+    Extract(sentence, topWords, topN);
+    for (size_t i = 0; i < topWords.size(); i++) {
+      keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
+    }
+  }
+
+  // Core extraction: segments `sentence`, counts every word that is neither a
+  // single word (IsSingleWord) nor a stop word, multiplies each count by the
+  // word's IDF (idfAverage_ when the word is not in the table) and leaves the
+  // `topN` heaviest entries, sorted by descending weight, in `keywords`.
+  // If the segmented pieces do not add up to the sentence length an error is
+  // logged and `keywords` is left untouched.
+  void Extract(const string& sentence, vector<Word>& keywords, size_t topN) const {
+    vector<string> words;
+    segment_.Cut(sentence, words);
+
+    map<string, Word> wordmap;
+    size_t offset = 0;
+    for (size_t i = 0; i < words.size(); ++i) {
+      size_t t = offset;
+      offset += words[i].size();
+      if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
+        continue;
+      }
+      // operator[] value-initialises a new entry, so weight starts at 0.
+      Word& entry = wordmap[words[i]];
+      entry.offsets.push_back(t);
+      entry.weight += 1.0;
+    }
+    if (offset != sentence.size()) {
+      XLOG(ERROR) << "words illegal";
+      return;
+    }
+
+    keywords.clear();
+    keywords.reserve(wordmap.size());
+    for (map<string, Word>::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+      unordered_map<string, double>::const_iterator cit = idfMap_.find(itr->first);
+      if (cit != idfMap_.end()) {
+        itr->second.weight *= cit->second;
+      } else {
+        itr->second.weight *= idfAverage_;
+      }
+      itr->second.word = itr->first;
+      keywords.push_back(itr->second);
+    }
+    topN = min(topN, keywords.size());
+    partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+    keywords.resize(topN);
+  }
+ private:
+  // Reads "<word> <idf>" lines from `idfPath` into idfMap_ and sets
+  // idfAverage_ to the mean IDF of the entries that were actually loaded.
+  // Blank lines and lines without exactly two fields are logged and skipped;
+  // they do not take part in the average.
+  void LoadIdfDict(const string& idfPath) {
+    ifstream ifs(idfPath.c_str());
+    XCHECK(ifs.is_open()) << "open " << idfPath << " failed";
+    string line ;
+    vector<string> buf;
+    double idfSum = 0.0;
+    size_t loaded = 0;   // number of well-formed entries
+    size_t lineno = 0;   // physical line number, used for diagnostics only
+    for (; getline(ifs, line); lineno++) {
+      buf.clear();
+      if (line.empty()) {
+        XLOG(ERROR) << "lineno: " << lineno << " empty. skipped.";
+        continue;
+      }
+      Split(line, buf, " ");
+      if (buf.size() != 2) {
+        XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " illegal. skipped.";
+        continue;
+      }
+      const double idf = atof(buf[1].c_str());
+      idfMap_[buf[0]] = idf;
+      idfSum += idf;
+      loaded++;
+    }
+
+    assert(loaded);
+    // Guard the division so a release build never divides by zero.
+    idfAverage_ = loaded ? idfSum / loaded : 0.0;
+    assert(idfAverage_ > 0.0);
+  }
+  // Inserts every line of `filePath` into stopWords_ verbatim.
+  void LoadStopWordDict(const string& filePath) {
+    ifstream ifs(filePath.c_str());
+    XCHECK(ifs.is_open()) << "open " << filePath << " failed";
+    string line ;
+    while (getline(ifs, line)) {
+      stopWords_.insert(line);
+    }
+    assert(stopWords_.size());
+  }
+
+  // Ordering used by partial_sort: heavier keywords first.
+  static bool Compare(const Word& lhs, const Word& rhs) {
+    return lhs.weight > rhs.weight;
+  }
+
+  MixSegment segment_;
+  unordered_map<string, double> idfMap_;
+  double idfAverage_;
+
+  unordered_set<string> stopWords_;
+}; // class KeywordExtractor
+
+// Streams a KeywordExtractor::Word as a JSON-like object with the keys
+// "word", "offset" and "weight". `word.offsets` is written through an
+// operator<< for vector<size_t> that is declared outside this header.
+inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) {
+ return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
+}
+
+} // namespace cppjieba
+
+#endif
+
+
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MPSegment.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MPSegment.hpp
new file mode 100644
index 0000000..524dcb0
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MPSegment.hpp
@@ -0,0 +1,144 @@
+#ifndef CPPJIEBA_MPSEGMENT_H
+#define CPPJIEBA_MPSEGMENT_H
+
+#include <algorithm>
+#include <set>
+#include <cassert>
+#include "limonp/Logging.hpp"
+#include "DictTrie.hpp"
+#include "SegmentTagged.hpp"
+#include "PosTagger.hpp"
+
+namespace cppjieba {
+
+// Max-probability segmenter: builds a DAG of dictionary matches for each
+// run of runes and picks the path with the highest summed weight.
+class MPSegment: public SegmentTagged {
+ public:
+  // Loads a dictionary from files; the created DictTrie is owned by this
+  // object and deleted in the destructor.
+  MPSegment(const string& dictPath, const string& userDictPath = "")
+    : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) {
+  }
+  // Borrows an existing dictionary; the caller keeps ownership.
+  MPSegment(const DictTrie* dictTrie)
+    : dictTrie_(dictTrie), isNeedDestroy_(false) {
+    assert(dictTrie_);
+  }
+  // Creates a segmenter without a dictionary; setRes() must be called before
+  // any Cut/Tag. Both members are initialised so the destructor never reads
+  // an indeterminate ownership flag.
+  MPSegment()
+    : dictTrie_(NULL), isNeedDestroy_(false) {
+  }
+  ~MPSegment() {
+    if (isNeedDestroy_) {
+      delete dictTrie_;
+    }
+  }
+  // Points this segmenter at a borrowed dictionary. A dictionary that this
+  // object previously owned is released first so it is not leaked.
+  void setRes(DictTrie *&dictTrie) {
+    if (isNeedDestroy_ && dictTrie_ != dictTrie) {
+      delete dictTrie_;
+    }
+    dictTrie_ = dictTrie;
+    isNeedDestroy_ = false;
+    assert(dictTrie_);
+  }
+  // Segments `sentence` into strings using the default maximum word length.
+  void Cut(const string& sentence, vector<string>& words) const {
+    Cut(sentence, words, MAX_WORD_LENGTH);
+  }
+
+  // Segments `sentence` into strings, considering dictionary words of at
+  // most `max_word_len` runes.
+  void Cut(const string& sentence,
+        vector<string>& words,
+        size_t max_word_len) const {
+    vector<Word> tmp;
+    Cut(sentence, tmp, max_word_len);
+    GetStringsFromWords(tmp, words);
+  }
+  // Segments `sentence` into Word records. `words` is cleared first. The
+  // sentence is split at separator runes by PreFilter and every piece is
+  // segmented independently.
+  void Cut(const string& sentence,
+        vector<Word>& words,
+        size_t max_word_len = MAX_WORD_LENGTH) const {
+    PreFilter pre_filter(symbols_, sentence);
+    PreFilter::Range range;
+    vector<WordRange> wrs;
+    wrs.reserve(sentence.size()/2);
+    while (pre_filter.HasNext()) {
+      range = pre_filter.Next();
+      Cut(range.begin, range.end, wrs, max_word_len);
+    }
+    words.clear();
+    words.reserve(wrs.size());
+    GetWordsFromWordRanges(sentence, wrs, words);
+  }
+  // Segments the rune range [begin, end) and appends the chosen word ranges
+  // to `words`.
+  void Cut(RuneStrArray::const_iterator begin,
+        RuneStrArray::const_iterator end,
+        vector<WordRange>& words,
+        size_t max_word_len = MAX_WORD_LENGTH) const {
+    assert(dictTrie_);
+    vector<Dag> dags;
+    dictTrie_->Find(begin,
+          end,
+          dags,
+          max_word_len);
+    CalcDP(dags);
+    CutByDag(begin, end, dags, words);
+  }
+
+  const DictTrie* GetDictTrie() const {
+    return dictTrie_;
+  }
+
+  // Tags `src` through the PosTagger, using this segmenter to cut it.
+  bool Tag(const string& src, vector<pair<string, string> >& res) const {
+    return tagger_.Tag(src, res, *this);
+  }
+
+  bool IsUserDictSingleChineseWord(const Rune& value) const {
+    return dictTrie_->IsUserDictSingleChineseWord(value);
+  }
+ private:
+  // Right-to-left dynamic programme: for every position keep the candidate
+  // whose own weight plus the best weight of the following position is
+  // largest. A candidate without a dictionary entry costs GetMinWeight().
+  void CalcDP(vector<Dag>& dags) const {
+    size_t nextPos;
+    const DictUnit* p;
+    double val;
+
+    for (vector<Dag>::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) {
+      rit->pInfo = NULL;
+      rit->weight = MIN_DOUBLE;
+      assert(!rit->nexts.empty());
+      for (LocalVector<pair<size_t, const DictUnit*> >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) {
+        nextPos = it->first;
+        p = it->second;
+        val = 0.0;
+        if (nextPos + 1 < dags.size()) {
+          val += dags[nextPos + 1].weight;
+        }
+
+        if (p) {
+          val += p->weight;
+        } else {
+          val += dictTrie_->GetMinWeight();
+        }
+        if (val > rit->weight) {
+          rit->pInfo = p;
+          rit->weight = val;
+        }
+      }
+    }
+  }
+  // Walks the DAG from the first position, emitting the word selected by
+  // CalcDP at each step (or a single rune when no dictionary word was chosen).
+  void CutByDag(RuneStrArray::const_iterator begin,
+        RuneStrArray::const_iterator end,
+        const vector<Dag>& dags,
+        vector<WordRange>& words) const {
+    size_t i = 0;
+    while (i < dags.size()) {
+      const DictUnit* p = dags[i].pInfo;
+      if (p) {
+        assert(p->word.size() >= 1);
+        WordRange wr(begin + i, begin + i + p->word.size() - 1);
+        words.push_back(wr);
+        i += p->word.size();
+      } else { //single chinese word
+        WordRange wr(begin + i, begin + i);
+        words.push_back(wr);
+        i++;
+      }
+    }
+  }
+
+  const DictTrie* dictTrie_;
+  bool isNeedDestroy_;   // true when dictTrie_ was allocated by this object
+  PosTagger tagger_;
+
+}; // class MPSegment
+
+} // namespace cppjieba
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MixSegment.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MixSegment.hpp
new file mode 100644
index 0000000..c058219
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MixSegment.hpp
@@ -0,0 +1,113 @@
+#ifndef CPPJIEBA_MIXSEGMENT_H
+#define CPPJIEBA_MIXSEGMENT_H
+
+#include <cassert>
+#include "MPSegment.hpp"
+#include "HMMSegment.hpp"
+#include "limonp/StringUtil.hpp"
+#include "PosTagger.hpp"
+
+namespace cppjieba {
+// Segmenter that runs the max-probability cut first and then re-cuts runs
+// of leftover single runes with the HMM segmenter.
+class MixSegment: public SegmentTagged {
+ public:
+  MixSegment(const string& mpSegDict, const string& hmmSegDict,
+        const string& userDict = "")
+    : mpSeg_(mpSegDict, userDict),
+      hmmSeg_(hmmSegDict) {
+  }
+  MixSegment(const DictTrie* dictTrie, const HMMModel* model)
+    : mpSeg_(dictTrie), hmmSeg_(model) {
+  }
+  MixSegment() {}
+  ~MixSegment() {
+  }
+  // Hands the shared dictionary and model to the two inner segmenters.
+  void setRes(DictTrie *&dictTrie, HMMModel *&model) {
+    mpSeg_.setRes(dictTrie);
+    hmmSeg_.setRes(model);
+  }
+  // Cuts `sentence` into strings with HMM post-processing enabled.
+  void Cut(const string& sentence, vector<string>& words) const {
+    Cut(sentence, words, true);
+  }
+  // Cuts `sentence` into strings; `hmm` selects HMM post-processing.
+  void Cut(const string& sentence, vector<string>& words, bool hmm) const {
+    vector<Word> asWords;
+    Cut(sentence, asWords, hmm);
+    GetStringsFromWords(asWords, words);
+  }
+  // Cuts `sentence` into Word records. `words` is cleared before filling.
+  void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
+    PreFilter pre_filter(symbols_, sentence);
+    vector<WordRange> ranges;
+    ranges.reserve(sentence.size() / 2);
+    while (pre_filter.HasNext()) {
+      const PreFilter::Range piece = pre_filter.Next();
+      Cut(piece.begin, piece.end, ranges, hmm);
+    }
+    words.clear();
+    words.reserve(ranges.size());
+    GetWordsFromWordRanges(sentence, ranges, words);
+  }
+
+  // Cuts the rune range [begin, end) and appends the result to `res`.
+  // Without `hmm` this is a plain max-probability cut. With `hmm`, every
+  // maximal run of single-rune results that are not user-dictionary words
+  // is handed to the HMM segmenter as one piece.
+  void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
+    if (!hmm) {
+      mpSeg_.Cut(begin, end, res);
+      return;
+    }
+    assert(end >= begin);
+    vector<WordRange> mpWords;
+    mpWords.reserve(end - begin);
+    mpSeg_.Cut(begin, end, mpWords);
+
+    vector<WordRange> hmmRes;
+    hmmRes.reserve(end - begin);
+    size_t i = 0;
+    while (i < mpWords.size()) {
+      // Multi-rune words and user-dictionary single runes are final.
+      if (!NeedsHmm(mpWords[i])) {
+        res.push_back(mpWords[i]);
+        ++i;
+        continue;
+      }
+
+      // mpWords[i] needs HMM; extend j to one past the end of the run.
+      size_t j = i + 1;
+      while (j < mpWords.size() && NeedsHmm(mpWords[j])) {
+        ++j;
+      }
+      assert(j > i);
+
+      // TODO
+      hmmSeg_.Cut(mpWords[i].left, mpWords[j - 1].left + 1, hmmRes);
+      res.insert(res.end(), hmmRes.begin(), hmmRes.end());
+      hmmRes.clear();
+
+      i = j;
+    }
+  }
+
+  const DictTrie* GetDictTrie() const {
+    return mpSeg_.GetDictTrie();
+  }
+
+  // Tags `src` through the PosTagger, using this segmenter to cut it.
+  bool Tag(const string& src, vector<pair<string, string> >& res) const {
+    return tagger_.Tag(src, res, *this);
+  }
+
+  // Returns the tag of the single word `str`.
+  string LookupTag(const string &str) const {
+    return tagger_.LookupTag(str, *this);
+  }
+
+ private:
+  // True for a one-rune range that is not a single-character user word.
+  bool NeedsHmm(const WordRange& wr) const {
+    return wr.left == wr.right && !mpSeg_.IsUserDictSingleChineseWord(wr.left->rune);
+  }
+
+  MPSegment mpSeg_;
+  HMMSegment hmmSeg_;
+  PosTagger tagger_;
+
+}; // class MixSegment
+
+} // namespace cppjieba
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PosTagger.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PosTagger.hpp
new file mode 100644
index 0000000..78853d5
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PosTagger.hpp
@@ -0,0 +1,77 @@
+#ifndef CPPJIEBA_POS_TAGGING_H
+#define CPPJIEBA_POS_TAGGING_H
+
+#include "limonp/StringUtil.hpp"
+#include "SegmentTagged.hpp"
+#include "DictTrie.hpp"
+
+namespace cppjieba {
+using namespace limonp;
+
+// Fallback tags returned by PosTagger::SpecialRule (and POS_X also when a
+// word cannot be decoded) for words that carry no tag in the dictionary.
+static const char* const POS_M = "m";
+static const char* const POS_ENG = "eng";
+static const char* const POS_X = "x";
+
+// Assigns part-of-speech tags to words, using the dictionary of the
+// segmenter passed in and a small ASCII heuristic as fallback.
+class PosTagger {
+ public:
+  PosTagger() {
+  }
+  ~PosTagger() {
+  }
+
+  // Cuts `src` with `segment` and appends one (word, tag) pair per word to
+  // `res`. Returns true when `res` is non-empty afterwards.
+  bool Tag(const string& src, vector<pair<string, string> >& res, const SegmentTagged& segment) const {
+    vector<string> pieces;
+    segment.Cut(src, pieces);
+
+    for (size_t i = 0; i < pieces.size(); ++i) {
+      res.push_back(make_pair(pieces[i], LookupTag(pieces[i], segment)));
+    }
+    return !res.empty();
+  }
+
+  // Returns the dictionary tag of `str`. Falls back to SpecialRule() when
+  // the word is unknown or has an empty tag, and to POS_X when `str` cannot
+  // be decoded into runes.
+  string LookupTag(const string &str, const SegmentTagged& segment) const {
+    RuneStrArray runes;
+    const DictTrie * dict = segment.GetDictTrie();
+    assert(dict != NULL);
+    if (!DecodeRunesInString(str, runes)) {
+      XLOG(ERROR) << "Decode failed.";
+      return POS_X;
+    }
+    const DictUnit *unit = dict->Find(runes.begin(), runes.end());
+    if (unit != NULL && !unit->tag.empty()) {
+      return unit->tag;
+    }
+    return SpecialRule(runes);
+  }
+
+ private:
+  // Heuristic for untagged words. Scans runes from the front, counting
+  // ASCII runes and ASCII digits, and stops once the ASCII count reaches
+  // half of the rune count. No ASCII seen -> POS_X; every counted ASCII
+  // rune was a digit -> POS_M; otherwise -> POS_ENG.
+  const char* SpecialRule(const RuneStrArray& unicode) const {
+    const size_t half = unicode.size() / 2;
+    size_t asciiCount = 0;
+    size_t digitCount = 0;
+    for (size_t i = 0; i < unicode.size() && asciiCount < half; i++) {
+      const Rune r = unicode[i].rune;
+      if (r >= 0x80) {
+        continue;
+      }
+      asciiCount++;
+      if ('0' <= r && r <= '9') {
+        digitCount++;
+      }
+    }
+    if (asciiCount == 0) {
+      return POS_X;
+    }
+    return digitCount == asciiCount ? POS_M : POS_ENG;
+  }
+
+}; // class PosTagger
+
+} // namespace cppjieba
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PreFilter.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PreFilter.hpp
new file mode 100644
index 0000000..ecb81c0
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PreFilter.hpp
@@ -0,0 +1,54 @@
+#ifndef CPPJIEBA_PRE_FILTER_H
+#define CPPJIEBA_PRE_FILTER_H
+
+#include "Trie.hpp"
+#include "limonp/Logging.hpp"
+
+namespace cppjieba {
+
+// Splits a sentence into consecutive rune ranges: every separator rune
+// forms a range of its own and the runes between separators form the
+// remaining ranges.
+class PreFilter {
+ public:
+  //TODO use WordRange instead of Range
+  // Half-open rune range [begin, end) inside the decoded sentence.
+  struct Range {
+    RuneStrArray::const_iterator begin;
+    RuneStrArray::const_iterator end;
+  }; // struct Range
+
+  // Decodes `sentence` into runes and positions the cursor on the first
+  // rune. A decode failure is logged; iteration then covers whatever
+  // DecodeRunesInString left in the buffer.
+  PreFilter(const unordered_set<Rune>& symbols,
+        const string& sentence)
+    : symbols_(symbols) {
+    if (!DecodeRunesInString(sentence, sentence_)) {
+      XLOG(ERROR) << "decode failed. ";
+    }
+    cursor_ = sentence_.begin();
+  }
+  ~PreFilter() {
+  }
+  // True while runes remain to be handed out.
+  bool HasNext() const {
+    return cursor_ != sentence_.end();
+  }
+  // Returns the next range and advances the cursor past it.
+  Range Next() {
+    const RuneStrArray::const_iterator last = sentence_.end();
+    Range range;
+    range.begin = cursor_;
+    if (cursor_ != last && IsIn(symbols_, cursor_->rune)) {
+      // A separator at the cursor is returned as a one-rune range.
+      ++cursor_;
+    } else {
+      // Otherwise consume runes up to the next separator or the end.
+      while (cursor_ != last && !IsIn(symbols_, cursor_->rune)) {
+        ++cursor_;
+      }
+    }
+    range.end = cursor_;
+    return range;
+  }
+ private:
+  RuneStrArray::const_iterator cursor_;
+  RuneStrArray sentence_;
+  const unordered_set<Rune>& symbols_;
+}; // class PreFilter
+
+} // namespace cppjieba
+
+#endif // CPPJIEBA_PRE_FILTER_H
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/QuerySegment.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/QuerySegment.hpp
new file mode 100644
index 0000000..8ba7a9f
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/QuerySegment.hpp
@@ -0,0 +1,95 @@
+#ifndef CPPJIEBA_QUERYSEGMENT_H
+#define CPPJIEBA_QUERYSEGMENT_H
+
+#include <algorithm>
+#include <set>
+#include <cassert>
+#include "limonp/Logging.hpp"
+#include "DictTrie.hpp"
+#include "SegmentBase.hpp"
+#include "FullSegment.hpp"
+#include "MixSegment.hpp"
+#include "Unicode.hpp"
+
+namespace cppjieba {
+// Search-oriented segmenter: runs the mixed cut and, for longer words,
+// additionally emits the 2-rune and 3-rune sub-words found in the dictionary.
+class QuerySegment: public SegmentBase {
+ public:
+  QuerySegment(const string& dict, const string& model, const string& userDict = "")
+    : mixSeg_(dict, model, userDict),
+      trie_(mixSeg_.GetDictTrie()) {
+  }
+  QuerySegment(const DictTrie* dictTrie, const HMMModel* model)
+    : mixSeg_(dictTrie, model), trie_(dictTrie) {
+  }
+  QuerySegment() {
+    trie_ = NULL;
+  }
+  ~QuerySegment() {
+  }
+  // Hands the shared dictionary and model to the inner segmenter and keeps
+  // the dictionary pointer for sub-word lookups.
+  void setRes(DictTrie *&dictTrie, HMMModel *&model) {
+    mixSeg_.setRes(dictTrie, model);
+    trie_ = dictTrie;
+  }
+  // Cuts `sentence` into strings with HMM enabled.
+  void Cut(const string& sentence, vector<string>& words) const {
+    Cut(sentence, words, true);
+  }
+  // Cuts `sentence` into strings; `hmm` is passed on to the mixed segmenter.
+  void Cut(const string& sentence, vector<string>& words, bool hmm) const {
+    vector<Word> asWords;
+    Cut(sentence, asWords, hmm);
+    GetStringsFromWords(asWords, words);
+  }
+  // Cuts `sentence` into Word records. `words` is cleared before filling.
+  void Cut(const string& sentence, vector<Word>& words, bool hmm = true) const {
+    PreFilter pre_filter(symbols_, sentence);
+    vector<WordRange> ranges;
+    ranges.reserve(sentence.size()/2);
+    while (pre_filter.HasNext()) {
+      const PreFilter::Range piece = pre_filter.Next();
+      Cut(piece.begin, piece.end, ranges, hmm);
+    }
+    words.clear();
+    words.reserve(ranges.size());
+    GetWordsFromWordRanges(sentence, ranges, words);
+  }
+  // Cuts [begin, end) with the mixed segmenter and appends to `res`, for
+  // each resulting word: its dictionary 2-rune sub-words (word longer than
+  // 2 runes), its dictionary 3-rune sub-words (word longer than 3 runes),
+  // and finally the word itself.
+  void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
+    vector<WordRange> mixRes;
+    mixSeg_.Cut(begin, end, mixRes, hmm);
+
+    for (size_t k = 0; k < mixRes.size(); k++) {
+      const WordRange& whole = mixRes[k];
+      if (whole.Length() > 2) {
+        AppendDictSubWords(whole, 2, res);
+      }
+      if (whole.Length() > 3) {
+        AppendDictSubWords(whole, 3, res);
+      }
+      res.push_back(whole);
+    }
+  }
+ private:
+  // Appends every window of `runes` consecutive runes of `whole` that the
+  // dictionary contains, in left-to-right order.
+  void AppendDictSubWords(const WordRange& whole, size_t runes, vector<WordRange>& res) const {
+    const size_t span = runes - 1;  // WordRange is inclusive on both ends
+    for (size_t i = 0; i + span < whole.Length(); i++) {
+      WordRange wr(whole.left + i, whole.left + i + span);
+      if (trie_->Find(wr.left, wr.right + 1) != NULL) {
+        res.push_back(wr);
+      }
+    }
+  }
+  // True when every rune of `s` is below 0x80.
+  bool IsAllAscii(const Unicode& s) const {
+    for(size_t i = 0; i < s.size(); i++) {
+      if (s[i] >= 0x80) {
+        return false;
+      }
+    }
+    return true;
+  }
+  MixSegment mixSeg_;
+  const DictTrie* trie_;
+}; // QuerySegment
+
+} // namespace cppjieba
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentBase.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentBase.hpp
new file mode 100644
index 0000000..79c8009
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentBase.hpp
@@ -0,0 +1,46 @@
+#ifndef CPPJIEBA_SEGMENTBASE_H
+#define CPPJIEBA_SEGMENTBASE_H
+
+#include "limonp/Logging.hpp"
+#include "PreFilter.hpp"
+#include <cassert>
+
+
+namespace cppjieba {
+
+// Default separator runes installed by SegmentBase's constructor: space, tab,
+// newline, and the UTF-8 encodings of U+FF0C (fullwidth comma, EF BC 8C) and
+// U+3002 (ideographic full stop, E3 80 82).
+const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82";
+
+using namespace limonp;
+
+// Common base of all segmenters: owns the set of separator runes that
+// PreFilter uses to split a sentence before segmentation.
+class SegmentBase {
+ public:
+  // Installs the default separators (SPECIAL_SEPARATORS).
+  SegmentBase() {
+    XCHECK(ResetSeparators(SPECIAL_SEPARATORS));
+  }
+  virtual ~SegmentBase() {
+  }
+
+  // Segments `sentence` into `words`; implemented by each concrete segmenter.
+  virtual void Cut(const string& sentence, vector<string>& words) const = 0;
+
+  // Replaces the separator set with the runes of `s`.
+  // Returns false, after logging, when `s` cannot be decoded or contains
+  // the same rune twice. The new set is assembled in a temporary and only
+  // swapped in on success, so a failed call leaves the previously active
+  // separators in place instead of an empty or half-filled set.
+  bool ResetSeparators(const string& s) {
+    RuneStrArray runes;
+    if (!DecodeRunesInString(s, runes)) {
+      XLOG(ERROR) << "decode " << s << " failed";
+      return false;
+    }
+    unordered_set<Rune> symbols;
+    for (size_t i = 0; i < runes.size(); i++) {
+      if (!symbols.insert(runes[i].rune).second) {
+        XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists";
+        return false;
+      }
+    }
+    symbols_.swap(symbols);
+    return true;
+  }
+ protected:
+  unordered_set<Rune> symbols_;
+}; // class SegmentBase
+
+} // cppjieba
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentTagged.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentTagged.hpp
new file mode 100644
index 0000000..4d99a31
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentTagged.hpp
@@ -0,0 +1,23 @@
+#ifndef CPPJIEBA_SEGMENTTAGGED_H
+#define CPPJIEBA_SEGMENTTAGGED_H
+
+#include "SegmentBase.hpp"
+
+namespace cppjieba {
+
+// Interface for segmenters that can also tag words: adds Tag() and access
+// to the dictionary trie on top of SegmentBase.
+class SegmentTagged : public SegmentBase{
+ public:
+ SegmentTagged() {
+ }
+ virtual ~SegmentTagged() {
+ }
+
+ // Produces (word, tag) pairs for `src` in `res`; the meaning of the bool
+ // result is defined by the implementing class.
+ virtual bool Tag(const string& src, vector<pair<string, string> >& res) const = 0;
+
+ // Returns the dictionary the implementation segments with.
+ virtual const DictTrie* GetDictTrie() const = 0;
+
+}; // class SegmentTagged
+
+} // cppjieba
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/TextRankExtractor.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/TextRankExtractor.hpp
new file mode 100644
index 0000000..292d0a8
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/TextRankExtractor.hpp
@@ -0,0 +1,190 @@
+#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H
+#define CPPJIEBA_TEXTRANK_EXTRACTOR_H
+
+#include <cmath>
+#include "Jieba.hpp"
+
+namespace cppjieba {
+ using namespace limonp;
+ using namespace std;
+
+ // Keyword extractor based on the TextRank algorithm: words that co-occur
+ // within a sliding window are linked in an undirected weighted graph and
+ // ranked iteratively.
+ class TextRankExtractor {
+  public:
+   // One keyword: its text, the byte offsets of its occurrences and its rank.
+   typedef struct _Word {string word;vector<size_t> offsets;double weight;} Word; // struct Word
+  private:
+   typedef std::map<string,Word> WordMap;
+
+   // Undirected, weighted co-occurrence graph over words.
+   class WordGraph{
+    private:
+     typedef double Score;
+     typedef string Node;
+     typedef std::set<Node> NodeSet;
+
+     typedef std::map<Node,double> Edges;
+     typedef std::map<Node,Edges> Graph;
+     //typedef std::unordered_map<Node,double> Edges;
+     //typedef std::unordered_map<Node,Edges> Graph;
+
+     double d;          // damping factor of the rank iteration
+     Graph graph;       // adjacency map: node -> (neighbour -> edge weight)
+     NodeSet nodeSet;   // every node that appears in at least one edge
+    public:
+     WordGraph(): d(0.85) {}
+     WordGraph(double in_d): d(in_d) {}
+
+     // Adds `weight` to the edge between `start` and `end` in both directions.
+     void addEdge(const Node& start, const Node& end, double weight){
+       nodeSet.insert(start);
+       nodeSet.insert(end);
+       graph[start][end]+=weight;
+       graph[end][start]+=weight;
+     }
+
+     // Runs `rankTime` rank iterations and writes the scores into `ws`.
+     // Afterwards all weights in `ws` are rescaled with
+     // (w - min/10) / (max - min/10). Does nothing when the graph is empty.
+     void rank(WordMap &ws,size_t rankTime=10){
+       if( graph.size() == 0)
+         return;
+
+       std::map<Node,Score> outSum;   // summed weight of each node's edges
+       const Score wsdef = 1.0 / graph.size();
+
+       for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){
+         // edges->first: start node; edge->first: end node; edge->second: weight
+         Word& entry = ws[edges->first];
+         entry.word=edges->first;
+         entry.weight=wsdef;
+         Score& total = outSum[edges->first];
+         total=0;
+         for(Edges::iterator edge=edges->second.begin();edge!=edges->second.end();++edge){
+           total+=edge->second;
+         }
+       }
+       // nodeSet is a std::set, so nodes are already visited in sorted order.
+       for( size_t i=0; i<rankTime; i++ ){
+         for(NodeSet::iterator node = nodeSet.begin(); node != nodeSet.end(); ++node ){
+           double s = 0;
+           Edges& neighbours = graph[*node];
+           for( Edges::iterator edge= neighbours.begin(); edge != neighbours.end(); ++edge )
+             // edge->first: end node; edge->second: weight
+             s += edge->second / outSum[edge->first] * ws[edge->first].weight;
+           ws[*node].weight = (1 - d) + d * s;
+         }
+       }
+
+       Score min_rank, max_rank;
+       min_rank=max_rank=ws.begin()->second.weight;
+       for(WordMap::iterator i = ws.begin(); i != ws.end(); ++i){
+         if( i->second.weight < min_rank ){
+           min_rank = i->second.weight;
+         }
+         if( i->second.weight > max_rank ){
+           max_rank = i->second.weight;
+         }
+       }
+       for(WordMap::iterator i = ws.begin(); i != ws.end(); ++i){
+         i->second.weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0);
+       }
+     }
+   };
+
+  public:
+   // Builds a private segmenter from the dictionary files and loads the
+   // stop-word list.
+   TextRankExtractor(const string& dictPath,
+         const string& hmmFilePath,
+         const string& stopWordPath,
+         const string& userDict = "")
+     : segment_(dictPath, hmmFilePath, userDict) {
+     LoadStopWordDict(stopWordPath);
+   }
+   // Uses an existing dictionary trie and HMM model.
+   TextRankExtractor(const DictTrie* dictTrie,
+         const HMMModel* model,
+         const string& stopWordPath)
+     : segment_(dictTrie, model) {
+     LoadStopWordDict(stopWordPath);
+   }
+   // Shares the dictionary and model of an existing Jieba instance.
+   TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) {
+     LoadStopWordDict(stopWordPath);
+   }
+   ~TextRankExtractor() {
+   }
+
+   // Appends the text of the top `topN` keywords to `keywords`
+   // (existing content of `keywords` is kept).
+   void Extract(const string& sentence, vector<string>& keywords, size_t topN) const {
+     vector<Word> topWords;
+     Extract(sentence, topWords, topN);
+     for (size_t i = 0; i < topWords.size(); i++) {
+       keywords.push_back(topWords[i].word);
+     }
+   }
+
+   // Appends (keyword, weight) pairs for the top `topN` keywords to
+   // `keywords` (existing content of `keywords` is kept).
+   void Extract(const string& sentence, vector<pair<string, double> >& keywords, size_t topN) const {
+     vector<Word> topWords;
+     Extract(sentence, topWords, topN);
+     for (size_t i = 0; i < topWords.size(); i++) {
+       keywords.push_back(pair<string, double>(topWords[i].word, topWords[i].weight));
+     }
+   }
+
+   // Core extraction. Segments `sentence`; every word that is neither a
+   // single word (IsSingleWord) nor a stop word is linked to the following
+   // eligible words inside a window of `span` positions (skipped words
+   // extend the window). The graph is ranked for `rankTime` iterations and
+   // the `topN` heaviest words are left in `keywords`, sorted by descending
+   // weight. If the segmented pieces do not add up to the sentence length
+   // an error is logged and `keywords` is left untouched.
+   void Extract(const string& sentence, vector<Word>& keywords, size_t topN, size_t span=5,size_t rankTime=10) const {
+     vector<string> words;
+     segment_.Cut(sentence, words);
+
+     TextRankExtractor::WordGraph graph;
+     WordMap wordmap;
+     size_t offset = 0;
+
+     for(size_t i=0; i < words.size(); i++){
+       size_t t = offset;
+       offset += words[i].size();
+       if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) {
+         continue;
+       }
+       for(size_t j=i+1,skip=0;j<i+span+skip && j<words.size();j++){
+         if (IsSingleWord(words[j]) || stopWords_.find(words[j]) != stopWords_.end()) {
+           skip++;
+           continue;
+         }
+         graph.addEdge(words[i],words[j],1);
+       }
+       wordmap[words[i]].offsets.push_back(t);
+     }
+     if (offset != sentence.size()) {
+       XLOG(ERROR) << "words illegal";
+       return;
+     }
+
+     graph.rank(wordmap,rankTime);
+
+     keywords.clear();
+     keywords.reserve(wordmap.size());
+     for (WordMap::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) {
+       keywords.push_back(itr->second);
+     }
+
+     topN = min(topN, keywords.size());
+     partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare);
+     keywords.resize(topN);
+   }
+  private:
+   // Inserts every line of `filePath` into stopWords_ verbatim.
+   void LoadStopWordDict(const string& filePath) {
+     ifstream ifs(filePath.c_str());
+     XCHECK(ifs.is_open()) << "open " << filePath << " failed";
+     string line ;
+     while (getline(ifs, line)) {
+       stopWords_.insert(line);
+     }
+     assert(stopWords_.size());
+   }
+
+   // Ordering used by partial_sort: heavier keywords first.
+   static bool Compare(const Word &x,const Word &y){
+     return x.weight > y.weight;
+   }
+
+   MixSegment segment_;
+   unordered_set<string> stopWords_;
+ }; // class TextRankExtractor
+
+ // Streams a TextRankExtractor::Word as a JSON-like object with the keys
+ // "word", "offset" and "weight". `word.offsets` is written through an
+ // operator<< for vector<size_t> that is declared outside this header.
+ inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) {
+ return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}";
+ }
+} // namespace cppjieba
+
+#endif
+
+
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Trie.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Trie.hpp
new file mode 100644
index 0000000..e6f71b1
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Trie.hpp
@@ -0,0 +1,200 @@
+#ifndef CPPJIEBA_TRIE_HPP
+#define CPPJIEBA_TRIE_HPP
+
+#include <vector>
+#include <queue>
+#include "limonp/StdExtension.hpp"
+#include "Unicode.hpp"
+
+namespace cppjieba {
+
+using namespace std;
+
+// Default cap, in runes, on the length of a dictionary match; used as the
+// default `max_word_len` argument of Trie::Find and MPSegment::Cut.
+const size_t MAX_WORD_LENGTH = 512;
+
+// One dictionary entry as stored in the trie.
+struct DictUnit {
+ Unicode word; // the entry's runes
+ double weight; // score added by MPSegment::CalcDP when this entry is chosen
+ string tag; // tag returned by PosTagger::LookupTag when non-empty
+}; // struct DictUnit
+
+// for debugging
+// inline ostream & operator << (ostream& os, const DictUnit& unit) {
+// string s;
+// s << unit.word;
+// return os << StringFormat("%s %s %.3lf", s.c_str(), unit.tag.c_str(), unit.weight);
+// }
+
+// Per-position record of the word graph built by Trie::Find and evaluated
+// by MPSegment::CalcDP.
+struct Dag {
+ RuneStr runestr; // the rune at this position
+ // [offset, nexts.first]
+ // Each pair is (index of the last rune of a candidate starting here,
+ // dictionary entry for that candidate, or NULL when there is none).
+ limonp::LocalVector<pair<size_t, const DictUnit*> > nexts;
+ const DictUnit * pInfo; // candidate selected by CalcDP (NULL = single rune)
+ double weight; // best path weight from this position, set by CalcDP
+ size_t nextPos; // TODO
+ Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) {
+ }
+}; // struct Dag
+
+typedef Rune TrieKey;
+
+// Node of the dictionary trie. Both pointers start out NULL.
+class TrieNode {
+ public :
+ TrieNode(): next(NULL), ptValue(NULL) {
+ }
+ public:
+ typedef unordered_map<TrieKey, TrieNode*> NextMap;
+ NextMap *next; // child nodes keyed by rune; allocated on first insert below this node
+ const DictUnit *ptValue; // entry ending at this node, or NULL; not owned by the node
+};
+
+// Rune-keyed trie mapping dictionary words to their DictUnit. The trie owns
+// its nodes but not the DictUnit objects the nodes point to.
+class Trie {
+ public:
+  // Builds the trie from parallel arrays of keys and value pointers.
+  Trie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers)
+    : root_(new TrieNode) {
+    CreateTrie(keys, valuePointers);
+  }
+  ~Trie() {
+    DeleteNode(root_);
+  }
+
+  // Exact lookup of the rune sequence [begin, end). Returns the stored
+  // entry, or NULL when the range is empty, the path does not exist, or
+  // the final node carries no value.
+  const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const {
+    if (begin == end) {
+      return NULL;
+    }
+
+    const TrieNode* ptNode = root_;
+    TrieNode::NextMap::const_iterator citer;
+    for (RuneStrArray::const_iterator it = begin; it != end; it++) {
+      if (NULL == ptNode->next) {
+        return NULL;
+      }
+      citer = ptNode->next->find(it->rune);
+      if (ptNode->next->end() == citer) {
+        return NULL;
+      }
+      ptNode = citer->second;
+    }
+    return ptNode->ptValue;
+  }
+
+  // Fills `res` with one Dag per rune of [begin, end). For position i the
+  // first candidate is always the single rune itself (with its entry, or
+  // NULL), followed by every longer dictionary word that starts at i and
+  // spans at most `max_word_len` runes.
+  void Find(RuneStrArray::const_iterator begin,
+        RuneStrArray::const_iterator end,
+        vector<struct Dag>&res,
+        size_t max_word_len = MAX_WORD_LENGTH) const {
+    assert(root_ != NULL);
+    res.resize(end - begin);
+
+    const TrieNode *ptNode = NULL;
+    TrieNode::NextMap::const_iterator citer;
+    for (size_t i = 0; i < size_t(end - begin); i++) {
+      res[i].runestr = *(begin + i);
+
+      if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) {
+        ptNode = citer->second;
+      } else {
+        ptNode = NULL;
+      }
+      if (ptNode != NULL) {
+        res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, ptNode->ptValue));
+      } else {
+        res[i].nexts.push_back(pair<size_t, const DictUnit*>(i, static_cast<const DictUnit*>(NULL)));
+      }
+
+      for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) {
+        if (ptNode == NULL || ptNode->next == NULL) {
+          break;
+        }
+        citer = ptNode->next->find((begin + j)->rune);
+        if (ptNode->next->end() == citer) {
+          break;
+        }
+        ptNode = citer->second;
+        if (NULL != ptNode->ptValue) {
+          res[i].nexts.push_back(pair<size_t, const DictUnit*>(j, ptNode->ptValue));
+        }
+      }
+    }
+  }
+
+  // Inserts `key`, creating missing nodes, and stores `ptValue` on the
+  // final node (replacing any previous value). Empty keys are ignored.
+  void InsertNode(const Unicode& key, const DictUnit* ptValue) {
+    if (key.begin() == key.end()) {
+      return;
+    }
+
+    TrieNode::NextMap::const_iterator kmIter;
+    TrieNode *ptNode = root_;
+    for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
+      if (NULL == ptNode->next) {
+        ptNode->next = new TrieNode::NextMap;
+      }
+      kmIter = ptNode->next->find(*citer);
+      if (ptNode->next->end() == kmIter) {
+        TrieNode *nextNode = new TrieNode;
+
+        ptNode->next->insert(make_pair(*citer, nextNode));
+        ptNode = nextNode;
+      } else {
+        ptNode = kmIter->second;
+      }
+    }
+    assert(ptNode != NULL);
+    ptNode->ptValue = ptValue;
+  }
+  // Removes the word `key` from the trie. Only the value on the key's final
+  // node is cleared; other words that share a prefix with `key`, or that
+  // extend it, stay reachable. Nodes left without value and without
+  // children are then pruned bottom-up. Does nothing when `key` is empty or
+  // its path does not exist. `ptValue` is accepted for interface
+  // compatibility and is not consulted.
+  void DeleteNode(const Unicode& key, const DictUnit* ptValue) {
+    (void)ptValue;
+    if (key.begin() == key.end()) {
+      return;
+    }
+    // Walk down to the key's final node, remembering the path so that the
+    // parents are known when pruning. path[d] is the node reached after
+    // consuming d runes; path[0] is the root.
+    vector<TrieNode*> path;
+    path.reserve(key.size() + 1);
+    TrieNode *ptNode = root_;
+    path.push_back(ptNode);
+    for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) {
+      if (NULL == ptNode->next) {
+        return;
+      }
+      TrieNode::NextMap::const_iterator kmIter = ptNode->next->find(*citer);
+      if (ptNode->next->end() == kmIter) {
+        return;
+      }
+      ptNode = kmIter->second;
+      path.push_back(ptNode);
+    }
+    ptNode->ptValue = NULL;
+
+    // Prune from the leaf upwards; stop at the first node still in use.
+    for (size_t depth = path.size() - 1; depth > 0; --depth) {
+      TrieNode *node = path[depth];
+      if (node->ptValue != NULL || (node->next != NULL && !node->next->empty())) {
+        break;
+      }
+      path[depth - 1]->next->erase(key[depth - 1]);
+      delete node->next;
+      delete node;
+    }
+  }
+ private:
+  // Inserts every (key, value) pair; does nothing if either array is empty.
+  void CreateTrie(const vector<Unicode>& keys, const vector<const DictUnit*>& valuePointers) {
+    if (valuePointers.empty() || keys.empty()) {
+      return;
+    }
+    assert(keys.size() == valuePointers.size());
+
+    for (size_t i = 0; i < keys.size(); i++) {
+      InsertNode(keys[i], valuePointers[i]);
+    }
+  }
+
+  // Recursively frees `node`, its child map and all descendants.
+  void DeleteNode(TrieNode* node) {
+    if (NULL == node) {
+      return;
+    }
+    if (NULL != node->next) {
+      for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) {
+        DeleteNode(it->second);
+      }
+      delete node->next;
+    }
+    delete node;
+  }
+
+  TrieNode* root_;
+}; // class Trie
+} // namespace cppjieba
+
+#endif // CPPJIEBA_TRIE_HPP
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Unicode.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Unicode.hpp
new file mode 100644
index 0000000..7f06456
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Unicode.hpp
@@ -0,0 +1,227 @@
+#ifndef CPPJIEBA_UNICODE_H
+#define CPPJIEBA_UNICODE_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string>
+#include <vector>
+#include <ostream>
+#include "limonp/LocalVector.hpp"
+
+namespace cppjieba {
+
+using std::string;
+using std::vector;
+
+typedef uint32_t Rune;
+
+// A segmented token: its UTF-8 text plus where it sits in the source string.
+struct Word {
+  string word;              // UTF-8 bytes of the token
+  uint32_t offset;          // offset supplied by the creator (a byte offset in GetWordFromRunes)
+  uint32_t unicode_offset;  // offset counted in runes; 0 when not supplied
+  uint32_t unicode_length;  // length counted in runes; 0 when not supplied
+  // Two-argument form: zero the rune fields instead of leaving them indeterminate.
+  Word(const string& w, uint32_t o) : word(w), offset(o), unicode_offset(0), unicode_length(0) {}
+  Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length)
+    : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) {
+  }
+}; // struct Word
+
+inline std::ostream& operator << (std::ostream& os, const Word& w) { // prints only word and offset, in a JSON-like layout
+ return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}"; // w.word is written as-is, without escaping
+}
+
+// One decoded code point together with its position in the source string.
+struct RuneStr {
+  Rune rune;                // decoded code point
+  uint32_t offset;          // byte offset of the rune in the source string
+  uint32_t len;             // number of bytes the rune occupies
+  uint32_t unicode_offset;  // position counted in runes
+  uint32_t unicode_length;  // length counted in runes
+  // All fields zero.
+  RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) {}
+  // Byte position only; the rune-based fields are zeroed.
+  RuneStr(Rune r, uint32_t o, uint32_t l)
+    : rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) {}
+  RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length)
+    : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) {}
+}; // struct RuneStr
+
+inline std::ostream& operator << (std::ostream& os, const RuneStr& r) { // prints rune, offset and len; the unicode_* fields are omitted
+ return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}"; // r.rune is streamed as a number (Rune is uint32_t)
+}
+
+typedef limonp::LocalVector<Rune> Unicode;
+typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
+
+// Closed interval [left, right] of runes; `right` refers to the last rune itself.
+struct WordRange {
+  RuneStrArray::const_iterator left;
+  RuneStrArray::const_iterator right;
+  WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r)
+    : left(l), right(r) {}
+  // Number of runes covered, counting both ends.
+  size_t Length() const {
+    return (right - left) + 1;
+  }
+  // True when every rune in the interval is below 0x80.
+  bool IsAllAscii() const {
+    RuneStrArray::const_iterator cursor = left;
+    while (cursor <= right && cursor->rune < 0x80) {
+      ++cursor;
+    }
+    return cursor > right;
+  }
+}; // struct WordRange
+
+// Result of decoding one UTF-8 sequence: the code point and its byte length.
+struct RuneStrLite {
+  uint32_t rune;
+  uint32_t len;  // 0 means "could not decode"
+  RuneStrLite(): rune(0), len(0) {
+  }
+  RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) {
+  }
+}; // struct RuneStrLite
+
+// Decodes the single UTF-8 sequence that begins at str[0].
+//
+// @param str  start of the sequence; may be NULL
+// @param len  number of readable bytes at `str`
+// @return     {rune, byte length} on success, {0, 0} when nothing can be decoded:
+//             NULL/empty input, a sequence cut short by `len`, a lead byte that is
+//             a stray continuation byte (0x80-0xbf) or 0xf8 and above, or a trailing
+//             byte that is not of the form 10xxxxxx.
+//
+// The previous version classified the lead byte only by its upper bound, so a
+// continuation byte in lead position was decoded as a two-byte sequence, and
+// trailing bytes were never checked. Well-formed input decodes exactly as before.
+// Overlong forms and surrogate code points are still not rejected.
+inline RuneStrLite DecodeRuneInString(const char* str, size_t len) {
+  RuneStrLite rp(0, 0);
+  if (str == NULL || len == 0) {
+    return rp;
+  }
+  const uint8_t lead = (uint8_t)(str[0]);
+  uint32_t total = 0;  // bytes in the whole sequence, lead byte included
+  uint32_t rune = 0;   // payload bits gathered so far
+  if (lead < 0x80) {           // 0xxxxxxx: 7 payload bits
+    total = 1;
+    rune = lead;
+  } else if (lead < 0xc0) {    // 10xxxxxx: continuation byte, cannot lead
+    return rp;
+  } else if (lead <= 0xdf) {   // 110xxxxx: 5 + 6 = 11 payload bits
+    total = 2;
+    rune = lead & 0x1f;
+  } else if (lead <= 0xef) {   // 1110xxxx: 4 + 6 + 6 = 16 payload bits
+    total = 3;
+    rune = lead & 0x0f;
+  } else if (lead <= 0xf7) {   // 11110xxx: 3 + 6 + 6 + 6 = 21 payload bits
+    total = 4;
+    rune = lead & 0x07;
+  } else {                     // 0xf8 and above never start a sequence
+    return rp;
+  }
+  if (total > len) {
+    return rp;  // sequence is truncated
+  }
+  // Fold in the trailing bytes, six payload bits each.
+  for (uint32_t k = 1; k < total; ++k) {
+    const uint8_t tail = (uint8_t)(str[k]);
+    if ((tail & 0xc0) != 0x80) {
+      return rp;  // not a continuation byte
+    }
+    rune = (rune << 6) | (tail & 0x3f);
+  }
+  rp.rune = rune;
+  rp.len = total;
+  return rp;
+}
+
+// Decodes all of s[0, len) into `runes`, recording each rune's byte offset and
+// rune index. On any undecodable sequence `runes` is emptied and false returned.
+inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) {
+  runes.clear();
+  runes.reserve(len / 2);
+  uint32_t byte_pos = 0;
+  uint32_t rune_idx = 0;
+  while (byte_pos < len) {
+    const RuneStrLite one = DecodeRuneInString(s + byte_pos, len - byte_pos);
+    if (one.len == 0) { runes.clear(); return false; }
+    runes.push_back(RuneStr(one.rune, byte_pos, one.len, rune_idx, 1));
+    byte_pos += one.len;
+    ++rune_idx;
+  }
+  return true;
+}
+
+inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { // std::string convenience overload
+ return DecodeRunesInString(s.c_str(), s.size(), runes); // s.size() is passed, so embedded NUL bytes are part of the input
+}
+
+// Like the RuneStrArray overload but keeps only the code points. `unicode` is
+// always cleared first and stays empty when decoding fails.
+inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) {
+  unicode.clear();
+  RuneStrArray decoded;
+  const bool ok = DecodeRunesInString(s, len, decoded);
+  if (ok) {
+    unicode.reserve(decoded.size());
+    for (RuneStrArray::const_iterator it = decoded.begin(); it != decoded.end(); ++it) unicode.push_back(it->rune);
+  }
+  return ok;
+}
+
+inline bool IsSingleWord(const string& str) { // true when the first decoded rune spans the whole string
+ RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size());
+ return rp.len == str.size(); // also true for "": the decoder reports len 0 and size() is 0
+}
+
+inline bool DecodeRunesInString(const string& s, Unicode& unicode) { // std::string convenience overload
+ return DecodeRunesInString(s.c_str(), s.size(), unicode);
+}
+
+inline Unicode DecodeRunesInString(const string& s) { // by-value convenience form
+ Unicode result;
+ DecodeRunesInString(s, result); // the success flag is dropped; result is empty when decoding fails
+ return result;
+}
+
+
+// Builds the Word covering runes [left, right] (both inclusive) of `s`.
+inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
+  assert(left->offset <= right->offset);
+  const uint32_t byte_begin = left->offset;
+  const uint32_t byte_count = right->offset + right->len - byte_begin;
+  return Word(s.substr(byte_begin, byte_count), byte_begin, left->unicode_offset, right->unicode_offset - left->unicode_offset + right->unicode_length);
+}
+
+inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) {
+  // Bytes from the start of *left through the last byte of *right, both inclusive.
+  assert(left->offset <= right->offset);
+  return s.substr(left->offset, right->offset + right->len - left->offset);
+}
+
+inline void GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs, vector<Word>& words) { // appends; `words` is not cleared first
+  for (vector<WordRange>::const_iterator wr = wrs.begin(); wr != wrs.end(); ++wr) {
+    words.push_back(GetWordFromRunes(s, wr->left, wr->right));
+  }
+}
+
+inline vector<Word> GetWordsFromWordRanges(const string& s, const vector<WordRange>& wrs) { // by-value convenience form
+ vector<Word> result;
+ GetWordsFromWordRanges(s, wrs, result); // fills the fresh vector through the out-parameter overload
+ return result;
+}
+
+inline void GetStringsFromWords(const vector<Word>& words, vector<string>& strs) {
+  // Overwrites `strs` so that strs[i] is the text of words[i].
+  strs.resize(words.size());
+  vector<string>::iterator dst = strs.begin();
+  for (vector<Word>::const_iterator src = words.begin(); src != words.end(); ++src, ++dst) *dst = src->word;
+}
+
+} // namespace cppjieba
+
+#endif // CPPJIEBA_UNICODE_H
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ArgvContext.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ArgvContext.hpp
new file mode 100644
index 0000000..ba3abe0
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ArgvContext.hpp
@@ -0,0 +1,70 @@
+/************************************
+ * file enc : ascii
+ * author : wuyanyi09@gmail.com
+ ************************************/
+
+#ifndef LIMONP_ARGV_FUNCTS_H
+#define LIMONP_ARGV_FUNCTS_H
+
+#include <set>
+#include <sstream>
+#include "StringUtil.hpp"
+
+namespace limonp {
+
+using namespace std;
+
+// Sorts a command line into positional arguments, "-key value" options and
+// bare "-flag" switches.
+class ArgvContext {
+ public:
+  // A token for which StartsWith(token, "-") holds is an option name. It takes
+  // the next token as its value unless that token is missing or also passes the
+  // StartsWith test, in which case the name is kept as a switch. Every other
+  // token is positional; the scan starts at index 0, so argv[0] is included.
+  ArgvContext(int argc, const char* const * argv) {
+    int i = 0;
+    while (i < argc) {
+      const char* const token = argv[i];
+      if (!StartsWith(token, "-")) {
+        args_.push_back(token);
+      } else if (i + 1 < argc && !StartsWith(argv[i + 1], "-")) {
+        mpss_[token] = argv[++i];
+      } else {
+        sset_.insert(token);
+      }
+      ++i;
+    }
+  }
+  ~ArgvContext() {}
+
+  friend ostream& operator << (ostream& os, const ArgvContext& args);
+
+  // Positional argument number `i`, or "" when out of range.
+  string operator [](size_t i) const {
+    return i < args_.size() ? args_[i] : string("");
+  }
+  // Value given for option `key`, or "" when there is none.
+  string operator [](const string& key) const {
+    const map<string, string>::const_iterator hit = mpss_.find(key);
+    return hit != mpss_.end() ? hit->second : string("");
+  }
+
+  // True when `key` was seen as an option with a value or as a switch.
+  bool HasKey(const string& key) const {
+    return mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end();
+  }
+
+ private:
+  vector<string> args_;       // positional arguments, in order
+  map<string, string> mpss_;  // option name -> value
+  set<string> sset_;          // switches given without a value
+}; // class ArgvContext
+
+inline ostream& operator << (ostream& os, const ArgvContext& args) { // dumps positional args, then options, then switches
+ return os<<args.args_<<args.mpss_<<args.sset_; // relies on operator<< overloads for vector/map/set that are declared outside this header
+}
+
+} // namespace limonp
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BlockingQueue.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BlockingQueue.hpp
new file mode 100644
index 0000000..a441ffb
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BlockingQueue.hpp
@@ -0,0 +1,49 @@
+#ifndef LIMONP_BLOCKINGQUEUE_HPP
+#define LIMONP_BLOCKINGQUEUE_HPP
+
+#include <queue>
+#include "Condition.hpp"
+
+namespace limonp {
+// Unbounded FIFO guarded by `mutex_`; Pop() waits on `notEmpty_` while empty.
+template<class T>
+class BlockingQueue: NonCopyable {
+ public:
+  BlockingQueue() : mutex_(), notEmpty_(mutex_), queue_() {}
+
+  // Enqueues a copy of `x` and signals `notEmpty_`.
+  void Push(const T& x) {
+    MutexLockGuard lock(mutex_);
+    queue_.push(x);
+    notEmpty_.Notify();  // signalled while the lock is still held
+  }
+
+  // Waits until an element is available, then removes and returns the oldest.
+  T Pop() {
+    MutexLockGuard lock(mutex_);
+    for (;;) {  // re-test after every wake-up: wake-ups may be spurious
+      if (!queue_.empty()) break;
+      notEmpty_.Wait();
+    }
+    assert(!queue_.empty());
+    T oldest(queue_.front());
+    queue_.pop();
+    return oldest;
+  }
+
+  // Element count at the moment of the call.
+  size_t Size() const {
+    MutexLockGuard lock(mutex_);
+    return queue_.size();
+  }
+  bool Empty() const { return 0 == Size(); }
+
+ private:
+  mutable MutexLock mutex_;  // declared before notEmpty_, which is built from it
+  Condition notEmpty_;
+  std::queue<T> queue_;
+}; // class BlockingQueue
+
+} // namespace limonp
+
+#endif // LIMONP_BLOCKINGQUEUE_HPP
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedBlockingQueue.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedBlockingQueue.hpp
new file mode 100644
index 0000000..598d099
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedBlockingQueue.hpp
@@ -0,0 +1,67 @@
+#ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
+#define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
+
+#include "BoundedQueue.hpp"
+
+namespace limonp {
+
+// Fixed-capacity FIFO: Push() waits while the queue is full, Pop() waits while it is empty.
+// NOTE(review): only BoundedQueue.hpp is included here; MutexLock, Condition and NonCopyable must come from the includer.
+template<typename T>
+class BoundedBlockingQueue : NonCopyable {
+ public:
+  explicit BoundedBlockingQueue(size_t maxSize)
+    : mutex_(), notEmpty_(mutex_), notFull_(mutex_), queue_(maxSize) {
+  }
+
+  void Push(const T& x) {
+    MutexLockGuard lock(mutex_);
+    while (queue_.Full()) {
+      notFull_.Wait();
+    }
+    assert(!queue_.Full());
+    queue_.Push(x);
+    notEmpty_.Notify();
+  }
+
+  T Pop() {
+    MutexLockGuard lock(mutex_);
+    while (queue_.Empty()) {
+      notEmpty_.Wait();
+    }
+    assert(!queue_.Empty());
+    T res = queue_.Pop();
+    notFull_.Notify();
+    return res;
+  }
+
+  bool Empty() const {
+    MutexLockGuard lock(mutex_);
+    return queue_.Empty();
+  }
+
+  bool Full() const {
+    MutexLockGuard lock(mutex_);
+    return queue_.Full();
+  }
+
+  // BoundedQueue spells these Size()/Capacity(); the lower-case calls used here
+  // before named no member and failed as soon as the functions were instantiated.
+  size_t size() const {
+    MutexLockGuard lock(mutex_);
+    return queue_.Size();
+  }
+  size_t capacity() const {
+    return queue_.Capacity();  // read without the lock, as before
+  }
+
+ private:
+  mutable MutexLock mutex_;
+  Condition notEmpty_;
+  Condition notFull_;
+  BoundedQueue<T> queue_;
+}; // class BoundedBlockingQueue
+
+} // namespace limonp
+
+#endif // LIMONP_BOUNDED_BLOCKING_QUEUE_HPP
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedQueue.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedQueue.hpp
new file mode 100644
index 0000000..f52a107
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedQueue.hpp
@@ -0,0 +1,65 @@
+#ifndef LIMONP_BOUNDED_QUEUE_HPP
+#define LIMONP_BOUNDED_QUEUE_HPP
+
+#include <vector>
+#include <fstream>
+#include <cassert>
+
+namespace limonp {
+using namespace std;
+// Fixed-capacity ring buffer (FIFO). No locking is done here; Push on a full
+// queue and Pop on an empty one are guarded by assert only.
+template<class T>
+class BoundedQueue {
+ public:
+  // `capacity` is the number of slots and must be non-zero.
+  explicit BoundedQueue(size_t capacity)
+    : head_(0), tail_(0), size_(0), capacity_(capacity), circular_buffer_(capacity) {
+    assert(capacity_);
+  }
+  ~BoundedQueue() {}
+
+  // Forgets every element; the slots themselves are left as they are.
+  void Clear() {
+    head_ = tail_ = size_ = 0;
+  }
+  bool Empty() const {
+    return size_ == 0;
+  }
+  bool Full() const {
+    return size_ == capacity_;
+  }
+  size_t Size() const {
+    return size_;
+  }
+  size_t Capacity() const {
+    return capacity_;
+  }
+
+  // Stores a copy of `t` in the slot at the tail.
+  void Push(const T& t) {
+    assert(!Full());
+    circular_buffer_[tail_] = t;
+    ++size_;
+    tail_ = (tail_ + 1) % capacity_;
+  }
+
+  // Removes the oldest element and returns a copy of it.
+  T Pop() {
+    assert(!Empty());
+    const size_t slot = head_;
+    --size_;
+    head_ = (slot + 1) % capacity_;
+    return circular_buffer_[slot];
+  }
+
+ private:
+  size_t head_;  // slot of the oldest element
+  size_t tail_;  // slot the next Push writes to
+  size_t size_;  // number of stored elements
+  const size_t capacity_;
+  vector<T> circular_buffer_;
+}; // class BoundedQueue
+} // namespace limonp
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Closure.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Closure.hpp
new file mode 100644
index 0000000..c9d9dd4
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Closure.hpp
@@ -0,0 +1,206 @@
+#ifndef LIMONP_CLOSURE_HPP
+#define LIMONP_CLOSURE_HPP
+
+namespace limonp {
+
+// Type-erased nullary callback: Run() invokes whatever the concrete closure bound.
+class ClosureInterface {
+ public:
+  virtual ~ClosureInterface() {}
+  virtual void Run() = 0;
+};
+
+// ClosureN binds a free-function pointer to N arguments captured at construction;
+// Run() calls the function with them. Members are set in the initializer list, so
+// Funct/ArgN need not be default-constructible or assignable and an ArgN may be
+// a reference or const type (assignment in the constructor body ruled those out).
+template <class Funct>
+class Closure0: public ClosureInterface {
+ public:
+  Closure0(Funct fun)
+    : fun_(fun) {
+  }
+  virtual ~Closure0() {
+  }
+  virtual void Run() {
+    (*fun_)();
+  }
+ private:
+  Funct fun_;
+};
+
+// Free function with one bound argument.
+template <class Funct, class Arg1>
+class Closure1: public ClosureInterface {
+ public:
+  Closure1(Funct fun, Arg1 arg1)
+    : fun_(fun), arg1_(arg1) {
+  }
+  virtual ~Closure1() {
+  }
+  virtual void Run() {
+    (*fun_)(arg1_);
+  }
+ private:
+  Funct fun_;
+  Arg1 arg1_;
+};
+
+// Free function with two bound arguments.
+template <class Funct, class Arg1, class Arg2>
+class Closure2: public ClosureInterface {
+ public:
+  Closure2(Funct fun, Arg1 arg1, Arg2 arg2)
+    : fun_(fun), arg1_(arg1), arg2_(arg2) {
+  }
+  virtual ~Closure2() {
+  }
+  virtual void Run() {
+    (*fun_)(arg1_, arg2_);
+  }
+ private:
+  Funct fun_;
+  Arg1 arg1_;
+  Arg2 arg2_;
+};
+
+// Free function with three bound arguments.
+template <class Funct, class Arg1, class Arg2, class Arg3>
+class Closure3: public ClosureInterface {
+ public:
+  Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3)
+    : fun_(fun), arg1_(arg1), arg2_(arg2), arg3_(arg3) {
+  }
+  virtual ~Closure3() {
+  }
+  virtual void Run() {
+    (*fun_)(arg1_, arg2_, arg3_);
+  }
+ private:
+  Funct fun_;
+  Arg1 arg1_;
+  Arg2 arg2_;
+  Arg3 arg3_;
+};
+
+// ObjClosureN binds a member function of Obj to an object pointer and N arguments.
+// The object is not owned and is never deleted here: `p` must stay valid for as
+// long as Run() may be called.
+template <class Obj, class Funct>
+class ObjClosure0: public ClosureInterface {
+ public:
+  ObjClosure0(Obj* p, Funct fun)
+    : p_(p), fun_(fun) {
+  }
+  virtual ~ObjClosure0() {
+  }
+  virtual void Run() {
+    (p_->*fun_)();
+  }
+ private:
+  Obj* p_;
+  Funct fun_;
+};
+
+// Member function with one bound argument.
+template <class Obj, class Funct, class Arg1>
+class ObjClosure1: public ClosureInterface {
+ public:
+  ObjClosure1(Obj* p, Funct fun, Arg1 arg1)
+    : p_(p), fun_(fun), arg1_(arg1) {
+  }
+  virtual ~ObjClosure1() {
+  }
+  virtual void Run() {
+    (p_->*fun_)(arg1_);
+  }
+ private:
+  Obj* p_;
+  Funct fun_;
+  Arg1 arg1_;
+};
+
+// Member function with two bound arguments.
+template <class Obj, class Funct, class Arg1, class Arg2>
+class ObjClosure2: public ClosureInterface {
+ public:
+  ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2)
+    : p_(p), fun_(fun), arg1_(arg1), arg2_(arg2) {
+  }
+  virtual ~ObjClosure2() {
+  }
+  virtual void Run() {
+    (p_->*fun_)(arg1_, arg2_);
+  }
+ private:
+  Obj* p_;
+  Funct fun_;
+  Arg1 arg1_;
+  Arg2 arg2_;
+};
+
+// Member function with three bound arguments.
+template <class Obj, class Funct, class Arg1, class Arg2, class Arg3>
+class ObjClosure3: public ClosureInterface {
+ public:
+  ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3)
+    : p_(p), fun_(fun), arg1_(arg1), arg2_(arg2), arg3_(arg3) {
+  }
+  virtual ~ObjClosure3() {
+  }
+  virtual void Run() {
+    (p_->*fun_)(arg1_, arg2_, arg3_);
+  }
+ private:
+  Obj* p_;
+  Funct fun_;
+  Arg1 arg1_;
+  Arg2 arg2_;
+  Arg3 arg3_;
+};
+
+// NewClosure overloads: deduce the closure type from the callable and return it
+// allocated with `new`; nothing here deletes it, so the caller must.
+template<class R>
+ClosureInterface* NewClosure(R (*fun)()) {
+  return new Closure0<R (*)()>(fun);
+}
+
+template<class R, class Arg1>
+ClosureInterface* NewClosure(R (*fun)(Arg1), Arg1 arg1) {
+  return new Closure1<R (*)(Arg1), Arg1>(fun, arg1);
+}
+
+template<class R, class Arg1, class Arg2>
+ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
+  return new Closure2<R (*)(Arg1, Arg2), Arg1, Arg2>(fun, arg1, arg2);
+}
+
+template<class R, class Arg1, class Arg2, class Arg3>
+ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
+  return new Closure3<R (*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(fun, arg1, arg2, arg3);
+}
+
+template<class R, class Obj>
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)()) {
+  return new ObjClosure0<Obj, R (Obj::* )()>(obj, fun);
+}
+
+template<class R, class Obj, class Arg1>
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1), Arg1 arg1) {
+  return new ObjClosure1<Obj, R (Obj::* )(Arg1), Arg1>(obj, fun, arg1);
+}
+
+template<class R, class Obj, class Arg1, class Arg2>
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) {
+  return new ObjClosure2<Obj, R (Obj::*)(Arg1, Arg2), Arg1, Arg2>(obj, fun, arg1, arg2);
+}
+
+template<class R, class Obj, class Arg1, class Arg2, class Arg3>
+ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) {
+  return new ObjClosure3<Obj, R (Obj::*)(Arg1, Arg2, Arg3), Arg1, Arg2, Arg3>(obj, fun, arg1, arg2, arg3);
+}
+
+} // namespace limonp
+
+#endif // LIMONP_CLOSURE_HPP
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Colors.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Colors.hpp
new file mode 100644
index 0000000..04edd7e
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Colors.hpp
@@ -0,0 +1,31 @@
+#ifndef LIMONP_COLOR_PRINT_HPP
+#define LIMONP_COLOR_PRINT_HPP
+
+#include <string>
+#include <stdarg.h>
+
+namespace limonp {
+
+using std::string;
+
+enum Color { // each value is substituted into the "\033[0;%dm" escape printed by ColorPrintln
+ BLACK = 30,
+ RED, // 31
+ GREEN, // 32
+ YELLOW, // 33
+ BLUE, // 34
+ PURPLE // 35
+}; // enum Color
+
+static void ColorPrintln(enum Color color, const char * fmt, ...) { // printf-style message wrapped in colour and reset escapes
+  va_list ap;
+  va_start(ap, fmt);
+  printf("\033[0;%dm", color);
+  vprintf(fmt, ap);
+  va_end(ap);
+  printf("\033[0m\n"); // ends with \n: per the original note, without it later lines can keep the colour
+}
+
+} // namespace limonp
+
+#endif // LIMONP_COLOR_PRINT_HPP
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Condition.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Condition.hpp
new file mode 100644
index 0000000..656a61d
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Condition.hpp
@@ -0,0 +1,38 @@
+#ifndef LIMONP_CONDITION_HPP
+#define LIMONP_CONDITION_HPP
+
+#include "MutexLock.hpp"
+
+namespace limonp {
+
+class Condition : NonCopyable { // pthread condition variable bound to one MutexLock
+ public:
+ explicit Condition(MutexLock& mutex) // only a reference is kept, so `mutex` must outlive this object
+ : mutex_(mutex) {
+ XCHECK(!pthread_cond_init(&pcond_, NULL)); // NULL attribute pointer; XCHECK logs at FATAL level when the call returns non-zero
+ }
+
+ ~Condition() {
+ XCHECK(!pthread_cond_destroy(&pcond_));
+ }
+
+ void Wait() { // waits on pcond_ using the pthread mutex obtained from mutex_
+ XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex()));
+ }
+
+ void Notify() { // pthread_cond_signal
+ XCHECK(!pthread_cond_signal(&pcond_));
+ }
+
+ void NotifyAll() { // pthread_cond_broadcast
+ XCHECK(!pthread_cond_broadcast(&pcond_));
+ }
+
+ private:
+ MutexLock& mutex_; // not owned
+ pthread_cond_t pcond_;
+}; // class Condition
+
+} // namespace limonp
+
+#endif // LIMONP_CONDITION_HPP
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Config.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Config.hpp
new file mode 100644
index 0000000..c98f222
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Config.hpp
@@ -0,0 +1,103 @@
+/************************************
+ * file enc : utf8
+ * author : wuyanyi09@gmail.com
+ ************************************/
+#ifndef LIMONP_CONFIG_H
+#define LIMONP_CONFIG_H
+
+#include <map>
+#include <fstream>
+#include <iostream>
+#include <assert.h>
+#include "StringUtil.hpp"
+
+namespace limonp {
+
+using namespace std;
+
+// Key/value settings read from a text file of "key=value" lines.
+class Config {
+ public:
+  explicit Config(const string& filePath) {
+    LoadFile(filePath);
+  }
+  // True when at least one entry was loaded.
+  operator bool () {
+    return !map_.empty();
+  }
+  // Value stored under `key`, or `defaultvalue` when the key is absent.
+  string Get(const string& key, const string& defaultvalue) const {
+    const map<string, string>::const_iterator found = map_.find(key);
+    return map_.end() == found ? defaultvalue : found->second;
+  }
+  // atoi() of the stored value; `defaultvalue` when the key is absent or its value is empty.
+  int Get(const string& key, int defaultvalue) const {
+    const map<string, string>::const_iterator found = map_.find(key);
+    if (map_.end() == found || found->second.empty()) {
+      return defaultvalue;
+    }
+    return atoi(found->second.c_str());
+  }
+  // Stored value as a C string owned by this object; NULL for a NULL or unknown key.
+  const char* operator [] (const char* key) const {
+    if (NULL != key) {
+      const map<string, string>::const_iterator found = map_.find(key);
+      if (map_.end() != found) {
+        return found->second.c_str();
+      }
+    }
+    return NULL;
+  }
+
+  string GetConfigInfo() const {
+    string res;
+    res << *this;
+    return res;
+  }
+
+ private:
+  // Loads `filePath` line by line. Lines that are empty after Trim or pass StartsWith(line, "#")
+  // are skipped; any other line must Split on "=" into exactly two parts. Bad lines and repeated
+  // keys are reported on stderr and hit assert(false); with asserts compiled out they are skipped.
+  void LoadFile(const string& filePath) {
+    ifstream ifs(filePath.c_str());
+    assert(ifs);
+    string line;
+    vector<string> vecBuf;
+    size_t lineno = 0;
+    for (; getline(ifs, line); ) {
+      lineno ++;
+      Trim(line);
+      if(line.empty() || StartsWith(line, "#")) {
+        continue;
+      }
+      vecBuf.clear();
+      Split(line, vecBuf, "=");
+      if(vecBuf.size() != 2) {
+        fprintf(stderr, "line[%s] illegal.\n", line.c_str());
+        assert(false);
+        continue;
+      }
+      Trim(vecBuf[0]);
+      Trim(vecBuf[1]);
+      if(!map_.insert(make_pair(vecBuf[0], vecBuf[1])).second) {
+        fprintf(stderr, "key[%s] already exits.\n", vecBuf[0].c_str());
+        assert(false);
+        continue;
+      }
+    }
+    ifs.close();
+  }
+
+  friend ostream& operator << (ostream& os, const Config& config);
+
+  map<string, string> map_;
+}; // class Config
+
+inline ostream& operator << (ostream& os, const Config& config) { // dumps every loaded entry
+ return os << config.map_; // relies on an operator<< for std::map that is declared outside this header
+}
+
+} // namespace limonp
+
+#endif // LIMONP_CONFIG_H
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/FileLock.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/FileLock.hpp
new file mode 100644
index 0000000..56a478a
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/FileLock.hpp
@@ -0,0 +1,74 @@
+#ifndef LIMONP_FILELOCK_HPP
+#define LIMONP_FILELOCK_HPP
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string>
+#include <string.h>
+#include <assert.h>
+
+namespace limonp {
+
+using std::string;
+
+// Advisory whole-file lock built on open() and fcntl(F_SETLK). Failures are not
+// thrown: they make Ok() false and leave the strerror() text in Error().
+// NOTE(review): the class is copyable, and a copy would close the same fd again.
+class FileLock {
+ public:
+  FileLock() : fd_(-1), ok_(true) {}
+  ~FileLock() { Close(); }
+  // Opens the file that carries the lock, creating it with mode 0644 if needed.
+  void Open(const string& fname) {
+    assert(fd_ == -1);
+    fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644);
+    if(fd_ < 0) {
+      ok_ = false;
+      err_ = strerror(errno);
+    }
+  }
+  // Closes the descriptor if one is open and resets fd_, so a repeated Close(), a
+  // later Open() and the destructor are all safe. 0 is a valid descriptor too.
+  void Close() {
+    if (fd_ >= 0) {
+      ::close(fd_);
+      fd_ = -1;
+    }
+  }
+  // Tries to take the write lock on the whole file; F_SETLK does not wait.
+  void Lock() {
+    if(LockOrUnlock(fd_, true) < 0) {
+      ok_ = false;
+      err_ = strerror(errno);
+    }
+  }
+  void UnLock() {
+    if(LockOrUnlock(fd_, false) < 0) {
+      ok_ = false;
+      err_ = strerror(errno);
+    }
+  }
+  bool Ok() const { return ok_; }
+  string Error() const { return err_; }
+ private:
+  static int LockOrUnlock(int fd, bool lock) {
+    errno = 0;
+    struct flock f;
+    memset(&f, 0, sizeof(f));
+    f.l_type = (lock ? F_WRLCK : F_UNLCK);
+    f.l_whence = SEEK_SET;
+    f.l_start = 0;
+    f.l_len = 0; // Lock/unlock entire file
+    return fcntl(fd, F_SETLK, &f);
+  }
+  int fd_;   // -1 while no file is open
+  bool ok_;  // false once any operation has failed
+  string err_;
+}; // class FileLock
+
+}// namespace limonp
+
+#endif // LIMONP_FILELOCK_HPP
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ForcePublic.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ForcePublic.hpp
new file mode 100644
index 0000000..2076682
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ForcePublic.hpp
@@ -0,0 +1,7 @@
+#ifndef LIMONP_FORCE_PUBLIC_H
+#define LIMONP_FORCE_PUBLIC_H
+
+#define private public // NOTE(review): presumably a test-only hack to reach private members of code included afterwards -- confirm it is never used in production builds
+#define protected public // redefining keywords like this is not allowed in translation units that include standard library headers
+
+#endif // LIMONP_FORCE_PUBLIC_H
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/LocalVector.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/LocalVector.hpp
new file mode 100644
index 0000000..11339cc
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/LocalVector.hpp
@@ -0,0 +1,139 @@
+#ifndef LIMONP_LOCAL_VECTOR_HPP
+#define LIMONP_LOCAL_VECTOR_HPP
+
+#include <iostream>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+
+namespace limonp {
+using namespace std;
+/*
+ * LocalVector<T> : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector<T> may be dangerous..
+ * LocalVector<T> is simple and not well-tested.
+ */
+const size_t LOCAL_VECTOR_BUFFER_SIZE = 16;
+// Small-buffer vector for memcpy-safe T: up to LOCAL_VECTOR_BUFFER_SIZE elements live
+// inside the object, larger contents go to a malloc'd block. Destructors of T are never run.
+template <class T>
+class LocalVector {
+ public:
+  typedef const T* const_iterator ;
+  typedef T value_type;
+  typedef size_t size_type;
+ private:
+  T buffer_[LOCAL_VECTOR_BUFFER_SIZE];  // inline storage
+  T * ptr_;                             // buffer_ or a malloc'd block
+  size_t size_;
+  size_t capacity_;
+ public:
+  LocalVector() { init_(); }
+  LocalVector(const LocalVector<T>& vec) {
+    init_();
+    *this = vec;
+  }
+  LocalVector(const_iterator begin, const_iterator end) { // TODO: make it faster
+    init_();
+    while(begin != end) {
+      push_back(*begin++);
+    }
+  }
+  LocalVector(size_t size, const T& t) { // TODO: make it faster
+    init_();
+    while(size--) {
+      push_back(t);
+    }
+  }
+  ~LocalVector() {
+    if(ptr_ != buffer_) {
+      free(ptr_);
+    }
+  }
+  // Deep copy. Self-assignment returns at once: clear() below would otherwise
+  // discard the very contents that are about to be copied.
+  LocalVector<T>& operator = (const LocalVector<T>& vec) {
+    if (this == &vec) return *this;
+    clear();
+    size_ = vec.size();
+    capacity_ = vec.capacity();
+    if(vec.buffer_ == vec.ptr_) {
+      memcpy(static_cast<void*>(buffer_), vec.buffer_, sizeof(T) * size_);
+      ptr_ = buffer_;
+    } else {
+      ptr_ = (T*) malloc(vec.capacity() * sizeof(T));
+      assert(ptr_);
+      memcpy(static_cast<void*>(ptr_), vec.ptr_, vec.size() * sizeof(T));
+    }
+    return *this;
+  }
+ private:
+  void init_() {
+    ptr_ = buffer_;
+    size_ = 0;
+    capacity_ = LOCAL_VECTOR_BUFFER_SIZE;
+  }
+ public:
+  // Unchecked element access.
+  T& operator [] (size_t i) {
+    return ptr_[i];
+  }
+  const T& operator [] (size_t i) const {
+    return ptr_[i];
+  }
+  // Appends a copy of `t`, doubling the capacity when full. `t` is copied before growing
+  // because it may refer to an element of this vector, whose old block reserve() frees.
+  void push_back(const T& t) {
+    if(size_ == capacity_) {
+      assert(capacity_);
+      const T held = t;
+      reserve(capacity_ * 2);
+      ptr_[size_ ++ ] = held;
+      return;
+    }
+    ptr_[size_ ++ ] = t;
+  }
+  // Grows the capacity to at least `size`; never shrinks.
+  void reserve(size_t size) {
+    if(size <= capacity_) {
+      return;
+    }
+    T * next = (T*)malloc(sizeof(T) * size);
+    assert(next);
+    T * old = ptr_;
+    ptr_ = next;
+    memcpy(static_cast<void*>(ptr_), old, sizeof(T) * capacity_);
+    capacity_ = size;
+    if(old != buffer_) {
+      free(old);
+    }
+  }
+  bool empty() const { return 0 == size(); }
+  size_t size() const { return size_; }
+  size_t capacity() const { return capacity_; }
+  const_iterator begin() const { return ptr_; }
+  const_iterator end() const { return ptr_ + size_; }
+  // Releases any heap block and returns to the empty, inline-buffer state.
+  void clear() {
+    if(ptr_ != buffer_) {
+      free(ptr_);
+    }
+    init_();
+  }
+};
+
+// Streams `vec` as ["a", "b", ...]; an empty vector prints as [].
+template <class T>
+ostream & operator << (ostream& os, const LocalVector<T>& vec) {
+  if(vec.empty()) {
+    return os << "[]";
+  }
+  os << "[\"" << vec[0];
+  for (typename LocalVector<T>::const_iterator it = vec.begin() + 1; it != vec.end(); ++it) {
+    os << "\", \"" << *it;
+  }
+  return os << "\"]";
+}
+
+}
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Logging.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Logging.hpp
new file mode 100644
index 0000000..3fe3ada
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Logging.hpp
@@ -0,0 +1,90 @@
+#ifndef LIMONP_LOGGING_HPP
+#define LIMONP_LOGGING_HPP
+
+#include <sstream>
+#include <iostream>
+#include <cassert>
+#include <cstdlib>
+#include <ctime>
+
+#ifdef XLOG
+#error "XLOG has been defined already"
+#endif // XLOG
+#ifdef XCHECK
+#error "XCHECK has been defined already"
+#endif // XCHECK
+
+#define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream()
+#define XCHECK(exp) if(!(exp)) XLOG(FATAL) << "exp: ["#exp << "] false. "
+
+namespace limonp {
+
+enum { // severity levels; the values are also used as indices into LOG_LEVEL_ARRAY
+ LL_DEBUG = 0,
+ LL_INFO = 1,
+ LL_WARNING = 2,
+ LL_ERROR = 3,
+ LL_FATAL = 4, // Logger's destructor calls abort() for this level
+}; // enum
+
+static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"}; // indexed by the LL_* values
+static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"; // strftime pattern for the record timestamp
+
+// One log record: the constructor writes the "time file:line LEVEL " prefix, callers append
+// through Stream(), and the destructor prints to std::cerr, then calls abort() for LL_FATAL.
+class Logger {
+ public:
+  Logger(size_t level, const char* filename, int lineno)
+    : level_(level) {
+#ifdef LOGGING_LEVEL
+    if (level_ < LOGGING_LEVEL) {
+      return;
+    }
+#endif
+    // Strictly less: level_ indexes LOG_LEVEL_ARRAY below (was `<=`, off by one).
+    assert(level_ < sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY));
+    char buf[32];
+    time_t timeNow;
+    time(&timeNow);
+    struct tm tmNow;
+  #if defined(_WIN32) || defined(_WIN64)
+    errno_t e = localtime_s(&tmNow, &timeNow);
+    assert(e == 0);  // was `e = 0`: an assignment, so the check always failed
+    (void)e;         // keeps NDEBUG builds free of unused-variable warnings
+  #else
+    struct tm * tm_tmp = localtime_r(&timeNow, &tmNow);
+    assert(tm_tmp != nullptr);
+    (void)tm_tmp;
+  #endif
+    strftime(buf, sizeof(buf), LOG_TIME_FORMAT, &tmNow);
+    stream_ << buf
+      << " " << filename
+      << ":" << lineno
+      << " " << LOG_LEVEL_ARRAY[level_]
+      << " ";
+  }
+  ~Logger() {
+#ifdef LOGGING_LEVEL
+    if (level_ < LOGGING_LEVEL) {
+      return;
+    }
+#endif
+    std::cerr << stream_.str() << std::endl;
+    if (level_ == LL_FATAL) {
+      abort();
+    }
+  }
+
+  // Stream that collects the message body of this record.
+  std::ostream& Stream() {
+    return stream_;
+  }
+
+ private:
+  std::ostringstream stream_;
+  size_t level_;
+}; // class Logger
+
+} // namespace limonp
+
+#endif // LIMONP_LOGGING_HPP
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Md5.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Md5.hpp
new file mode 100644
index 0000000..d30f3b5
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Md5.hpp
@@ -0,0 +1,411 @@
+#ifndef __MD5_H__
+#define __MD5_H__
+
+// Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+// rights reserved.
+
+// License to copy and use this software is granted provided that it
+// is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+// Algorithm" in all material mentioning or referencing this software
+// or this function.
+//
+// License is also granted to make and use derivative works provided
+// that such works are identified as "derived from the RSA Data
+// Security, Inc. MD5 Message-Digest Algorithm" in all material
+// mentioning or referencing the derived work.
+//
+// RSA Data Security, Inc. makes no representations concerning either
+// the merchantability of this software or the suitability of this
+// software for any particular purpose. It is provided "as is"
+// without express or implied warranty of any kind.
+//
+// These notices must be retained in any copies of any part of this
+// documentation and/or software.
+
+
+
+// The original md5 implementation avoids external libraries.
+// This version has dependency on stdio.h for file input and
+// string.h for memcpy.
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+
+namespace limonp {
+
+//#pragma region MD5 defines
+// Constants for MD5Transform routine.
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+
+
+// F, G, H and I are the basic MD5 bitwise functions; every argument use is parenthesized so any expression can be passed.
+#define F(x, y, z) (((x) & (y)) | ((~(x)) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~(z))))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~(z))))
+
+// ROTATE_LEFT rotates x left n bits.
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+// Rotation is separate from addition to prevent recomputation.
+#define FF(a, b, c, d, x, s, ac) { \
+ (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define GG(a, b, c, d, x, s, ac) { \
+ (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define HH(a, b, c, d, x, s, ac) { \
+ (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define II(a, b, c, d, x, s, ac) { \
+ (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+//#pragma endregion
+
+
+typedef unsigned char BYTE ; // one raw byte (used for the digest and for in-memory input)
+
+// POINTER defines a generic pointer type
+typedef unsigned char *POINTER;
+
+// UINT2 defines a two byte word
+typedef unsigned short int UINT2;
+
+// UINT4 defines a four byte word (NOTE(review): assumes unsigned int is 32 bits wide on the target)
+typedef unsigned int UINT4;
+
+static unsigned char PADDING[64] = { // 0x80 followed by zeros; MD5::Final() passes a prefix of it to Update() as padding
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+// convenient object that wraps
+// the C-functions for use in C++ only
+class MD5 {
+ private:
+ struct __context_t {
+ UINT4 state[4]; /* state (ABCD) */
+ UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */
+ unsigned char buffer[64]; /* input buffer */
+ } context ;
+
+ //#pragma region static helper functions
+ // The core of the MD5 algorithm: mixes one 64-byte block into the four-word state.
+ // The block is decoded into sixteen 32-bit words, run through four rounds of sixteen steps, and the result is added to state.
+ static void MD5Transform( UINT4 state[4], unsigned char block[64] ) {
+ UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode (x, block, 64); // x[] = the block as sixteen words
+
+ /* Round 1 */
+ FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+ FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+ FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+ FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+ FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+ FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+ FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+ FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+ FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+ FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+ FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+ FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+ FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+ FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+ FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+ FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+
+ /* Round 2 */
+ GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+ GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+ GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+ GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+ GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+ GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */
+ GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+ GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+ GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+ GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+ GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+ GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+ GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+ GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+ GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+ GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+
+ /* Round 3 */
+ HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+ HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+ HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+ HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+ HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+ HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+ HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+ HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+ HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+ HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+ HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+ HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */
+ HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+ HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+ HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+ HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+
+ /* Round 4 */
+ II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+ II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+ II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+ II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+ II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+ II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+ II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+ II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+ II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+ II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+ II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+ II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+ II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+ II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+ II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+ II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ // Zeroize sensitive information. NOTE(review): a plain memset of a local that is about to die may be optimized away.
+ memset((POINTER)x, 0, sizeof (x));
+ }
+
+  // Serializes 32-bit words from `input` into `output` as little-endian bytes
+  // (least-significant byte first). `len` is the byte count; assumed to be a multiple of 4.
+  static void Encode( unsigned char *output, UINT4 *input, unsigned int len ) {
+    unsigned char *dst = output;
+    const UINT4 *src = input;
+    for (unsigned int written = 0; written < len; written += 4, ++src) {
+      const UINT4 word = *src;
+      for (unsigned int shift = 0; shift < 32; shift += 8) {
+        *dst++ = (unsigned char)((word >> shift) & 0xff);
+      }
+    }
+  }
+
+  // Packs little-endian bytes from `input` into 32-bit words in `output`.
+  // `len` is the byte count and is assumed to be a multiple of 4.
+  static void Decode( UINT4 *output, unsigned char *input, unsigned int len ) {
+    for (unsigned int word = 0, byte = 0; byte < len; ++word, byte += 4) {
+      const unsigned char *src = input + byte;
+      output[word] = (UINT4)src[0] | ((UINT4)src[1] << 8)
+                   | ((UINT4)src[2] << 16) | ((UINT4)src[3] << 24);
+    }
+  }
+ //#pragma endregion
+
+
+ public:
+ // MAIN FUNCTIONS
+ MD5() { // a new object starts with an initialized context; digestRaw/digestChars are only filled by Final()
+ Init() ;
+ }
+
+  // Starts a new MD5 computation: loads the four initial chaining values
+  // into context.state and zeroes the processed-bit counter.
+  void Init() {
+    context.state[0] = 0x67452301;
+    context.state[1] = 0xefcdab89;
+    context.state[2] = 0x98badcfe;
+    context.state[3] = 0x10325476;
+    context.count[0] = 0;
+    context.count[1] = 0;
+  }
+
+ // MD5 block update operation. Continues an MD5 message-digest
+ // operation: full 64-byte blocks are transformed immediately, and any
+ // remaining bytes are kept in context.buffer for the next call.
+ void Update(
+ unsigned char *input, // input block
+ unsigned int inputLen ) { // length of input block
+ unsigned int i, index, partLen;
+
+ // Compute number of bytes mod 64 (how many bytes are already waiting in context.buffer)
+ index = (unsigned int)((context.count[0] >> 3) & 0x3F);
+
+ // Update number of bits; count[1] receives the carry out of count[0]
+ if ((context.count[0] += ((UINT4)inputLen << 3))
+ < ((UINT4)inputLen << 3))
+ context.count[1]++;
+ context.count[1] += ((UINT4)inputLen >> 29);
+
+ partLen = 64 - index; // bytes needed to complete the buffered block
+
+ // Transform as many times as possible.
+ if (inputLen >= partLen) {
+ memcpy((POINTER)&context.buffer[index], (POINTER)input, partLen);
+ MD5Transform (context.state, context.buffer);
+
+ for (i = partLen; i + 63 < inputLen; i += 64)
+ MD5Transform (context.state, &input[i]);
+
+ index = 0;
+ } else
+ i = 0;
+
+ /* Buffer remaining input */
+ memcpy((POINTER)&context.buffer[index], (POINTER)&input[i], inputLen-i);
+ }
+
+ // MD5 finalization. Ends an MD5 message-digest operation, writing
+ // the message digest and zeroizing the context.
+ // Writes to digestRaw, then fills digestChars through writeToString().
+ void Final() {
+ unsigned char bits[8];
+ unsigned int index, padLen;
+
+ // Save number of bits (taken before the padding below changes the count)
+ Encode( bits, context.count, 8 );
+
+ // Pad out to 56 mod 64.
+ index = (unsigned int)((context.count[0] >> 3) & 0x3f);
+ padLen = (index < 56) ? (56 - index) : (120 - index);
+ Update( PADDING, padLen );
+
+ // Append length (before padding)
+ Update( bits, 8 );
+
+ // Store state in digest
+ Encode( digestRaw, context.state, 16);
+
+ // Zeroize sensitive information. Init() must be called before the object is used for another digest.
+ memset((POINTER)&context, 0, sizeof (context));
+
+ writeToString() ;
+ }
+
+  /// Renders digestRaw as 32 lowercase hex characters plus a terminating NUL in
+  /// digestChars. snprintf bounds each write to the 3 bytes one "%02x" needs.
+  void writeToString() {
+    for( int pos = 0 ; pos < 16 ; pos++ ) {
+      snprintf( digestChars+(pos*2), 3, "%02x", digestRaw[pos] ) ;
+    }
+  }
+
+
+ public:
+ // an MD5 digest is a 16-byte number (32 hex digits)
+ BYTE digestRaw[ 16 ] ;
+
+ // This version of the digest is actually
+ // a "printf'd" version of the digest.
+ char digestChars[ 33 ] ;
+
+  /// Computes the MD5 of a file's contents.
+  /// Returns digestChars (32 hex characters, NUL-terminated), or NULL when the
+  /// name is NULL/empty, the file cannot be opened, or a read error occurs.
+  const char* digestFile( const char *filename ) {
+    if (NULL == filename || strcmp(filename, "") == 0)
+      return NULL;
+    Init() ;
+    FILE *file = fopen (filename, "rb");
+    if (file == NULL) {
+      return NULL;
+    }
+
+    unsigned char buffer[1024] ;
+    size_t len;  // fread returns size_t; 0 means end of file or error
+    while( (len = fread( buffer, 1, sizeof(buffer), file )) > 0 )
+      Update( buffer, (unsigned int)len ) ;
+    // A read that stopped because of an I/O error must not yield a digest of partial data.
+    const bool readFailed = ferror( file ) != 0;
+    fclose( file );
+    if (readFailed)
+      return NULL;
+    Final();
+    return digestChars ;
+  }
+
+  /// Digests `len` bytes starting at `memchunk`.
+  /// Returns digestChars, or NULL when `memchunk` is NULL or `len` is negative
+  /// (a negative length would otherwise wrap to a huge unsigned byte count).
+  const char* digestMemory( BYTE *memchunk, int len ) {
+    if (NULL == memchunk || len < 0)
+      return NULL;
+    Init() ;
+    Update( memchunk, (unsigned int)len ) ;
+    Final() ;
+    return digestChars ;
+  }
+
+  // Digests a NUL-terminated string; returns digestChars (hex text), or NULL when `string` is NULL. Nothing is printed.
+  const char* digestString(const char *string ) {
+    if (string != NULL) {
+      unsigned char* bytes = (unsigned char*)string;
+      Init() ;
+      Update( bytes, strlen(string) ) ;
+      Final() ;
+      return digestChars ;
+    }
+    return NULL;
+  }
+};
+
+// Writes the hex MD5 digest of the NUL-terminated string `str` into `res`.
+// Returns true on success; when `str` is NULL or MD5::digestString returns
+// NULL, `res` is set to "" and false is returned.
+inline bool md5String(const char* str, std::string& res) {
+  if (str != NULL) {
+    MD5 hasher;
+    const char* const hexDigest = hasher.digestString(str);
+    if (hexDigest != NULL) {
+      res = hexDigest;
+      return true;
+    }
+  }
+
+  res = "";
+  return false;
+}
+
+// Writes the hex MD5 digest of the file at `filepath` into `res`.
+// Returns true on success. When the path is NULL or empty, or when
+// MD5::digestFile returns NULL, `res` is set to "" and false is
+// returned.
+inline bool md5File(const char* filepath, std::string& res) {
+  const bool hasPath = (filepath != NULL) && (strcmp(filepath, "") != 0);
+  if (hasPath) {
+    MD5 hasher;
+    const char* const hexDigest = hasher.digestFile(filepath);
+    if (hexDigest != NULL) {
+      res = hexDigest;
+      return true;
+    }
+  }
+  res = "";
+  return false;
+}
+}
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/MutexLock.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/MutexLock.hpp
new file mode 100644
index 0000000..ea10d6d
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/MutexLock.hpp
@@ -0,0 +1,51 @@
+#ifndef LIMONP_MUTEX_LOCK_HPP
+#define LIMONP_MUTEX_LOCK_HPP
+
+#include <pthread.h>
+#include "NonCopyable.hpp"
+#include "Logging.hpp"
+
+namespace limonp {
+
+class MutexLock: NonCopyable { // owns one pthread mutex; every pthread call is wrapped in XCHECK, which logs at FATAL level when the call returns non-zero
+ public:
+ MutexLock() {
+ XCHECK(!pthread_mutex_init(&mutex_, NULL)); // NULL attributes = default mutex
+ }
+ ~MutexLock() {
+ XCHECK(!pthread_mutex_destroy(&mutex_));
+ }
+ pthread_mutex_t* GetPthreadMutex() { // raw handle for callers that need the pthread_mutex_t itself
+ return &mutex_;
+ }
+
+ private:
+ void Lock() { // private: only the friend MutexLockGuard locks and unlocks
+ XCHECK(!pthread_mutex_lock(&mutex_));
+ }
+ void Unlock() {
+ XCHECK(!pthread_mutex_unlock(&mutex_));
+ }
+ friend class MutexLockGuard;
+
+ pthread_mutex_t mutex_;
+}; // class MutexLock
+
+class MutexLockGuard: NonCopyable { // scope guard: locks in the constructor, unlocks in the destructor
+ public:
+ explicit MutexLockGuard(MutexLock & mutex)
+ : mutex_(mutex) {
+ mutex_.Lock();
+ }
+ ~MutexLockGuard() {
+ mutex_.Unlock();
+ }
+ private:
+ MutexLock & mutex_; // referenced, not owned
+}; // class MutexLockGuard
+
+#define MutexLockGuard(x) XCHECK(false); // function-like macro: `MutexLockGuard(m);` (an unnamed temporary) expands to a failing XCHECK instead of a guard that would unlock at once; `MutexLockGuard g(m);` is unaffected
+
+} // namespace limonp
+
+#endif // LIMONP_MUTEX_LOCK_HPP
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/NonCopyable.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/NonCopyable.hpp
new file mode 100644
index 0000000..145400f
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/NonCopyable.hpp
@@ -0,0 +1,21 @@
+/************************************
+ ************************************/
+#ifndef LIMONP_NONCOPYABLE_H
+#define LIMONP_NONCOPYABLE_H
+
+namespace limonp {
+
+class NonCopyable { // base class that disables copying of derived classes
+ protected:
+ NonCopyable() { // protected: usable only as a base class
+ }
+ ~NonCopyable() { // protected and non-virtual
+ }
+ private:
+ NonCopyable(const NonCopyable& ); // declared private and never defined
+ const NonCopyable& operator=(const NonCopyable& ); // declared private and never defined
+}; // class NonCopyable
+
+} // namespace limonp
+
+#endif // LIMONP_NONCOPYABLE_H
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StdExtension.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StdExtension.hpp
new file mode 100644
index 0000000..cf00e94
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StdExtension.hpp
@@ -0,0 +1,157 @@
+#ifndef LIMONP_STD_EXTEMSION_HPP
+#define LIMONP_STD_EXTEMSION_HPP
+
+#include <map>
+
+#ifdef __APPLE__
+#include <unordered_map>
+#include <unordered_set>
+#elif(__cplusplus >= 201103L)
+#include <unordered_map>
+#include <unordered_set>
+#elif defined _MSC_VER
+#include <unordered_map>
+#include <unordered_set>
+#else
+#include <tr1/unordered_map>
+#include <tr1/unordered_set>
+namespace std {
+using std::tr1::unordered_map;
+using std::tr1::unordered_set;
+}
+
+#endif
+
+#include <set>
+#include <string>
+#include <vector>
+#include <deque>
+#include <fstream>
+#include <sstream>
+
+namespace std {
+
+// Prints a vector as [e0, e1, ...] using each element's own operator<<;
+// an empty vector prints "[]".
+template<typename T>
+ostream& operator << (ostream& os, const vector<T>& v) {
+  if(v.empty()) return os << "[]";
+  typename vector<T>::const_iterator elem = v.begin();
+  os << "[" << *elem;
+  for(++elem; elem != v.end(); ++elem) {
+    os << ", " << *elem;
+  }
+  return os << "]";
+}
+
+// Specialization for strings: every element is wrapped in double quotes,
+// e.g. ["a", "b"]; an empty vector still prints "[]".
+template<>
+inline ostream& operator << (ostream& os, const vector<string>& v) {
+  if(v.empty()) return os << "[]";
+  vector<string>::const_iterator word = v.begin();
+  os << "[\"" << *word;
+  for(++word; word != v.end(); ++word) {
+    os << "\", \"" << *word;
+  }
+  return os << "\"]";
+}
+
+// Prints a deque as ["e0", "e1"]; note that every element is quoted whatever T is
+// (unlike the generic vector overload). An empty deque prints "[]".
+template<typename T>
+ostream& operator << (ostream& os, const deque<T>& dq) {
+  if(dq.empty()) return os << "[]";
+  typename deque<T>::const_iterator item = dq.begin();
+  os << "[\"" << *item;
+  for(++item; item != dq.end(); ++item) {
+    os << "\", \"" << *item;
+  }
+  return os << "\"]";
+}
+
+
+// Prints a pair as "first:second".
+template<class T1, class T2>
+ostream& operator << (ostream& os, const pair<T1, T2>& pr) {
+  return os << pr.first << ":" << pr.second;
+}
+
+
+template<class T>
+string& operator << (string& str, const T& obj) { // replaces (does not append to) str with the streamed text of obj
+ stringstream ss;
+ ss << obj; // resolved against the ostream operator<< overloads available for T
+ return str = ss.str();
+}
+
+// Prints a map as {k1:v1, k2:v2}; each entry is written through the pair
+// overload of operator<<, and an empty map prints "{}".
+template<class T1, class T2>
+ostream& operator << (ostream& os, const map<T1, T2>& mp) {
+  typedef typename map<T1, T2>::const_iterator EntryIter;
+  if(mp.empty()) {
+    return os << "{}";
+  }
+  os << '{';
+  for(EntryIter entry = mp.begin(); entry != mp.end(); ++entry) {
+    if(entry != mp.begin()) {
+      os << ", ";
+    }
+    os << *entry;
+  }
+  return os << '}';
+}
+// Prints an unordered_map as {k1:v1, k2:v2} in the container's iteration order; empty prints "{}".
+template<class T1, class T2>
+ostream& operator << (ostream& os, const std::unordered_map<T1, T2>& mp) {
+  typedef typename std::unordered_map<T1, T2>::const_iterator EntryIter;
+  if(mp.empty()) return os << "{}";
+  os << '{';
+  for(EntryIter entry = mp.begin(); entry != mp.end(); ++entry) {
+    if(entry != mp.begin()) {
+      os << ", ";
+    }
+    os << *entry;
+  }
+  return os << '}';
+}
+
+// Prints a set as {e0, e1, ...} in the set's iteration order; an empty
+// set prints "{}".
+template<class T>
+ostream& operator << (ostream& os, const set<T>& st) {
+  typedef typename set<T>::const_iterator ElemIter;
+  if(st.empty()) {
+    return os << "{}";
+  }
+  os << '{';
+  for(ElemIter elem = st.begin(); elem != st.end(); ++elem) {
+    if(elem != st.begin()) {
+      os << ", ";
+    }
+    os << *elem;
+  }
+  return os << '}';
+}
+
+// True when `contain.find(key)` locates `key`, i.e. does not return contain.end().
+// Works with any container type that provides find() and end().
+template<class KeyType, class ContainType>
+bool IsIn(const ContainType& contain, const KeyType& key) { return contain.find(key) != contain.end(); }
+
+template<class T>
+basic_string<T> & operator << (basic_string<T> & s, ifstream & ifs) { // replaces s with everything still unread in ifs
+ return s.assign((istreambuf_iterator<T>(ifs)), istreambuf_iterator<T>()); // a default-constructed istreambuf_iterator marks end of stream
+}
+
+// Writes the characters of `s` to `ofs` through a stream-buffer iterator (no formatting applied).
+template<class T>
+ofstream & operator << (ofstream & ofs, const basic_string<T>& s) {
+  copy(s.begin(), s.end(), ostreambuf_iterator<T>(ofs));
+  return ofs;
+}
+
+} // namespace std
+
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StringUtil.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StringUtil.hpp
new file mode 100644
index 0000000..ad3be56
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StringUtil.hpp
@@ -0,0 +1,405 @@
+/************************************
+ * file enc : ascii
+ * author : wuyanyi09@gmail.com
+ ************************************/
+#ifndef LIMONP_STR_FUNCTS_H
+#define LIMONP_STR_FUNCTS_H
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <cctype>
+#include <map>
+#include <cassert>
+#include <ctime>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <memory.h>
+#include <functional>
+#include <locale>
+#include <sstream>
+#include <sys/types.h>
+#include <iterator>
+#include <algorithm>
+#include "StdExtension.hpp"
+
+namespace limonp {
+using namespace std;
+// printf-style formatting into a std::string. Starts with a 256-byte buffer and
+// grows it until vsnprintf reports that the whole result fit.
+inline string StringFormat(const char* fmt, ...) {
+  int size = 256;
+  std::string str;
+  va_list ap;
+  while (1) {
+    str.resize(size);
+    va_start(ap, fmt);
+    // Write through &str[0]: the buffer returned by c_str() is const and must not be modified.
+    int n = vsnprintf(&str[0], size, fmt, ap);
+    va_end(ap);
+    if (n > -1 && n < size) {
+      str.resize(n);
+      return str;
+    }
+    size = (n > -1) ? n + 1 : size * 2;  // n = needed length when known, otherwise just double
+  }
+  return str;  // not reached; the loop only exits through the return above
+}
+
+// Streams the elements of [begin, end) into `res`, separated by `connector`.
+// T is any iterator whose elements can be written with operator<<.
+// An empty range returns at once and leaves `res` as it was; otherwise
+// the previous content of `res` is replaced.
+template<class T>
+void Join(T begin, T end, string& res, const string& connector) {
+  if(begin == end) return;
+  stringstream joined;
+  joined << *begin;
+  for(begin++; begin != end; begin++) {
+    joined << connector << *begin;
+  }
+  res = joined.str();
+}
+
+template<class T>
+string Join(T begin, T end, const string& connector) { // value-returning form of the four-argument Join
+ string res; // stays empty when the range is empty
+ Join(begin ,end, res, connector);
+ return res;
+}
+
+inline string& Upper(string& str) {
+ transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper);
+ return str;
+}
+
+inline string& Lower(string& str) {
+ transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower);
+ return str;
+}
+
+// Whitespace test for any unsigned value: std::isspace is consulted only when c <= 0xff (the original note reports a crash for large arguments); larger values are never whitespace.
+inline bool IsSpace(unsigned c) {
+  return c <= 0xff && std::isspace(c & 0xff) != 0;
+}
+
+// Removes leading whitespace from `s` in place and returns `s`.
+// One lambda for every compiler replaces the std::not1/std::ptr_fun adaptors
+// (std::ptr_fun was removed in C++17) and the separate MSVC-only branch.
+// Where char is signed, bytes >= 0x80 are never classified as whitespace.
+inline std::string& LTrim(std::string &s) {
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](char ch) {
+    // A negative char converts to a value above 0xff, which IsSpace rejects.
+    return !IsSpace(static_cast<unsigned>(ch));
+  }));
+  return s;
+}
+
+// Removes trailing whitespace from `s` in place and returns `s`.
+// One lambda for every compiler replaces the std::not1/std::ptr_fun adaptors
+// (std::ptr_fun was removed in C++17) and the separate MSVC-only branch.
+// Where char is signed, bytes >= 0x80 are never classified as whitespace,
+// so the trailing bytes of a multi-byte character are left alone.
+inline std::string& RTrim(std::string &s) {
+  s.erase(std::find_if(s.rbegin(), s.rend(), [](char ch) {
+    // A negative char converts to a value above 0xff, which IsSpace rejects.
+    return !IsSpace(static_cast<unsigned>(ch));
+  }).base(), s.end());
+  return s;
+}
+
+inline std::string& Trim(std::string &s) { // strips whitespace from both ends in place (RTrim first, then LTrim) and returns s
+ return LTrim(RTrim(s));
+}
+
+// Removes every leading occurrence of the character `x` from `s` in place.
+// One implementation for all compilers: the former MSVC-only branch also
+// stripped whitespace, so results differed by platform, and the other branch
+// relied on std::not1/std::bind2nd (std::bind2nd was removed in C++17).
+inline std::string& LTrim(std::string& s, char x) {
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(),
+      [x](char ch) { return ch != x; }));
+  return s;
+}
+
+// Removes every trailing occurrence of the character `x` from `s` in place.
+// One implementation for all compilers: the former MSVC-only branch also
+// stripped whitespace, so results differed by platform, and the other branch
+// relied on std::not1/std::bind2nd (std::bind2nd was removed in C++17).
+inline std::string& RTrim(std::string& s, char x) {
+  s.erase(std::find_if(s.rbegin(), s.rend(),
+      [x](char ch) { return ch != x; }).base(), s.end());
+  return s;
+}
+
+inline std::string& Trim(std::string &s, char x) { // applies RTrim(s, x) and then LTrim(s, x) in place and returns s
+ return LTrim(RTrim(s, x), x);
+}
+
+// Splits `src` at every character contained in `pattern` (each character is
+// a delimiter on its own) and stores the pieces in `res`, which is cleared first.
+// Once `res` holds `maxsplit` pieces, the remainder is appended unsplit.
+// A delimiter at the very end of `src` does not produce a trailing empty
+// piece, and an empty `src` yields an empty `res`.
+inline void Split(const string& src, vector<string>& res, const string& pattern, size_t maxsplit = string::npos) {
+  res.clear();
+  for(size_t pos = 0; pos < src.size(); ) {
+    const size_t cut = src.find_first_of(pattern, pos);
+    const bool lastPiece = (cut == string::npos) || (res.size() >= maxsplit);
+    if(lastPiece) {
+      res.push_back(src.substr(pos));
+      return;
+    }
+    res.push_back(src.substr(pos, cut - pos));
+    pos = cut + 1;
+  }
+}
+
+inline vector<string> Split(const string& src, const string& pattern, size_t maxsplit = string::npos) { // value-returning form of the four-argument Split
+ vector<string> res;
+ Split(src, res, pattern, maxsplit);
+ return res;
+}
+
+// True when `str` begins with `prefix`; a prefix longer than `str` never
+// matches and an empty prefix always does.
+inline bool StartsWith(const string& str, const string& prefix) {
+  const size_t n = prefix.length();
+  return n <= str.length() && str.compare(0, n, prefix) == 0;
+}
+
+// True when `str` ends with `suffix`; a suffix longer than `str` never
+// matches and an empty suffix always does.
+inline bool EndsWith(const string& str, const string& suffix) {
+  const size_t n = suffix.length();
+  return n <= str.length() && str.compare(str.length() - n, n, suffix) == 0;
+}
+
+inline bool IsInStr(const string& str, char ch) { // true when ch occurs anywhere in str
+ return str.find(ch) != string::npos;
+}
+
+inline uint16_t TwocharToUint16(char high, char low) { // combines two bytes into one 16-bit value: high in bits 8-15, low in bits 0-7
+ return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff)); // the 0x00ff masks drop sign-extension bits of negative chars
+}
+
+template <class Uint16Container>
+bool Utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec) { // decodes 1- to 3-byte UTF-8 sequences into 16-bit values appended to vec (cleared first)
+ if(!str) {
+ return false;
+ }
+ char ch1, ch2; // high and low byte of the 16-bit value being assembled
+ uint16_t tmp;
+ vec.clear();
+ for(size_t i = 0; i < len;) {
+ if(!(str[i] & 0x80)) { // 0xxxxxxx
+ vec.push_back(str[i]);
+ i++;
+ } else if ((uint8_t)str[i] <= 0xdf && i + 1 < len) { // 110xxxxx; NOTE(review): bytes 0x80-0xbf also take this branch, lead bytes are not validated
+ ch1 = (str[i] >> 2) & 0x07;
+ ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
+ tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+ vec.push_back(tmp);
+ i += 2;
+ } else if((uint8_t)str[i] <= 0xef && i + 2 < len) { // 1110xxxx
+ ch1 = ((uint8_t)str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
+ ch2 = (((uint8_t)str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
+ tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+ vec.push_back(tmp);
+ i += 3;
+ } else { // 4-byte sequences and truncated input are rejected; vec keeps what was decoded so far
+ return false;
+ }
+ }
+ return true;
+}
+
+template <class Uint16Container>
+bool Utf8ToUnicode(const string& str, Uint16Container& vec) { // std::string convenience form of the pointer/length overload
+ return Utf8ToUnicode(str.c_str(), str.size(), vec);
+}
+
+template <class Uint32Container>
+bool Utf8ToUnicode32(const string& str, Uint32Container& vec) { // decodes 1- to 4-byte UTF-8 sequences into 32-bit code points appended to vec (cleared first)
+ uint32_t tmp;
+ vec.clear();
+ for(size_t i = 0; i < str.size();) {
+ if(!(str[i] & 0x80)) { // 0xxxxxxx
+ // 7bit, total 7bit
+ tmp = (uint8_t)(str[i]) & 0x7f;
+ i++;
+ } else if ((uint8_t)str[i] <= 0xdf && i + 1 < str.size()) { // 110xxxxx
+ // 5bit, total 5bit
+ tmp = (uint8_t)(str[i]) & 0x1f;
+
+ // 6bit, total 11bit
+ tmp <<= 6;
+ tmp |= (uint8_t)(str[i+1]) & 0x3f;
+ i += 2;
+ } else if((uint8_t)str[i] <= 0xef && i + 2 < str.size()) { // 1110xxxx
+ // 4bit, total 4bit
+ tmp = (uint8_t)(str[i]) & 0x0f;
+
+ // 6bit, total 10bit
+ tmp <<= 6;
+ tmp |= (uint8_t)(str[i+1]) & 0x3f;
+
+ // 6bit, total 16bit
+ tmp <<= 6;
+ tmp |= (uint8_t)(str[i+2]) & 0x3f;
+
+ i += 3;
+ } else if((uint8_t)str[i] <= 0xf7 && i + 3 < str.size()) { // 11110xxx
+ // 3bit, total 3bit
+ tmp = (uint8_t)(str[i]) & 0x07;
+
+ // 6bit, total 9bit
+ tmp <<= 6;
+ tmp |= (uint8_t)(str[i+1]) & 0x3f;
+
+ // 6bit, total 15bit
+ tmp <<= 6;
+ tmp |= (uint8_t)(str[i+2]) & 0x3f;
+
+ // 6bit, total 21bit
+ tmp <<= 6;
+ tmp |= (uint8_t)(str[i+3]) & 0x3f;
+
+ i += 4;
+ } else { // lead byte above 0xf7 or a truncated sequence; vec keeps what was decoded so far
+ return false;
+ }
+ vec.push_back(tmp);
+ }
+ return true;
+}
+
+// Encodes the code points in [begin, end) as UTF-8 into `res` (cleared first), using 1 to 4 bytes each.
+// The 4-byte lead byte carries bits 18-20, hence the 0x07 mask (0x03 dropped bit 20, breaking U+100000 and above).
+template <class Uint32ContainerConIter>
+void Unicode32ToUtf8(Uint32ContainerConIter begin, Uint32ContainerConIter end, string& res) {
+  res.clear();
+  for(; begin != end; begin ++) {
+    const uint32_t ui = *begin;
+    if(ui <= 0x7f) {
+      res += char(ui);
+    } else if(ui <= 0x7ff) {
+      res += char(((ui >> 6) & 0x1f) | 0xc0);
+      res += char((ui & 0x3f) | 0x80);
+    } else if(ui <= 0xffff) {
+      res += char(((ui >> 12) & 0x0f) | 0xe0);
+      res += char(((ui >> 6) & 0x3f) | 0x80);
+      res += char((ui & 0x3f) | 0x80);
+    } else {
+      res += char(((ui >> 18) & 0x07) | 0xf0);
+      res += char(((ui >> 12) & 0x3f) | 0x80);
+      res += char(((ui >> 6) & 0x3f) | 0x80);
+      res += char((ui & 0x3f) | 0x80);
+    }
+  }
+}
+
+// Encodes 16-bit values from [begin, end) as UTF-8 into `res` (cleared first):
+// <= 0x7f -> 1 byte, <= 0x7ff -> 2 bytes, everything else -> 3 bytes.
+template <class Uint16ContainerConIter>
+void UnicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
+  res.clear();
+  for(; begin != end; begin ++) {
+    const uint16_t unit = *begin;
+    if(unit > 0x7ff) {
+      res += char(((unit >> 12) & 0x0f )| 0xe0);
+      res += char(((unit>>6) & 0x3f )| 0x80 );
+      res += char((unit & 0x3f) | 0x80);
+    } else if(unit > 0x7f) {
+      res += char(((unit>>6) & 0x1f) | 0xc0);
+      res += char((unit & 0x3f) | 0x80);
+    } else {
+      res += char(unit);
+    }
+  }
+}
+
+
+template <class Uint16Container>
+bool GBKTrans(const char* const str, size_t len, Uint16Container& vec) { // packs a byte string into 16-bit units: one unit per byte without the high bit, one unit per pair otherwise
+ vec.clear();
+ if(!str) {
+ return true; // a NULL input yields an empty vec and is reported as success
+ }
+ size_t i = 0;
+ while(i < len) {
+ if(0 == (str[i] & 0x80)) { // high bit clear: single byte
+ vec.push_back(uint16_t(str[i]));
+ i++;
+ } else {
+ if(i + 1 < len) { //&& (str[i+1] & 0x80))
+ uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff)); // first byte in bits 8-15, second in bits 0-7
+ vec.push_back(tmp);
+ i += 2;
+ } else { // a lead byte with no following byte
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+template <class Uint16Container>
+bool GBKTrans(const string& str, Uint16Container& vec) { // std::string convenience form of the pointer/length overload
+ return GBKTrans(str.c_str(), str.size(), vec);
+}
+
+template <class Uint16ContainerConIter>
+void GBKTrans(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) { // reverse direction: unpacks 16-bit units back into a byte string in res (cleared first)
+ res.clear();
+ // first/second hold the high and low byte of the current unit
+ char first, second;
+ while(begin != end) {
+ // split the unit into its two bytes
+ first = ((*begin)>>8) & 0x00ff;
+ second = (*begin) & 0x00ff;
+ if(first & 0x80) { // high byte has its top bit set: emit both bytes
+ res += first;
+ res += second;
+ } else { // otherwise only the low byte is emitted
+ res += second;
+ }
+ begin++;
+ }
+}
+
+/*
+ * Formats the current local time with strftime and stores it in timeStr.
+ * format example: "%Y-%m-%d %H:%M:%S"
+ * At most 63 characters are produced; if the text does not fit, strftime returns 0 and timeStr ends up empty.
+ */
+inline void GetTime(const string& format, string& timeStr) {
+  time_t timeNow;
+  time(&timeNow);
+  struct tm tmNow;
+
+  #if defined(_WIN32) || defined(_WIN64)
+  errno_t e = localtime_s(&tmNow, &timeNow);
+  assert(e == 0);  // was `e = 0`: an assignment, so the assertion could never pass
+  #else
+  struct tm * tm_tmp = localtime_r(&timeNow, &tmNow);
+  assert(tm_tmp != nullptr);
+  #endif
+
+  timeStr.resize(64);
+  // Write through &timeStr[0]: the buffer returned by c_str() is const and must not be modified.
+  size_t len = strftime(&timeStr[0], timeStr.size(), format.c_str(), &tmNow);
+  timeStr.resize(len);
+}
+
+// Concatenates two path components, inserting "/" between them unless
+// path1 already ends with "/". path2 is appended unchanged.
+inline string PathJoin(const string& path1, const string& path2) {
+  const char* separator = EndsWith(path1, "/") ? "" : "/";
+  return path1 + separator + path2;
+}
+
+}
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Thread.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Thread.hpp
new file mode 100644
index 0000000..4e3c084
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Thread.hpp
@@ -0,0 +1,44 @@
+#ifndef LIMONP_THREAD_HPP
+#define LIMONP_THREAD_HPP
+
+#include "Logging.hpp"
+#include "NonCopyable.hpp"
+
+namespace limonp {
+
+class IThread: NonCopyable { // base class for a pthread-backed thread; subclasses implement Run(). NOTE(review): pthread_t is used but this header includes only Logging.hpp and NonCopyable.hpp; confirm <pthread.h> arrives through another include
+ public:
+ IThread(): isStarted(false), isJoined(false) { // thread_ stays uninitialized until Start()
+ }
+ virtual ~IThread() {
+ if(isStarted && !isJoined) { // started but never joined: detach the thread
+ XCHECK(!pthread_detach(thread_));
+ }
+ };
+
+ virtual void Run() = 0; // body executed on the new thread
+ void Start() { // creates the thread; a second call fails the XCHECK below
+ XCHECK(!isStarted);
+ XCHECK(!pthread_create(&thread_, NULL, Worker, this));
+ isStarted = true;
+ }
+ void Join() { // NOTE(review): isStarted is not checked here, so Join() before Start() passes an uninitialized thread_ to pthread_join
+ XCHECK(!isJoined);
+ XCHECK(!pthread_join(thread_, NULL));
+ isJoined = true;
+ }
+ private:
+ static void * Worker(void * data) { // pthread entry point: data is the IThread passed to pthread_create
+ IThread * ptr = (IThread* ) data;
+ ptr->Run();
+ return NULL;
+ }
+
+ pthread_t thread_;
+ bool isStarted;
+ bool isJoined;
+}; // class IThread
+
+} // namespace limonp
+
+#endif // LIMONP_THREAD_HPP
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ThreadPool.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ThreadPool.hpp
new file mode 100644
index 0000000..fb0ee57
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ThreadPool.hpp
@@ -0,0 +1,86 @@
+#ifndef LIMONP_THREAD_POOL_HPP
+#define LIMONP_THREAD_POOL_HPP
+
+#include "Thread.hpp"
+#include "BlockingQueue.hpp"
+#include "BoundedBlockingQueue.hpp"
+#include "Closure.hpp"
+
+namespace limonp {
+
+using namespace std;
+
+// Fixed-size pool of worker threads that run ClosureInterface tasks taken from a bounded blocking queue.
+class ThreadPool: NonCopyable {
+ public:
+  // Worker thread: runs queued closures (logging any exception) until it pops a NULL sentinel.
+  class Worker: public IThread {
+   public:
+    Worker(ThreadPool* pool): ptThreadPool_(pool) {
+      assert(ptThreadPool_);
+    }
+    virtual ~Worker() {}
+    virtual void Run() {
+      while (true) {
+        ClosureInterface* closure = ptThreadPool_->queue_.Pop();
+        if (closure == NULL) {
+          break;
+        }
+        try {
+          closure->Run();
+        } catch(std::exception& e) {
+          XLOG(ERROR) << e.what();
+        } catch(...) {
+          XLOG(ERROR) << " unknown exception.";
+        }
+        delete closure;
+      }
+    }
+   private:
+    ThreadPool * ptThreadPool_;
+  }; // class Worker
+  ThreadPool(size_t thread_num)
+    : threads_(thread_num),
+      queue_(thread_num),
+      started_(false) {
+    assert(thread_num);
+    for(size_t i = 0; i < threads_.size(); i ++) {
+      threads_[i] = new Worker(this);
+    }
+  }
+  ~ThreadPool() {
+    Stop();
+  }
+  void Start() {
+    for(size_t i = 0; i < threads_.size(); i++) {
+      threads_[i]->Start();
+    }
+    started_ = true;
+  }
+  // Stops and destroys the workers. Sentinels are pushed and threads joined only if
+  // Start() ran: joining a thread that was never created (uninitialized pthread_t) is undefined.
+  void Stop() {
+    for(size_t i = 0; started_ && i < threads_.size(); i ++) {
+      queue_.Push(NULL);
+    }
+    for(size_t i = 0; i < threads_.size(); i ++) {
+      if (started_) threads_[i]->Join();
+      delete threads_[i];
+    }
+    threads_.clear();
+    started_ = false;
+  }
+  void Add(ClosureInterface* task) {
+    assert(task);
+    queue_.Push(task);
+  }
+ private:
+  friend class Worker;
+  vector<IThread*> threads_;
+  BoundedBlockingQueue<ClosureInterface*> queue_;
+  bool started_;  // true between Start() and Stop()
+}; // class ThreadPool
+
+} // namespace limonp
+
+#endif // LIMONP_THREAD_POOL_HPP
diff --git a/funasr/runtime/websocket/CMakeLists.txt b/funasr/runtime/websocket/CMakeLists.txt
index 7291172..56c4255 100644
--- a/funasr/runtime/websocket/CMakeLists.txt
+++ b/funasr/runtime/websocket/CMakeLists.txt
@@ -111,6 +111,8 @@
include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/include/)
include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/yaml-cpp/include/)
include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/kaldi-native-fbank)
+include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/jieba/include)
+include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/jieba/include/limonp/include) # NOTE(review): this patch adds the limonp headers directly under jieba/include/limonp/ - confirm that a limonp/include/ directory actually exists
add_subdirectory(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/yaml-cpp yaml-cpp)
add_subdirectory(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/kaldi-native-fbank/kaldi-native-fbank/csrc csrc)
--
Gitblit v1.9.1