c++ runtime adapt to 1.0 (#1724)
* adapt vad runtime to 1.0
* add json
* change yml name
* add func LoadVocabFromJson
* add token file for InitAsr
* add token path for OfflineStream
* add func OpenYaml
* add token file for InitPunc
* add token file for stream
* update punc-model
* update funasr-wss-server
| | |
| | | message("Little endian system") |
| | | endif() |
| | | |
| | | # json |
| | | include(FetchContent) |
| | | if(NOT EXISTS ${PROJECT_SOURCE_DIR}/third_party/json/ChangeLog.md ) |
| | | FetchContent_Declare(json |
| | | URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.2.tar.gz |
| | | SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/json |
| | | ) |
| | | |
| | | FetchContent_MakeAvailable(json) |
| | | endif() |
| | | |
| | | # for onnxruntime |
| | | IF(WIN32) |
| | | file(REMOVE ${PROJECT_SOURCE_DIR}/third_party/glog/src/config.h |
| | |
| | | include_directories(${PROJECT_SOURCE_DIR}/third_party/jieba/include) |
| | | include_directories(${PROJECT_SOURCE_DIR}/third_party/jieba/include/limonp/include) |
| | | include_directories(${PROJECT_SOURCE_DIR}/third_party/kaldi) |
| | | include_directories(${PROJECT_SOURCE_DIR}/third_party/json/include) |
| | | |
| | | if(ENABLE_GLOG) |
| | | include_directories(${PROJECT_SOURCE_DIR}/third_party/glog/src) |
| | |
| | | // hotword embedding compile model |
| | | #define MODEL_EB_NAME "model_eb.onnx" |
| | | #define QUANT_MODEL_NAME "model_quant.onnx" |
| | | #define VAD_CMVN_NAME "vad.mvn" |
| | | #define VAD_CONFIG_NAME "vad.yaml" |
| | | #define VAD_CMVN_NAME "am.mvn" |
| | | #define VAD_CONFIG_NAME "config.yaml" |
| | | #define AM_CMVN_NAME "am.mvn" |
| | | #define AM_CONFIG_NAME "config.yaml" |
| | | #define LM_CONFIG_NAME "config.yaml" |
| | | #define PUNC_CONFIG_NAME "punc.yaml" |
| | | #define PUNC_CONFIG_NAME "config.yaml" |
| | | #define MODEL_SEG_DICT "seg_dict" |
| | | #define TOKEN_PATH "tokens.json" |
| | | #define HOTWORD "hotword" |
| | | // #define NN_HOTWORD "nn-hotword" |
| | | |
| | |
| | | virtual void StartUtterance() = 0; |
| | | virtual void EndUtterance() = 0; |
| | | virtual void Reset() = 0; |
| | | virtual void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, int thread_num){}; |
| | | virtual void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, int thread_num){}; |
| | | virtual void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, int thread_num){}; |
| | | virtual void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){}; |
| | | virtual void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){}; |
| | | virtual void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){}; |
| | | virtual void InitLm(const std::string &lm_file, const std::string &lm_config, const std::string &lex_file){}; |
| | | virtual void InitFstDecoder(){}; |
| | | virtual std::string Forward(float *din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb={{0.0}}, void* wfst_decoder=nullptr){return "";}; |
| | |
| | | class PuncModel { |
| | | public: |
| | | virtual ~PuncModel(){}; |
| | | virtual void InitPunc(const std::string &punc_model, const std::string &punc_config, int thread_num)=0; |
| | | virtual void InitPunc(const std::string &punc_model, const std::string &punc_config, const std::string &token_file, int thread_num)=0; |
| | | virtual std::string AddPunc(const char* sz_input, std::string language="zh-cn"){return "";}; |
| | | virtual std::string AddPunc(const char* sz_input, std::vector<std::string>& arr_cache, std::string language="zh-cn"){return "";}; |
| | | }; |
| | |
| | | { |
| | | } |
| | | |
| | | void CTTransformerOnline::InitPunc(const std::string &punc_model, const std::string &punc_config, int thread_num){ |
| | | void CTTransformerOnline::InitPunc(const std::string &punc_model, const std::string &punc_config, const std::string &token_file, int thread_num){ |
| | | session_options.SetIntraOpNumThreads(thread_num); |
| | | session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL); |
| | | session_options.DisableCpuMemArena(); |
| | |
| | | for (auto& item : m_strOutputNames) |
| | | m_szOutputNames.push_back(item.c_str()); |
| | | |
| | | m_tokenizer.OpenYaml(punc_config.c_str()); |
| | | m_tokenizer.OpenYaml(punc_config.c_str(), token_file.c_str()); |
| | | } |
| | | |
| | | CTTransformerOnline::~CTTransformerOnline() |
| | |
| | | public: |
| | | |
| | | CTTransformerOnline(); |
| | | void InitPunc(const std::string &punc_model, const std::string &punc_config, int thread_num); |
| | | void InitPunc(const std::string &punc_model, const std::string &punc_config, const std::string &token_file, int thread_num); |
| | | ~CTTransformerOnline(); |
| | | vector<int> Infer(vector<int32_t> input_data, int nCacheSize); |
| | | string AddPunc(const char* sz_input, vector<string> &arr_cache, std::string language="zh-cn"); |
| | |
| | | { |
| | | } |
| | | |
| | | void CTTransformer::InitPunc(const std::string &punc_model, const std::string &punc_config, int thread_num){ |
| | | void CTTransformer::InitPunc(const std::string &punc_model, const std::string &punc_config, const std::string &token_file, int thread_num){ |
| | | session_options.SetIntraOpNumThreads(thread_num); |
| | | session_options.SetGraphOptimizationLevel(ORT_ENABLE_ALL); |
| | | session_options.DisableCpuMemArena(); |
| | |
| | | for (auto& item : m_strOutputNames) |
| | | m_szOutputNames.push_back(item.c_str()); |
| | | |
| | | m_tokenizer.OpenYaml(punc_config.c_str()); |
| | | m_tokenizer.OpenYaml(punc_config.c_str(), token_file.c_str()); |
| | | m_tokenizer.JiebaInit(punc_config); |
| | | } |
| | | |
| | |
| | | public: |
| | | |
| | | CTTransformer(); |
| | | void InitPunc(const std::string &punc_model, const std::string &punc_config, int thread_num); |
| | | void InitPunc(const std::string &punc_model, const std::string &punc_config, const std::string &token_file, int thread_num); |
| | | ~CTTransformer(); |
| | | vector<int> Infer(vector<int32_t> input_data); |
| | | string AddPunc(const char* sz_input, std::string language="zh-cn"); |
| | |
| | | |
| | | try{ |
| | | YAML::Node frontend_conf = config["frontend_conf"]; |
| | | YAML::Node post_conf = config["vad_post_conf"]; |
| | | YAML::Node post_conf = config["model_conf"]; |
| | | |
| | | this->vad_sample_rate_ = frontend_conf["fs"].as<int>(); |
| | | this->vad_silence_duration_ = post_conf["max_end_silence_time"].as<int>(); |
| | |
| | | string am_model_path; |
| | | string am_cmvn_path; |
| | | string am_config_path; |
| | | string token_path; |
| | | |
| | | am_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_NAME); |
| | | if(model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true"){ |
| | |
| | | } |
| | | am_cmvn_path = PathAppend(model_path.at(MODEL_DIR), AM_CMVN_NAME); |
| | | am_config_path = PathAppend(model_path.at(MODEL_DIR), AM_CONFIG_NAME); |
| | | token_path = PathAppend(model_path.at(MODEL_DIR), TOKEN_PATH); |
| | | |
| | | Model *mm; |
| | | mm = new Paraformer(); |
| | | mm->InitAsr(am_model_path, am_cmvn_path, am_config_path, thread_num); |
| | | mm->InitAsr(am_model_path, am_cmvn_path, am_config_path, token_path, thread_num); |
| | | return mm; |
| | | }else if(type == ASR_ONLINE){ |
| | | // online |
| | |
| | | string de_model_path; |
| | | string am_cmvn_path; |
| | | string am_config_path; |
| | | string token_path; |
| | | |
| | | en_model_path = PathAppend(model_path.at(MODEL_DIR), ENCODER_NAME); |
| | | de_model_path = PathAppend(model_path.at(MODEL_DIR), DECODER_NAME); |
| | |
| | | } |
| | | am_cmvn_path = PathAppend(model_path.at(MODEL_DIR), AM_CMVN_NAME); |
| | | am_config_path = PathAppend(model_path.at(MODEL_DIR), AM_CONFIG_NAME); |
| | | token_path = PathAppend(model_path.at(MODEL_DIR), TOKEN_PATH); |
| | | |
| | | Model *mm; |
| | | mm = new Paraformer(); |
| | | mm->InitAsr(en_model_path, de_model_path, am_cmvn_path, am_config_path, thread_num); |
| | | mm->InitAsr(en_model_path, de_model_path, am_cmvn_path, am_config_path, token_path, thread_num); |
| | | return mm; |
| | | }else{ |
| | | LOG(ERROR)<<"Wrong ASR_TYPE : " << type; |
| | |
| | | string am_model_path; |
| | | string am_cmvn_path; |
| | | string am_config_path; |
| | | string token_path; |
| | | string hw_compile_model_path; |
| | | string seg_dict_path; |
| | | |
| | |
| | | } |
| | | am_cmvn_path = PathAppend(model_path.at(MODEL_DIR), AM_CMVN_NAME); |
| | | am_config_path = PathAppend(model_path.at(MODEL_DIR), AM_CONFIG_NAME); |
| | | token_path = PathAppend(model_path.at(MODEL_DIR), TOKEN_PATH); |
| | | |
| | | asr_handle->InitAsr(am_model_path, am_cmvn_path, am_config_path, thread_num); |
| | | asr_handle->InitAsr(am_model_path, am_cmvn_path, am_config_path, token_path, thread_num); |
| | | } |
| | | |
| | | // Lm resource |
| | |
| | | if(model_path.find(PUNC_DIR) != model_path.end()){ |
| | | string punc_model_path; |
| | | string punc_config_path; |
| | | string token_path; |
| | | |
| | | punc_model_path = PathAppend(model_path.at(PUNC_DIR), MODEL_NAME); |
| | | if(model_path.find(PUNC_QUANT) != model_path.end() && model_path.at(PUNC_QUANT) == "true"){ |
| | | punc_model_path = PathAppend(model_path.at(PUNC_DIR), QUANT_MODEL_NAME); |
| | | } |
| | | punc_config_path = PathAppend(model_path.at(PUNC_DIR), PUNC_CONFIG_NAME); |
| | | token_path = PathAppend(model_path.at(PUNC_DIR), TOKEN_PATH); |
| | | |
| | | if (access(punc_model_path.c_str(), F_OK) != 0 || |
| | | access(punc_config_path.c_str(), F_OK) != 0 ) |
| | | access(punc_config_path.c_str(), F_OK) != 0 || |
| | | access(token_path.c_str(), F_OK) != 0) |
| | | { |
| | | LOG(INFO) << "PUNC model file is not exist, skip load punc model."; |
| | | }else{ |
| | | punc_handle = make_unique<CTTransformer>(); |
| | | punc_handle->InitPunc(punc_model_path, punc_config_path, thread_num); |
| | | punc_handle->InitPunc(punc_model_path, punc_config_path, token_path, thread_num); |
| | | use_punc = true; |
| | | } |
| | | } |
| | |
| | | } |
| | | |
| | | // offline |
| | | void Paraformer::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, int thread_num){ |
| | | void Paraformer::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){ |
| | | LoadConfigFromYaml(am_config.c_str()); |
| | | // knf options |
| | | fbank_opts_.frame_opts.dither = 0; |
| | |
| | | m_szInputNames.push_back(item.c_str()); |
| | | for (auto& item : m_strOutputNames) |
| | | m_szOutputNames.push_back(item.c_str()); |
| | | vocab = new Vocab(am_config.c_str()); |
| | | phone_set_ = new PhoneSet(am_config.c_str()); |
| | | vocab = new Vocab(token_file.c_str()); |
| | | phone_set_ = new PhoneSet(token_file.c_str()); |
| | | LoadCmvn(am_cmvn.c_str()); |
| | | } |
| | | |
| | | // online |
| | | void Paraformer::InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, int thread_num){ |
| | | void Paraformer::InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){ |
| | | |
| | | LoadOnlineConfigFromYaml(am_config.c_str()); |
| | | // knf options |
| | |
| | | for (auto& item : de_strOutputNames) |
| | | de_szOutputNames_.push_back(item.c_str()); |
| | | |
| | | vocab = new Vocab(am_config.c_str()); |
| | | phone_set_ = new PhoneSet(am_config.c_str()); |
| | | vocab = new Vocab(token_file.c_str()); |
| | | phone_set_ = new PhoneSet(token_file.c_str()); |
| | | LoadCmvn(am_cmvn.c_str()); |
| | | } |
| | | |
| | | // 2pass |
| | | void Paraformer::InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, int thread_num){ |
| | | void Paraformer::InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){ |
| | | // online |
| | | InitAsr(en_model, de_model, am_cmvn, am_config, thread_num); |
| | | InitAsr(en_model, de_model, am_cmvn, am_config, token_file, thread_num); |
| | | |
| | | // offline |
| | | try { |
| | |
| | | public: |
| | | Paraformer(); |
| | | ~Paraformer(); |
| | | void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, int thread_num); |
| | | void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num); |
| | | // online |
| | | void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, int thread_num); |
| | | void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num); |
| | | // 2pass |
| | | void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, int thread_num); |
| | | void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num); |
| | | void InitHwCompiler(const std::string &hw_model, int thread_num); |
| | | void InitSegDict(const std::string &seg_dict_model); |
| | | std::vector<std::vector<float>> CompileHotwordEmbedding(std::string &hotwords); |
| | |
| | | namespace funasr { |
| | | PhoneSet::PhoneSet(const char *filename) { |
| | | ifstream in(filename); |
| | | LoadPhoneSetFromYaml(filename); |
| | | LoadPhoneSetFromJson(filename); |
| | | } |
| | | PhoneSet::~PhoneSet() |
| | | { |
| | |
| | | } |
| | | } |
| | | |
| | | void PhoneSet::LoadPhoneSetFromJson(const char* filename) { |
| | | nlohmann::json json_array; |
| | | std::ifstream file(filename); |
| | | if (file.is_open()) { |
| | | file >> json_array; |
| | | file.close(); |
| | | } else { |
| | | LOG(INFO) << "Error loading token file, token file error or not exist."; |
| | | exit(-1); |
| | | } |
| | | |
| | | int id = 0; |
| | | for (const auto& element : json_array) { |
| | | phone_.push_back(element); |
| | | phn2Id_.emplace(element, id); |
| | | id++; |
| | | } |
| | | } |
| | | |
| | | int PhoneSet::Size() const { |
| | | return phone_.size(); |
| | | } |
| | |
| | | #include <string> |
| | | #include <vector> |
| | | #include <unordered_map> |
| | | #include "nlohmann/json.hpp" |
| | | #define UNIT_BEG_SIL_SYMBOL "<s>" |
| | | #define UNIT_END_SIL_SYMBOL "</s>" |
| | | #define UNIT_BLK_SYMBOL "<blank>" |
| | |
| | | vector<string> phone_; |
| | | unordered_map<string, int> phn2Id_; |
| | | void LoadPhoneSetFromYaml(const char* filename); |
| | | void LoadPhoneSetFromJson(const char* filename); |
| | | }; |
| | | |
| | | } // namespace funasr |
| | |
| | | } |
| | | string punc_model_path; |
| | | string punc_config_path; |
| | | string token_file; |
| | | |
| | | punc_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_NAME); |
| | | if(model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true"){ |
| | | punc_model_path = PathAppend(model_path.at(MODEL_DIR), QUANT_MODEL_NAME); |
| | | } |
| | | punc_config_path = PathAppend(model_path.at(MODEL_DIR), PUNC_CONFIG_NAME); |
| | | token_file = PathAppend(model_path.at(MODEL_DIR), TOKEN_PATH); |
| | | |
| | | mm->InitPunc(punc_model_path, punc_config_path, thread_num); |
| | | mm->InitPunc(punc_model_path, punc_config_path, token_file, thread_num); |
| | | return mm; |
| | | } |
| | | |
| | |
| | | return m_ready; |
| | | } |
| | | |
| | | bool CTokenizer::OpenYaml(const char* sz_yamlfile, const char* token_file) |
| | | { |
| | | YAML::Node m_Config; |
| | | try{ |
| | | m_Config = YAML::LoadFile(sz_yamlfile); |
| | | }catch(exception const &e){ |
| | | LOG(INFO) << "Error loading file, yaml file error or not exist."; |
| | | exit(-1); |
| | | } |
| | | |
| | | try |
| | | { |
| | | YAML::Node conf_seg_jieba = m_Config["seg_jieba"]; |
| | | if (conf_seg_jieba.IsDefined()){ |
| | | seg_jieba = conf_seg_jieba.as<bool>(); |
| | | } |
| | | |
| | | auto Puncs = m_Config["model_conf"]["punc_list"]; |
| | | if (Puncs.IsSequence()) |
| | | { |
| | | for (size_t i = 0; i < Puncs.size(); ++i) |
| | | { |
| | | if (Puncs[i].IsScalar()) |
| | | { |
| | | m_id2punc.push_back(Puncs[i].as<string>()); |
| | | m_punc2id.insert(make_pair<string, int>(Puncs[i].as<string>(), i)); |
| | | } |
| | | } |
| | | } |
| | | |
| | | nlohmann::json json_array; |
| | | std::ifstream file(token_file); |
| | | if (file.is_open()) { |
| | | file >> json_array; |
| | | file.close(); |
| | | } else { |
| | | LOG(INFO) << "Error loading token file, token file error or not exist."; |
| | | return false; |
| | | } |
| | | |
| | | int i = 0; |
| | | for (const auto& element : json_array) { |
| | | m_id2token.push_back(element); |
| | | m_token2id[element] = i; |
| | | i++; |
| | | } |
| | | } |
| | | catch (YAML::BadFile& e) { |
| | | LOG(ERROR) << "Read error!"; |
| | | return false; |
| | | } |
| | | m_ready = true; |
| | | return m_ready; |
| | | } |
| | | |
| | | vector<string> CTokenizer::Id2String(vector<int> input) |
| | | { |
| | | vector<string> result; |
| | |
| | | #include "cppjieba/DictTrie.hpp" |
| | | #include "cppjieba/HMMModel.hpp" |
| | | #include "cppjieba/Jieba.hpp" |
| | | #include "nlohmann/json.hpp" |
| | | |
| | | namespace funasr { |
| | | class CTokenizer { |
| | |
| | | CTokenizer(); |
| | | ~CTokenizer(); |
| | | bool OpenYaml(const char* sz_yamlfile); |
| | | bool OpenYaml(const char* sz_yamlfile, const char* token_file); |
| | | void ReadYaml(const YAML::Node& node); |
| | | vector<string> Id2String(vector<int> input); |
| | | vector<int> String2Ids(vector<string> input); |
| | |
| | | string de_model_path; |
| | | string am_cmvn_path; |
| | | string am_config_path; |
| | | string token_path; |
| | | string hw_compile_model_path; |
| | | string seg_dict_path; |
| | | |
| | |
| | | } |
| | | am_cmvn_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), AM_CMVN_NAME); |
| | | am_config_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), AM_CONFIG_NAME); |
| | | token_path = PathAppend(model_path.at(MODEL_DIR), TOKEN_PATH); |
| | | |
| | | asr_handle->InitAsr(am_model_path, en_model_path, de_model_path, am_cmvn_path, am_config_path, thread_num); |
| | | asr_handle->InitAsr(am_model_path, en_model_path, de_model_path, am_cmvn_path, am_config_path, token_path, thread_num); |
| | | }else{ |
| | | LOG(ERROR) <<"Can not find offline-model-dir or online-model-dir"; |
| | | exit(-1); |
| | |
| | | if(model_path.find(PUNC_DIR) != model_path.end()){ |
| | | string punc_model_path; |
| | | string punc_config_path; |
| | | string token_path; |
| | | |
| | | punc_model_path = PathAppend(model_path.at(PUNC_DIR), MODEL_NAME); |
| | | if(model_path.find(PUNC_QUANT) != model_path.end() && model_path.at(PUNC_QUANT) == "true"){ |
| | | punc_model_path = PathAppend(model_path.at(PUNC_DIR), QUANT_MODEL_NAME); |
| | | } |
| | | punc_config_path = PathAppend(model_path.at(PUNC_DIR), PUNC_CONFIG_NAME); |
| | | token_path = PathAppend(model_path.at(PUNC_DIR), TOKEN_PATH); |
| | | |
| | | if (access(punc_model_path.c_str(), F_OK) != 0 || |
| | | access(punc_config_path.c_str(), F_OK) != 0 ) |
| | | access(punc_config_path.c_str(), F_OK) != 0 || |
| | | access(token_path.c_str(), F_OK) != 0) |
| | | { |
| | | LOG(INFO) << "PUNC model file is not exist, skip load punc model."; |
| | | }else{ |
| | | punc_online_handle = make_unique<CTTransformerOnline>(); |
| | | punc_online_handle->InitPunc(punc_model_path, punc_config_path, thread_num); |
| | | punc_online_handle->InitPunc(punc_model_path, punc_config_path, token_path, thread_num); |
| | | use_punc = true; |
| | | } |
| | | } |
| | |
| | | Vocab::Vocab(const char *filename) |
| | | { |
| | | ifstream in(filename); |
| | | LoadVocabFromYaml(filename); |
| | | LoadVocabFromJson(filename); |
| | | } |
| | | Vocab::Vocab(const char *filename, const char *lex_file) |
| | | { |
| | |
| | | } |
| | | } |
| | | |
| | | void Vocab::LoadVocabFromJson(const char* filename){ |
| | | nlohmann::json json_array; |
| | | std::ifstream file(filename); |
| | | if (file.is_open()) { |
| | | file >> json_array; |
| | | file.close(); |
| | | } else { |
| | | LOG(INFO) << "Error loading token file, token file error or not exist."; |
| | | exit(-1); |
| | | } |
| | | |
| | | int i = 0; |
| | | for (const auto& element : json_array) { |
| | | vocab.push_back(element); |
| | | token_id[element] = i; |
| | | i++; |
| | | } |
| | | } |
| | | |
| | | void Vocab::LoadLex(const char* filename){ |
| | | std::ifstream file(filename); |
| | | std::string line; |
| | |
| | | #include <string> |
| | | #include <vector> |
| | | #include <map> |
| | | #include "nlohmann/json.hpp" |
| | | using namespace std; |
| | | |
| | | namespace funasr { |
| | |
| | | std::map<string, string> lex_map; |
| | | bool IsEnglish(string ch); |
| | | void LoadVocabFromYaml(const char* filename); |
| | | void LoadVocabFromJson(const char* filename); |
| | | void LoadLex(const char* filename); |
| | | |
| | | public: |
| | |
| | | |
| | | TCLAP::ValueArg<std::string> offline_model_revision( |
| | | "", "offline-model-revision", "ASR offline model revision", false, |
| | | "v1.2.1", "string"); |
| | | "v2.0.4", "string"); |
| | | |
| | | TCLAP::ValueArg<std::string> online_model_revision( |
| | | "", "online-model-revision", "ASR online model revision", false, |
| | | "v1.0.6", "string"); |
| | | "v2.0.4", "string"); |
| | | |
| | | TCLAP::ValueArg<std::string> quantize( |
| | | "", QUANTIZE, |
| | |
| | | "model_quant.onnx, vad.yaml, vad.mvn", |
| | | false, "damo/speech_fsmn_vad_zh-cn-16k-common-onnx", "string"); |
| | | TCLAP::ValueArg<std::string> vad_revision( |
| | | "", "vad-revision", "VAD model revision", false, "v1.2.0", "string"); |
| | | "", "vad-revision", "VAD model revision", false, "v2.0.4", "string"); |
| | | TCLAP::ValueArg<std::string> vad_quant( |
| | | "", VAD_QUANT, |
| | | "true (Default), load the model of model_quant.onnx in vad_dir. If set " |
| | |
| | | "model_quant.onnx, punc.yaml", |
| | | false, "damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727-onnx", "string"); |
| | | TCLAP::ValueArg<std::string> punc_revision( |
| | | "", "punc-revision", "PUNC model revision", false, "v1.0.2", "string"); |
| | | "", "punc-revision", "PUNC model revision", false, "v2.0.4", "string"); |
| | | TCLAP::ValueArg<std::string> punc_quant( |
| | | "", PUNC_QUANT, |
| | | "true (Default), load the model of model_quant.onnx in punc_dir. If " |
| | |
| | | |
| | | size_t found = s_offline_asr_path.find("speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["offline-model-revision"]="v1.2.4"; |
| | | model_path["offline-model-revision"]="v2.0.4"; |
| | | } |
| | | |
| | | found = s_offline_asr_path.find("speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["offline-model-revision"]="v1.0.5"; |
| | | model_path["offline-model-revision"]="v2.0.5"; |
| | | } |
| | | |
| | | found = s_offline_asr_path.find("speech_paraformer-large_asr_nat-en-16k-common-vocab10020"); |
| | | if (found != std::string::npos) { |
| | | model_path["model-revision"]="v1.0.0"; |
| | | model_path["model-revision"]="v2.0.4"; |
| | | s_itn_path=""; |
| | | s_lm_path=""; |
| | | } |
| | |
| | | TCLAP::ValueArg<std::string> model_revision( |
| | | "", "model-revision", |
| | | "ASR model revision", |
| | | false, "v1.2.1", "string"); |
| | | false, "v2.0.4", "string"); |
| | | TCLAP::ValueArg<std::string> quantize( |
| | | "", QUANTIZE, |
| | | "true (Default), load the model of model_quant.onnx in model_dir. If set " |
| | |
| | | TCLAP::ValueArg<std::string> vad_revision( |
| | | "", "vad-revision", |
| | | "VAD model revision", |
| | | false, "v1.2.0", "string"); |
| | | false, "v2.0.4", "string"); |
| | | TCLAP::ValueArg<std::string> vad_quant( |
| | | "", VAD_QUANT, |
| | | "true (Default), load the model of model_quant.onnx in vad_dir. If set " |
| | |
| | | TCLAP::ValueArg<std::string> punc_revision( |
| | | "", "punc-revision", |
| | | "PUNC model revision", |
| | | false, "v1.1.7", "string"); |
| | | false, "v2.0.4", "string"); |
| | | TCLAP::ValueArg<std::string> punc_quant( |
| | | "", PUNC_QUANT, |
| | | "true (Default), load the model of model_quant.onnx in punc_dir. If set " |
| | |
| | | // modify model-revision by model name |
| | | size_t found = s_asr_path.find("speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["model-revision"]="v1.2.4"; |
| | | model_path["model-revision"]="v2.0.4"; |
| | | } |
| | | |
| | | found = s_asr_path.find("speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"); |
| | | if (found != std::string::npos) { |
| | | model_path["model-revision"]="v1.0.5"; |
| | | model_path["model-revision"]="v2.0.5"; |
| | | } |
| | | |
| | | found = s_asr_path.find("speech_paraformer-large_asr_nat-en-16k-common-vocab10020"); |
| | | if (found != std::string::npos) { |
| | | model_path["model-revision"]="v1.0.0"; |
| | | model_path["model-revision"]="v2.0.4"; |
| | | s_itn_path=""; |
| | | s_lm_path=""; |
| | | } |