| | |
| | | Vocab::Vocab(const char *filename) |
| | | { |
| | | ifstream in(filename); |
| | | LoadVocabFromJson(filename); |
| | | } |
| | | Vocab::Vocab(const char *filename, const char *lex_file) |
| | | { |
| | | ifstream in(filename); |
| | | LoadVocabFromYaml(filename); |
| | | LoadLex(lex_file); |
| | | } |
| | | Vocab::~Vocab() |
| | | { |
| | |
| | | } |
| | | } |
| | | |
| | | int Vocab::GetIdByToken(const std::string &token) { |
| | | if (token_id.count(token)) { |
| | | return token_id[token]; |
| | | void Vocab::LoadVocabFromJson(const char* filename){ |
| | | nlohmann::json json_array; |
| | | std::ifstream file(filename); |
| | | if (file.is_open()) { |
| | | file >> json_array; |
| | | file.close(); |
| | | } else { |
| | | LOG(INFO) << "Error loading token file, token file error or not exist."; |
| | | exit(-1); |
| | | } |
| | | return 0; |
| | | |
| | | int i = 0; |
| | | for (const auto& element : json_array) { |
| | | vocab.push_back(element); |
| | | token_id[element] = i; |
| | | i++; |
| | | } |
| | | } |
| | | |
| | | void Vocab::LoadLex(const char* filename){ |
| | | std::ifstream file(filename); |
| | | std::string line; |
| | | while (std::getline(file, line)) { |
| | | std::string key, value; |
| | | std::istringstream iss(line); |
| | | std::getline(iss, key, '\t'); |
| | | std::getline(iss, value); |
| | | |
| | | if (!key.empty() && !value.empty()) { |
| | | lex_map[key] = value; |
| | | } |
| | | } |
| | | |
| | | file.close(); |
| | | } |
| | | |
| | | string Vocab::Word2Lex(const std::string &word) const { |
| | | auto it = lex_map.find(word); |
| | | if (it != lex_map.end()) { |
| | | return it->second; |
| | | } |
| | | return ""; |
| | | } |
| | | |
| | | int Vocab::GetIdByToken(const std::string &token) const { |
| | | auto it = token_id.find(token); |
| | | if (it != token_id.end()) { |
| | | return it->second; |
| | | } |
| | | return -1; |
| | | } |
| | | |
| | | void Vocab::Vector2String(vector<int> in, std::vector<std::string> &preds) |