// Acknowledgement: this code is adapted from
// https://github.com/wenet-e2e/WeTextProcessing/blob/master/runtime/processor/token_parser.h
// Retrieved in Aug 2023.

// Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
//               2023 Jing Du (thuduj12@163.com)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef ITN_TOKEN_PARSER_H_
#define ITN_TOKEN_PARSER_H_

#include <set>
#include <string>
#include <unordered_map>
#include <vector>

namespace funasr {

// The constants below are only declared here; their definitions (and therefore
// their contents) live in another translation unit.
extern const std::string EOS;
// NOTE(review): element type std::string follows the upstream header this file
// is adapted from -- confirm against the definitions.
extern const std::set<std::string> UTF8_WHITESPACE;
extern const std::set<std::string> ASCII_LETTERS;
// Tables mapping a token name to a list of member keys. They have the same
// type as the `orders` argument of Token::string().
extern const std::unordered_map<std::string, std::vector<std::string>>
    TN_ORDERS;
extern const std::unordered_map<std::string, std::vector<std::string>>
    ITN_ORDERS;

// A named token holding key/value members plus the order in which the keys
// are emitted by string().
struct Token {
  std::string name;
  // Keys in emission order. Filled by append(); replaced by string() when the
  // supplied table has an entry for `name`.
  std::vector<std::string> order;
  std::unordered_map<std::string, std::string> members;

  // Intentionally left non-explicit so existing implicit conversions from
  // std::string keep compiling.
  Token(const std::string& name) : name(name) {}

  // Records `key` at the end of `order` and stores/overwrites its value.
  // Appending the same key twice leaves two entries in `order`.
  void append(const std::string& key, const std::string& value) {
    order.emplace_back(key);
    members[key] = value;
  }

  // Serializes the token as `name { key: "value" ... }`.
  //
  // If `orders` contains an entry for this token's name, that entry replaces
  // `order` (the replacement persists after the call) and dictates the
  // emission order. Keys in `order` that have no stored member are skipped.
  std::string string(
      const std::unordered_map<std::string, std::vector<std::string>>& orders) {
    std::string output = name + " {";
    const auto wanted = orders.find(name);
    if (wanted != orders.end()) {
      order = wanted->second;
    }

    for (const auto& key : order) {
      const auto member = members.find(key);
      if (member == members.end()) {
        continue;
      }
      output += " " + key + ": \"" + member->second + "\"";
    }
    return output + " }";
  }
};

enum ParseType {
  kTN = 0x00,  // Text Normalization
  kITN = 0x01  // Inverse Text Normalization
};

// Declaration only: every member function is defined outside this header.
// NOTE(review): judging by the names, this parses serialized tokens and
// re-emits them using TN_ORDERS or ITN_ORDERS depending on `type` -- confirm
// against the implementation file.
class TokenParser {
 public:
  TokenParser(ParseType type);
  std::string reorder(const std::string& input);

 private:
  void load(const std::string& input);
  bool read();
  bool parse_ws();
  bool parse_char(const std::string& exp);
  bool parse_chars(const std::string& exp);
  std::string parse_key();
  std::string parse_value();
  void parse(const std::string& input);

  // Default-initialized so it never holds an indeterminate value, whatever the
  // out-of-line constructor does.
  int index = 0;
  std::string ch;
  // NOTE(review): element types of `text` and `tokens` follow the upstream
  // header -- confirm against the implementation file.
  std::vector<std::string> text;
  std::vector<Token> tokens;
  std::unordered_map<std::string, std::vector<std::string>> orders;
};

}  // namespace funasr

#endif  // ITN_TOKEN_PARSER_H_