| | |
| | | } |
| | | |
| | | bool TimestampIsPunctuation(const std::string& str) { |
| | | const std::string punctuation = u8",。?、,.?"; |
| | | const std::string punctuation = u8",。?、,?"; |
| | | // const std::string punctuation = u8",。?、,.?"; |
| | | for (char ch : str) { |
| | | if (punctuation.find(ch) == std::string::npos) { |
| | | return false; |
| | |
| | | return timestamps_str; |
| | | } |
| | | |
| | | std::string TimestampSentence(std::string &text, std::string &str_time){ |
| | | std::vector<std::string> characters; |
| | | funasr::TimestampSplitChiEngCharacters(text, characters); |
| | | vector<vector<int>> timestamps = funasr::ParseTimestamps(str_time); |
| | | |
| | | int idx_str = 0, idx_ts = 0; |
| | | int start = -1, end = -1; |
| | | std::string text_seg = ""; |
| | | std::string ts_sentences = ""; |
| | | std::string ts_sent = ""; |
| | | vector<vector<int>> ts_seg; |
| | | while(idx_str < characters.size()){ |
| | | if (TimestampIsPunctuation(characters[idx_str])){ |
| | | if(ts_seg.size() >0){ |
| | | if (ts_seg[0].size() == 2){ |
| | | start = ts_seg[0][0]; |
| | | } |
| | | if (ts_seg[ts_seg.size()-1].size() == 2){ |
| | | end = ts_seg[ts_seg.size()-1][1]; |
| | | } |
| | | } |
| | | // format |
| | | ts_sent += "{'text':'" + text_seg + "',"; |
| | | ts_sent += "'start':'" + to_string(start) + "',"; |
| | | ts_sent += "'end':'" + to_string(end) + "',"; |
| | | ts_sent += "'ts_list':" + VectorToString(ts_seg) + "}"; |
| | | |
| | | if (idx_str == characters.size()-1){ |
| | | ts_sentences += ts_sent; |
| | | } else{ |
| | | ts_sentences += ts_sent + ","; |
| | | } |
| | | |
| | | // clear |
| | | idx_str++; |
| | | text_seg = ""; |
| | | ts_sent = ""; |
| | | start = 0; |
| | | end = 0; |
| | | ts_seg.clear(); |
| | | } else if(idx_ts < timestamps.size()) { |
| | | if (text_seg.empty()){ |
| | | text_seg = characters[idx_str]; |
| | | }else{ |
| | | text_seg += " " + characters[idx_str]; |
| | | } |
| | | ts_seg.push_back(timestamps[idx_ts]); |
| | | idx_str++; |
| | | idx_ts++; |
| | | } |
| | | } |
| | | // for none punc results |
| | | if(ts_seg.size() >0){ |
| | | if (ts_seg[0].size() == 2){ |
| | | start = ts_seg[0][0]; |
| | | } |
| | | if (ts_seg[ts_seg.size()-1].size() == 2){ |
| | | end = ts_seg[ts_seg.size()-1][1]; |
| | | } |
| | | // format |
| | | ts_sent += "{'text':'" + text_seg + "',"; |
| | | ts_sent += "'start':'" + to_string(start) + "',"; |
| | | ts_sent += "'end':'" + to_string(end) + "',"; |
| | | ts_sent += "'ts_list':" + VectorToString(ts_seg) + "}"; |
| | | ts_sentences += ts_sent; |
| | | } |
| | | |
| | | return "[" +ts_sentences + "]"; |
| | | } |
| | | |
| | | std::vector<std::string> split(const std::string &s, char delim) { |
| | | std::vector<std::string> elems; |
| | | std::stringstream ss(s); |