From 94de39dde2e616a01683c518023d0fab72b4e103 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期一, 19 二月 2024 22:21:50 +0800
Subject: [PATCH] aishell example

---
 runtime/onnxruntime/src/util.cpp |   34 +++++++++++++++++++++-------------
 1 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/runtime/onnxruntime/src/util.cpp b/runtime/onnxruntime/src/util.cpp
index ac793f5..a12570b 100644
--- a/runtime/onnxruntime/src/util.cpp
+++ b/runtime/onnxruntime/src/util.cpp
@@ -305,6 +305,10 @@
 }
 
 bool TimestampIsPunctuation(U16CHAR_T &u16) {
+    // (& ' -) in the dict
+    if (u16 == 0x26 || u16 == 0x27 || u16 == 0x2D){
+        return false;
+    }
     return (u16 >= 0x21 && u16 <= 0x2F)     // 标准ASCII标点
         || (u16 >= 0x3A && u16 <= 0x40)     // 标准ASCII标点
         || (u16 >= 0x5B && u16 <= 0x60)     // 标准ASCII标点
@@ -361,9 +365,13 @@
   }
 }
 
-std::string VectorToString(const std::vector<std::vector<int>>& vec) {
+std::string VectorToString(const std::vector<std::vector<int>>& vec, bool out_empty) {
     if(vec.size() == 0){
-        return "";
+        if(out_empty){
+            return "";
+        }else{
+            return "[]";
+        }
     }
     std::ostringstream out;
     out << "[";
@@ -580,19 +588,18 @@
                 }
             }
             // format
-            ts_sent += "{'text':'" + text_seg + "',";
-            ts_sent += "'start':'" + to_string(start) + "',";
-            ts_sent += "'end':'" + to_string(end) + "',";
-            ts_sent += "'ts_list':" + VectorToString(ts_seg) + "}";
+            ts_sent += "{\"text_seg\":\"" + text_seg + "\",";
+            ts_sent += "\"punc\":\"" + characters[idx_str] + "\",";
+            ts_sent += "\"start\":" + to_string(start) + ",";
+            ts_sent += "\"end\":" + to_string(end) + ",";
+            ts_sent += "\"ts_list\":" + VectorToString(ts_seg, false) + "}";
             
             if (idx_str == characters.size()-1){
                 ts_sentences += ts_sent;
             } else{
                 ts_sentences += ts_sent + ",";
             }
-
             // clear
-            idx_str++;
             text_seg = "";
             ts_sent = "";
             start = 0;
@@ -605,9 +612,9 @@
                 text_seg += " " + characters[idx_str];
             }
             ts_seg.push_back(timestamps[idx_ts]);
-            idx_str++;
             idx_ts++;
         }
+        idx_str++;
     }
     // for none punc results
     if(ts_seg.size() >0){
@@ -618,10 +625,11 @@
             end = ts_seg[ts_seg.size()-1][1];
         }
         // format
-        ts_sent += "{'text':'" + text_seg + "',";
-        ts_sent += "'start':'" + to_string(start) + "',";
-        ts_sent += "'end':'" + to_string(end) + "',";
-        ts_sent += "'ts_list':" + VectorToString(ts_seg) + "}";
+        ts_sent += "{\"text_seg\":\"" + text_seg + "\",";
+        ts_sent += "\"punc\":\"\",";
+        ts_sent += "\"start\":" + to_string(start) + ",";
+        ts_sent += "\"end\":" + to_string(end) + ",";
+        ts_sent += "\"ts_list\":" + VectorToString(ts_seg, false) + "}";
         ts_sentences += ts_sent;
     }
 

--
Gitblit v1.9.1