liugz18
2024-07-18 d80ac2fd2df4e7fb8a28acfa512bb11472b5cc99
runtime/onnxruntime/src/funasrruntime.cpp
@@ -233,9 +233,13 @@
      if(p_result->snippet_time == 0){
            return p_result;
        }
      std::vector<int> index_vector={0};
      int msg_idx = 0;
      if(offline_stream->UseVad()){
         audio.CutSplit(offline_stream);
         audio.CutSplit(offline_stream, index_vector);
      }
      std::vector<string> msgs(index_vector.size());
      std::vector<float> msg_stimes(index_vector.size());
      float** buff;
      int* len;
@@ -246,33 +250,24 @@
      std::string cur_stamp = "[";
      std::string lang = (offline_stream->asr_handle)->GetLang();
      while (audio.Fetch(buff, len, flag, start_time, batch_size, batch_in) > 0) {
      while (audio.FetchDynamic(buff, len, flag, start_time, batch_size, batch_in) > 0) {
         // dec reset
         funasr::WfstDecoder* wfst_decoder = (funasr::WfstDecoder*)dec_handle;
         if (wfst_decoder){
            wfst_decoder->StartUtterance();
         }
         vector<string> msgs = (offline_stream->asr_handle)->Forward(buff, len, true, hw_emb, dec_handle);
         vector<string> msg_batch = (offline_stream->asr_handle)->Forward(buff, len, true, hw_emb, dec_handle, batch_in);
         for(int idx=0; idx<batch_in; idx++){
            string msg = msgs[idx];
            std::vector<std::string> msg_vec = funasr::split(msg, '|');
            if(msg_vec.size()==0){
               continue;
            }
            if(lang == "en-bpe" && p_result->msg != ""){
               p_result->msg += " ";
            }
            p_result->msg += msg_vec[0];
            //timestamp
            if(msg_vec.size() > 1){
               std::vector<std::string> msg_stamp = funasr::split(msg_vec[1], ',');
               for(int i=0; i<msg_stamp.size()-1; i+=2){
                  float begin = std::stof(msg_stamp[i])+start_time[idx];
                  float end = std::stof(msg_stamp[i+1])+start_time[idx];
                  cur_stamp += "["+std::to_string((int)(1000*begin))+","+std::to_string((int)(1000*end))+"],";
               }
            }
            string msg = msg_batch[idx];
            if(msg_idx < index_vector.size()){
               msgs[index_vector[msg_idx]] = msg;
               msg_stimes[index_vector[msg_idx]] = start_time[idx];
               msg_idx++;
            }else{
               LOG(ERROR) << "msg_idx: " << msg_idx <<" is out of range " << index_vector.size();
            }
         }
         // release
         delete[] buff;
         buff = nullptr;
@@ -282,6 +277,26 @@
         flag = nullptr;
         delete[] start_time;
         start_time = nullptr;
      }
      for(int idx=0; idx<msgs.size(); idx++){
         string msg = msgs[idx];
         std::vector<std::string> msg_vec = funasr::split(msg, '|');
         if(msg_vec.size()==0){
            continue;
         }
         if(lang == "en-bpe" && p_result->msg != ""){
            p_result->msg += " ";
         }
         p_result->msg += msg_vec[0];
         //timestamp
         if(msg_vec.size() > 1){
            std::vector<std::string> msg_stamp = funasr::split(msg_vec[1], ',');
            for(int i=0; i<msg_stamp.size()-1; i+=2){
               float begin = std::stof(msg_stamp[i])+msg_stimes[idx];
               float end = std::stof(msg_stamp[i+1])+msg_stimes[idx];
               cur_stamp += "["+std::to_string((int)(1000*begin))+","+std::to_string((int)(1000*end))+"],";
            }
         }
      }
      if(cur_stamp != "["){
         cur_stamp.erase(cur_stamp.length() - 1);
@@ -340,9 +355,13 @@
      if(p_result->snippet_time == 0){
            return p_result;
        }
      std::vector<int> index_vector={0};
      int msg_idx = 0;
      if(offline_stream->UseVad()){
         audio.CutSplit(offline_stream);
         audio.CutSplit(offline_stream, index_vector);
      }
      std::vector<string> msgs(index_vector.size());
      std::vector<float> msg_stimes(index_vector.size());
      float** buff;
      int* len;
@@ -353,33 +372,24 @@
      std::string cur_stamp = "[";
      std::string lang = (offline_stream->asr_handle)->GetLang();
      while (audio.Fetch(buff, len, flag, start_time, batch_size, batch_in) > 0) {
      while (audio.FetchDynamic(buff, len, flag, start_time, batch_size, batch_in) > 0) {
         // dec reset
         funasr::WfstDecoder* wfst_decoder = (funasr::WfstDecoder*)dec_handle;
         if (wfst_decoder){
            wfst_decoder->StartUtterance();
         }
         vector<string> msgs = (offline_stream->asr_handle)->Forward(buff, len, true, hw_emb, dec_handle);
         vector<string> msg_batch = (offline_stream->asr_handle)->Forward(buff, len, true, hw_emb, dec_handle, batch_in);
         for(int idx=0; idx<batch_in; idx++){
            string msg = msgs[idx];
            std::vector<std::string> msg_vec = funasr::split(msg, '|');
            if(msg_vec.size()==0){
               continue;
            }
            if(lang == "en-bpe" && p_result->msg != ""){
               p_result->msg += " ";
            }
            p_result->msg += msg_vec[0];
            //timestamp
            if(msg_vec.size() > 1){
               std::vector<std::string> msg_stamp = funasr::split(msg_vec[1], ',');
               for(int i=0; i<msg_stamp.size()-1; i+=2){
                  float begin = std::stof(msg_stamp[i])+start_time[idx];
                  float end = std::stof(msg_stamp[i+1])+start_time[idx];
                  cur_stamp += "["+std::to_string((int)(1000*begin))+","+std::to_string((int)(1000*end))+"],";
               }
            }
            string msg = msg_batch[idx];
            if(msg_idx < index_vector.size()){
               msgs[index_vector[msg_idx]] = msg;
               msg_stimes[index_vector[msg_idx]] = start_time[idx];
               msg_idx++;
            }else{
               LOG(ERROR) << "msg_idx: " << msg_idx <<" is out of range " << index_vector.size();
            }
         }
         // release
         delete[] buff;
         buff = nullptr;
@@ -389,6 +399,26 @@
         flag = nullptr;
         delete[] start_time;
         start_time = nullptr;
      }
      for(int idx=0; idx<msgs.size(); idx++){
         string msg = msgs[idx];
         std::vector<std::string> msg_vec = funasr::split(msg, '|');
         if(msg_vec.size()==0){
            continue;
         }
         if(lang == "en-bpe" && p_result->msg != ""){
            p_result->msg += " ";
         }
         p_result->msg += msg_vec[0];
         //timestamp
         if(msg_vec.size() > 1){
            std::vector<std::string> msg_stamp = funasr::split(msg_vec[1], ',');
            for(int i=0; i<msg_stamp.size()-1; i+=2){
               float begin = std::stof(msg_stamp[i])+msg_stimes[idx];
               float end = std::stof(msg_stamp[i+1])+msg_stimes[idx];
               cur_stamp += "["+std::to_string((int)(1000*begin))+","+std::to_string((int)(1000*end))+"],";
            }
         }
      }
      if(cur_stamp != "["){
         cur_stamp.erase(cur_stamp.length() - 1);
@@ -416,7 +446,7 @@
      return p_result;
   }
#if !defined(__APPLE__)
//#if !defined(__APPLE__)
   _FUNASRAPI const std::vector<std::vector<float>> CompileHotwordEmbedding(FUNASR_HANDLE handle, std::string &hotwords, ASR_TYPE mode)
   {
      if (mode == ASR_OFFLINE){
@@ -440,7 +470,7 @@
      }
      
   }
#endif
//#endif
   // APIs for 2pass-stream Infer
   _FUNASRAPI FUNASR_RESULT FunTpassInferBuffer(FUNASR_HANDLE handle, FUNASR_HANDLE online_handle, const char* sz_buf,