From 445b7ec47cbc6ce37c985fdc7e8e55c130f1d391 Mon Sep 17 00:00:00 2001
From: 雾聪 <wucong.lyb@alibaba-inc.com>
Date: 星期五, 29 三月 2024 16:46:54 +0800
Subject: [PATCH] add batch for offline-stream
---
runtime/onnxruntime/src/funasrruntime.cpp | 132 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 102 insertions(+), 30 deletions(-)
diff --git a/runtime/onnxruntime/src/funasrruntime.cpp b/runtime/onnxruntime/src/funasrruntime.cpp
index dd6bb17..711eac7 100644
--- a/runtime/onnxruntime/src/funasrruntime.cpp
+++ b/runtime/onnxruntime/src/funasrruntime.cpp
@@ -33,9 +33,9 @@
return mm;
}
- _FUNASRAPI FUNASR_HANDLE FunOfflineInit(std::map<std::string, std::string>& model_path, int thread_num)
+ _FUNASRAPI FUNASR_HANDLE FunOfflineInit(std::map<std::string, std::string>& model_path, int thread_num, bool use_gpu)
{
- funasr::OfflineStream* mm = funasr::CreateOfflineStream(model_path, thread_num);
+ funasr::OfflineStream* mm = funasr::CreateOfflineStream(model_path, thread_num, use_gpu);
return mm;
}
@@ -57,7 +57,7 @@
if (!recog_obj)
return nullptr;
- funasr::Audio audio(1);
+ funasr::Audio audio(recog_obj->GetAsrSampleRate(),1);
if(wav_format == "pcm" || wav_format == "PCM"){
if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
return nullptr;
@@ -93,7 +93,7 @@
if (!recog_obj)
return nullptr;
- funasr::Audio audio(1);
+ funasr::Audio audio(recog_obj->GetAsrSampleRate(),1);
if(funasr::is_target_file(sz_filename, "wav")){
int32_t sampling_rate_ = -1;
if(!audio.LoadWav(sz_filename, &sampling_rate_))
@@ -134,7 +134,7 @@
if (!vad_obj)
return nullptr;
- funasr::Audio audio(1);
+ funasr::Audio audio(vad_obj->GetVadSampleRate(),1);
if(wav_format == "pcm" || wav_format == "PCM"){
if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
return nullptr;
@@ -146,6 +146,7 @@
funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT;
p_result->snippet_time = audio.GetTimeLen();
if(p_result->snippet_time == 0){
+ p_result->segments = new vector<std::vector<int>>();
return p_result;
}
@@ -162,7 +163,7 @@
if (!vad_obj)
return nullptr;
- funasr::Audio audio(1);
+ funasr::Audio audio(vad_obj->GetVadSampleRate(),1);
if(funasr::is_target_file(sz_filename, "wav")){
int32_t sampling_rate_ = -1;
if(!audio.LoadWav(sz_filename, &sampling_rate_))
@@ -178,6 +179,7 @@
funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT;
p_result->snippet_time = audio.GetTimeLen();
if(p_result->snippet_time == 0){
+ p_result->segments = new vector<std::vector<int>>();
return p_result;
}
@@ -222,7 +224,7 @@
if (!offline_stream)
return nullptr;
- funasr::Audio audio(1);
+ funasr::Audio audio(offline_stream->asr_handle->GetAsrSampleRate(),1);
try{
if(wav_format == "pcm" || wav_format == "PCM"){
if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
@@ -243,15 +245,13 @@
return p_result;
}
if(offline_stream->UseVad()){
- audio.Split(offline_stream);
+ audio.CutSplit(offline_stream);
}
float* buff;
int len;
int flag = 0;
- int n_step = 0;
- int n_total = audio.GetQueueSize();
float start_time = 0.0;
std::string cur_stamp = "[";
std::string lang = (offline_stream->asr_handle)->GetLang();
@@ -266,7 +266,7 @@
if(msg_vec.size()==0){
continue;
}
- if(lang == "en-bpe" and p_result->msg != ""){
+ if(lang == "en-bpe" && p_result->msg != ""){
p_result->msg += " ";
}
p_result->msg += msg_vec[0];
@@ -279,9 +279,6 @@
cur_stamp += "["+std::to_string((int)(1000*begin))+","+std::to_string((int)(1000*end))+"],";
}
}
- n_step++;
- if (fn_callback)
- fn_callback(n_step, n_total);
}
if(cur_stamp != "["){
cur_stamp.erase(cur_stamp.length() - 1);
@@ -294,10 +291,18 @@
#if !defined(__APPLE__)
if(offline_stream->UseITN() && itn){
string msg_itn = offline_stream->itn_handle->Normalize(p_result->msg);
+ if(!(p_result->stamp).empty()){
+ std::string new_stamp = funasr::TimestampSmooth(p_result->msg, msg_itn, p_result->stamp);
+ if(!new_stamp.empty()){
+ p_result->stamp = new_stamp;
+ }
+ }
p_result->msg = msg_itn;
}
#endif
-
+ if (!(p_result->stamp).empty()){
+ p_result->stamp_sents = funasr::TimestampSentence(p_result->msg, p_result->stamp);
+ }
return p_result;
}
@@ -308,7 +313,7 @@
if (!offline_stream)
return nullptr;
- funasr::Audio audio(1);
+ funasr::Audio audio((offline_stream->asr_handle)->GetAsrSampleRate(),1);
try{
if(funasr::is_target_file(sz_filename, "wav")){
int32_t sampling_rate_ = -1;
@@ -333,7 +338,7 @@
return p_result;
}
if(offline_stream->UseVad()){
- audio.Split(offline_stream);
+ audio.CutSplit(offline_stream);
}
float* buff;
@@ -355,7 +360,7 @@
if(msg_vec.size()==0){
continue;
}
- if(lang == "en-bpe" and p_result->msg != ""){
+ if(lang == "en-bpe" && p_result->msg != ""){
p_result->msg += " ";
}
p_result->msg += msg_vec[0];
@@ -384,9 +389,18 @@
#if !defined(__APPLE__)
if(offline_stream->UseITN() && itn){
string msg_itn = offline_stream->itn_handle->Normalize(p_result->msg);
+ if(!(p_result->stamp).empty()){
+ std::string new_stamp = funasr::TimestampSmooth(p_result->msg, msg_itn, p_result->stamp);
+ if(!new_stamp.empty()){
+ p_result->stamp = new_stamp;
+ }
+ }
p_result->msg = msg_itn;
}
#endif
+ if (!(p_result->stamp).empty()){
+ p_result->stamp_sents = funasr::TimestampSentence(p_result->msg, p_result->stamp);
+ }
return p_result;
}
@@ -420,7 +434,7 @@
_FUNASRAPI FUNASR_RESULT FunTpassInferBuffer(FUNASR_HANDLE handle, FUNASR_HANDLE online_handle, const char* sz_buf,
int n_len, std::vector<std::vector<std::string>> &punc_cache, bool input_finished,
int sampling_rate, std::string wav_format, ASR_TYPE mode,
- const std::vector<std::vector<float>> &hw_emb, bool itn)
+ const std::vector<std::vector<float>> &hw_emb, bool itn, FUNASR_DEC_HANDLE dec_handle)
{
funasr::TpassStream* tpass_stream = (funasr::TpassStream*)handle;
funasr::TpassOnlineStream* tpass_online_stream = (funasr::TpassOnlineStream*)online_handle;
@@ -461,7 +475,7 @@
audio->Split(vad_online_handle, chunk_len, input_finished, mode);
- funasr::AudioFrame* frame = NULL;
+ funasr::AudioFrame* frame = nullptr;
while(audio->FetchChunck(frame) > 0){
string msg = ((funasr::ParaformerOnline*)asr_online_handle)->Forward(frame->data, frame->len, frame->is_final);
if(mode == ASR_ONLINE){
@@ -485,16 +499,21 @@
}else if(mode == ASR_TWO_PASS){
p_result->msg += msg;
}
- if(frame != NULL){
+ if(frame != nullptr){
delete frame;
- frame = NULL;
+ frame = nullptr;
}
}
// timestamp
std::string cur_stamp = "[";
while(audio->FetchTpass(frame) > 0){
- string msg = ((funasr::Paraformer*)asr_handle)->Forward(frame->data, frame->len, frame->is_final, hw_emb);
+ // dec reset
+ funasr::WfstDecoder* wfst_decoder = (funasr::WfstDecoder*)dec_handle;
+ if (wfst_decoder){
+ wfst_decoder->StartUtterance();
+ }
+ string msg = ((funasr::Paraformer*)asr_handle)->Forward(frame->data, frame->len, frame->is_final, hw_emb, dec_handle);
std::vector<std::string> msg_vec = funasr::split(msg, '|'); // split with timestamp
if(msg_vec.size()==0){
@@ -524,13 +543,22 @@
#if !defined(__APPLE__)
if(tpass_stream->UseITN() && itn){
string msg_itn = tpass_stream->itn_handle->Normalize(msg_punc);
+ // TimestampSmooth
+ if(!(p_result->stamp).empty()){
+ std::string new_stamp = funasr::TimestampSmooth(p_result->tpass_msg, msg_itn, p_result->stamp);
+ if(!new_stamp.empty()){
+ p_result->stamp = new_stamp;
+ }
+ }
p_result->tpass_msg = msg_itn;
}
#endif
-
- if(frame != NULL){
+ if (!(p_result->stamp).empty()){
+ p_result->stamp_sents = funasr::TimestampSentence(p_result->tpass_msg, p_result->stamp);
+ }
+ if(frame != nullptr){
delete frame;
- frame = NULL;
+ frame = nullptr;
}
}
@@ -582,6 +610,15 @@
return nullptr;
return p_result->stamp.c_str();
+ }
+
+ _FUNASRAPI const char* FunASRGetStampSents(FUNASR_RESULT result)
+ {
+ funasr::FUNASR_RECOG_RESULT * p_result = (funasr::FUNASR_RECOG_RESULT*)result;
+ if(!p_result)
+ return nullptr;
+
+ return p_result->stamp_sents.c_str();
}
_FUNASRAPI const char* FunASRGetTpassResult(FUNASR_RESULT result,int n_index)
@@ -725,10 +762,45 @@
funasr::WfstDecoder* mm = nullptr;
if (asr_type == ASR_OFFLINE) {
funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
- funasr::Paraformer* paraformer = (funasr::Paraformer*)offline_stream->asr_handle.get();
- if (paraformer->lm_)
- mm = new funasr::WfstDecoder(paraformer->lm_.get(),
- paraformer->GetPhoneSet(), paraformer->GetVocab(), glob_beam, lat_beam, am_scale);
+ auto paraformer = dynamic_cast<funasr::Paraformer*>(offline_stream->asr_handle.get());
+ if(paraformer !=nullptr){
+ if (paraformer->lm_){
+ mm = new funasr::WfstDecoder(paraformer->lm_.get(),
+ paraformer->GetPhoneSet(), paraformer->GetLmVocab(), glob_beam, lat_beam, am_scale);
+ }
+ return mm;
+ }
+ #ifdef USE_GPU
+ auto paraformer_torch = dynamic_cast<funasr::ParaformerTorch*>(offline_stream->asr_handle.get());
+ if(paraformer_torch !=nullptr){
+ if (paraformer_torch->lm_){
+ mm = new funasr::WfstDecoder(paraformer_torch->lm_.get(),
+ paraformer_torch->GetPhoneSet(), paraformer_torch->GetLmVocab(), glob_beam, lat_beam, am_scale);
+ }
+ return mm;
+ }
+ #endif
+
+ } else if (asr_type == ASR_TWO_PASS){
+ funasr::TpassStream* tpass_stream = (funasr::TpassStream*)handle;
+ auto paraformer = dynamic_cast<funasr::Paraformer*>(tpass_stream->asr_handle.get());
+ if(paraformer !=nullptr){
+ if (paraformer->lm_){
+ mm = new funasr::WfstDecoder(paraformer->lm_.get(),
+ paraformer->GetPhoneSet(), paraformer->GetLmVocab(), glob_beam, lat_beam, am_scale);
+ }
+ return mm;
+ }
+ #ifdef USE_GPU
+ auto paraformer_torch = dynamic_cast<funasr::ParaformerTorch*>(tpass_stream->asr_handle.get());
+ if(paraformer_torch !=nullptr){
+ if (paraformer_torch->lm_){
+ mm = new funasr::WfstDecoder(paraformer_torch->lm_.get(),
+ paraformer_torch->GetPhoneSet(), paraformer_torch->GetLmVocab(), glob_beam, lat_beam, am_scale);
+ }
+ return mm;
+ }
+ #endif
}
return mm;
}
--
Gitblit v1.9.1