From b454a1054fadbff0ee963944ff42f66b98317582 Mon Sep 17 00:00:00 2001
From: Yabin Li <wucong.lyb@alibaba-inc.com>
Date: Tue, 08 Aug 2023 11:17:43 +0800
Subject: [PATCH] update online runtime, including vad-online, paraformer-online, punc-online,2pass (#815)

---
 funasr/runtime/onnxruntime/src/audio.cpp |  282 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 255 insertions(+), 27 deletions(-)

diff --git a/funasr/runtime/onnxruntime/src/audio.cpp b/funasr/runtime/onnxruntime/src/audio.cpp
index 85633b7..2ba9c30 100644
--- a/funasr/runtime/onnxruntime/src/audio.cpp
+++ b/funasr/runtime/onnxruntime/src/audio.cpp
@@ -132,40 +132,54 @@
     };
 };
 
-AudioFrame::AudioFrame(){};
+AudioFrame::AudioFrame(){}
 AudioFrame::AudioFrame(int len) : len(len)
 {
     start = 0;
-};
-AudioFrame::~AudioFrame(){};
+}
+AudioFrame::AudioFrame(const AudioFrame &other)
+{
+    start = other.start;
+    end = other.end;
+    len = other.len;
+    is_final = other.is_final;
+}
+AudioFrame::AudioFrame(int start, int end, bool is_final):start(start),end(end),is_final(is_final){
+    len = end - start;
+}
+AudioFrame::~AudioFrame(){
+    if(data != NULL){
+        free(data);
+    }
+}
 int AudioFrame::SetStart(int val)
 {
     start = val < 0 ? 0 : val;
     return start;
-};
+}
 
 int AudioFrame::SetEnd(int val)
 {
     end = val;
     len = end - start;
     return end;
-};
+}
 
 int AudioFrame::GetStart()
 {
     return start;
-};
+}
 
 int AudioFrame::GetLen()
 {
     return len;
-};
+}
 
 int AudioFrame::Disp()
 {
     LOG(ERROR) << "Not imp!!!!";
     return 0;
-};
+}
 
 Audio::Audio(int data_type) : data_type(data_type)
 {
@@ -230,7 +244,7 @@
     copy(samples.begin(), samples.end(), speech_data);
 }
 
-bool Audio::FfmpegLoad(const char *filename){
+bool Audio::FfmpegLoad(const char *filename, bool copy2char){
     // from file
     AVFormatContext* formatContext = avformat_alloc_context();
     if (avformat_open_input(&formatContext, filename, NULL, NULL) != 0) {
@@ -353,8 +367,17 @@
     if (speech_buff != NULL) {
         free(speech_buff);
     }
+    if (speech_char != NULL) {
+        free(speech_char);
+    }
     offset = 0;
     
+    if(copy2char){
+        speech_char = (char *)malloc(resampled_buffers.size());
+        memset(speech_char, 0, resampled_buffers.size());
+        memcpy((void*)speech_char, (const void*)resampled_buffers.data(), resampled_buffers.size());
+    }
+
     speech_len = (resampled_buffers.size()) / 2;
     speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
     if (speech_buff)
@@ -762,6 +785,55 @@
         return false;
 }
 
+bool Audio::LoadPcmwavOnline(const char* buf, int n_buf_len, int32_t* sampling_rate)
+{
+    if (speech_data != NULL) {
+        free(speech_data);
+    }
+    if (speech_buff != NULL) {
+        free(speech_buff);
+    }
+    if (speech_char != NULL) {
+        free(speech_char);
+    }
+
+    speech_len = n_buf_len / 2;
+    speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
+    if (speech_buff)
+    {
+        memset(speech_buff, 0, sizeof(int16_t) * speech_len);
+        memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t));
+
+        speech_data = (float*)malloc(sizeof(float) * speech_len);
+        memset(speech_data, 0, sizeof(float) * speech_len);
+
+        float scale = 1;
+        if (data_type == 1) {
+            scale = 32768;
+        }
+
+        for (int32_t i = 0; i != speech_len; ++i) {
+            speech_data[i] = (float)speech_buff[i] / scale;
+        }
+        
+        //resample
+        if(*sampling_rate != MODEL_SAMPLE_RATE){
+            WavResample(*sampling_rate, speech_data, speech_len);
+        }
+
+        for (int32_t i = 0; i != speech_len; ++i) {
+            all_samples.emplace_back(speech_data[i]);
+        }
+
+        AudioFrame* frame = new AudioFrame(speech_len);
+        frame_queue.push(frame);
+        return true;
+
+    }
+    else
+        return false;
+}
+
 bool Audio::LoadPcmwav(const char* filename, int32_t* sampling_rate)
 {
     if (speech_data != NULL) {
@@ -870,24 +942,25 @@
     return true;
 }
 
-int Audio::FetchChunck(float *&dout, int len)
+int Audio::FetchTpass(AudioFrame *&frame)
 {
-    if (offset >= speech_align_len) {
-        dout = NULL;
-        return S_ERR;
-    } else if (offset == speech_align_len - len) {
-        dout = speech_data + offset;
-        offset = speech_align_len;
-        // 涓存椂瑙e喅 
-        AudioFrame *frame = frame_queue.front();
-        frame_queue.pop();
-        delete frame;
-
-        return S_END;
+    if (asr_offline_queue.size() > 0) {
+        frame = asr_offline_queue.front();
+        asr_offline_queue.pop();
+        return 1;
     } else {
-        dout = speech_data + offset;
-        offset += len;
-        return S_MIDDLE;
+        return 0;
+    }
+}
+
+int Audio::FetchChunck(AudioFrame *&frame)
+{
+    if (asr_online_queue.size() > 0) {
+        frame = asr_online_queue.front();
+        asr_online_queue.pop();
+        return 1;
+    } else {
+        return 0;
     }
 }
 
@@ -956,7 +1029,6 @@
 
     std::vector<float> pcm_data(speech_data, speech_data+sp_len);
     vector<std::vector<int>> vad_segments = (offline_stream->vad_handle)->Infer(pcm_data);
-    int seg_sample = MODEL_SAMPLE_RATE/1000;
     for(vector<int> segment:vad_segments)
     {
         frame = new AudioFrame();
@@ -968,7 +1040,6 @@
         frame = NULL;
     }
 }
-
 
 void Audio::Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments, bool input_finished)
 {
@@ -984,4 +1055,161 @@
     vad_segments = vad_obj->Infer(pcm_data, input_finished);
 }
 
+// 2pass
+void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYPE asr_mode)
+{
+    AudioFrame *frame;
+
+    frame = frame_queue.front();
+    frame_queue.pop();
+    int sp_len = frame->GetLen();
+    delete frame;
+    frame = NULL;
+
+    std::vector<float> pcm_data(speech_data, speech_data+sp_len);
+    vector<std::vector<int>> vad_segments = vad_obj->Infer(pcm_data, input_finished);
+
+    speech_end += sp_len/seg_sample;
+    if(vad_segments.size() == 0){
+        if(speech_start != -1){
+            int start = speech_start*seg_sample;
+            int end = speech_end*seg_sample;
+            int buff_len = end-start;
+            int step = chunk_len;
+
+            if(asr_mode != ASR_OFFLINE){
+                if(buff_len >= step){
+                    frame = new AudioFrame(step);
+                    frame->data = (float*)malloc(sizeof(float) * step);
+                    memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
+                    asr_online_queue.push(frame);
+                    frame = NULL;
+                    speech_start += step/seg_sample;
+                }
+            }
+        }
+    }else{
+        for(auto vad_segment: vad_segments){
+            int speech_start_i=-1, speech_end_i=-1;
+            if(vad_segment[0] != -1){
+                speech_start_i = vad_segment[0];
+            }
+            if(vad_segment[1] != -1){
+                speech_end_i = vad_segment[1];
+            }
+
+            // [1, 100]
+            if(speech_start_i != -1 && speech_end_i != -1){
+                int start = speech_start_i*seg_sample;
+                int end = speech_end_i*seg_sample;
+
+                if(asr_mode != ASR_OFFLINE){
+                    frame = new AudioFrame(end-start);
+                    frame->is_final = true;
+                    frame->data = (float*)malloc(sizeof(float) * (end-start));
+                    memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
+                    asr_online_queue.push(frame);
+                    frame = NULL;
+                }
+
+                if(asr_mode != ASR_ONLINE){
+                    frame = new AudioFrame(end-start);
+                    frame->is_final = true;
+                    frame->data = (float*)malloc(sizeof(float) * (end-start));
+                    memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
+                    asr_offline_queue.push(frame);
+                    frame = NULL;
+                }
+
+                speech_start = -1;
+                speech_offline_start = -1;
+            // [70, -1]
+            }else if(speech_start_i != -1){
+                speech_start = speech_start_i;
+                speech_offline_start = speech_start_i;
+                
+                int start = speech_start*seg_sample;
+                int end = speech_end*seg_sample;
+                int buff_len = end-start;
+                int step = chunk_len;
+
+                if(asr_mode != ASR_OFFLINE){
+                    if(buff_len >= step){
+                        frame = new AudioFrame(step);
+                        frame->data = (float*)malloc(sizeof(float) * step);
+                        memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
+                        asr_online_queue.push(frame);
+                        frame = NULL;
+                        speech_start += step/seg_sample;
+                    }
+                }
+
+            }else if(speech_end_i != -1){ // [-1,100]
+                if(speech_start == -1 or speech_offline_start == -1){
+                    LOG(ERROR) <<"Vad start is null while vad end is available." ;
+                    exit(-1);
+                }
+
+                int start = speech_start*seg_sample;
+                int offline_start = speech_offline_start*seg_sample;
+                int end = speech_end_i*seg_sample;
+                int buff_len = end-start;
+                int step = chunk_len;
+
+                if(asr_mode != ASR_ONLINE){
+                    frame = new AudioFrame(end-offline_start);
+                    frame->is_final = true;
+                    frame->data = (float*)malloc(sizeof(float) * (end-offline_start));
+                    memcpy(frame->data, all_samples.data()+offline_start-offset, (end-offline_start)*sizeof(float));
+                    asr_offline_queue.push(frame);
+                    frame = NULL;
+                }
+
+                if(asr_mode != ASR_OFFLINE){
+                    if(buff_len > 0){
+                        for (int sample_offset = 0; sample_offset < buff_len; sample_offset += std::min(step, buff_len - sample_offset)) {
+                            bool is_final = false;
+                            if (sample_offset + step >= buff_len - 1) {
+                                step = buff_len - sample_offset;
+                                is_final = true;
+                            }
+                            frame = new AudioFrame(step);
+                            frame->is_final = is_final;
+                            frame->data = (float*)malloc(sizeof(float) * step);
+                            memcpy(frame->data, all_samples.data()+start-offset+sample_offset, step*sizeof(float));
+                            asr_online_queue.push(frame);
+                            frame = NULL;
+                        }
+                    }else{
+                        frame = new AudioFrame(0);
+                        frame->is_final = true;
+                        asr_online_queue.push(frame);
+                        frame = NULL;
+                    }
+                }
+                speech_start = -1;
+                speech_offline_start = -1;
+            }
+        }
+    }
+
+    // erase all_samples
+    int vector_cache = MODEL_SAMPLE_RATE*2;
+    if(speech_offline_start == -1){
+        if(all_samples.size() > vector_cache){
+            int erase_num = all_samples.size() - vector_cache;
+            all_samples.erase(all_samples.begin(), all_samples.begin()+erase_num);
+            offset += erase_num;
+        }
+    }else{
+        int offline_start = speech_offline_start*seg_sample;
+         if(offline_start-offset > vector_cache){
+            int erase_num = offline_start-offset - vector_cache;
+            all_samples.erase(all_samples.begin(), all_samples.begin()+erase_num);
+            offset += erase_num;
+        }       
+    }
+    
+}
+
 } // namespace funasr
\ No newline at end of file

--
Gitblit v1.9.1