From 7ab2e5cf22bbb31808bcacf84c054c710e4e6a93 Mon Sep 17 00:00:00 2001
From: Yabin Li <wucong.lyb@alibaba-inc.com>
Date: Mon, 24 Apr 2023 16:19:17 +0800
Subject: [PATCH] Merge pull request #400 from alibaba-damo-academy/dev_knf

---
 funasr/runtime/onnxruntime/src/audio.cpp |  150 ++++++++++++++++----------------------------------
 1 file changed, 48 insertions(+), 102 deletions(-)

diff --git a/funasr/runtime/onnxruntime/src/Audio.cpp b/funasr/runtime/onnxruntime/src/audio.cpp
similarity index 76%
rename from funasr/runtime/onnxruntime/src/Audio.cpp
rename to funasr/runtime/onnxruntime/src/audio.cpp
index 38b6de8..ef48fa1 100644
--- a/funasr/runtime/onnxruntime/src/Audio.cpp
+++ b/funasr/runtime/onnxruntime/src/audio.cpp
@@ -6,7 +6,7 @@
 #include <fstream>
 #include <assert.h>
 
-#include "Audio.h"
+#include "audio.h"
 #include "precomp.h"
 
 using namespace std;
@@ -128,39 +128,30 @@
     start = 0;
 };
 AudioFrame::~AudioFrame(){};
-int AudioFrame::set_start(int val)
+int AudioFrame::SetStart(int val)
 {
     start = val < 0 ? 0 : val;
     return start;
 };
 
-int AudioFrame::set_end(int val, int max_len)
+int AudioFrame::SetEnd(int val)
 {
-
-    float num_samples = val - start;
-    float frame_length = 400;
-    float frame_shift = 160;
-    float num_new_samples =
-        ceil((num_samples - frame_length) / frame_shift) * frame_shift + frame_length;
-
-    end = start + num_new_samples;
-    len = (int)num_new_samples;
-    if (end > max_len)
-        printf("frame end > max_len!!!!!!!\n");
+    end = val;
+    len = end - start;
     return end;
 };
 
-int AudioFrame::get_start()
+int AudioFrame::GetStart()
 {
     return start;
 };
 
-int AudioFrame::get_len()
+int AudioFrame::GetLen()
 {
     return len;
 };
 
-int AudioFrame::disp()
+int AudioFrame::Disp()
 {
     printf("not imp!!!!\n");
 
@@ -194,27 +185,27 @@
     }
 }
 
-void Audio::disp()
+void Audio::Disp()
 {
-    printf("Audio time is %f s. len is %d\n", (float)speech_len / model_sample_rate,
+    printf("Audio time is %f s. len is %d\n", (float)speech_len / MODEL_SAMPLE_RATE,
            speech_len);
 }
 
-float Audio::get_time_len()
+float Audio::GetTimeLen()
 {
-    return (float)speech_len / model_sample_rate;
+    return (float)speech_len / MODEL_SAMPLE_RATE;
 }
 
-void Audio::wavResample(int32_t sampling_rate, const float *waveform,
+void Audio::WavResample(int32_t sampling_rate, const float *waveform,
                           int32_t n)
 {
     printf(
           "Creating a resampler:\n"
           "   in_sample_rate: %d\n"
           "   output_sample_rate: %d\n",
-          sampling_rate, static_cast<int32_t>(model_sample_rate));
+          sampling_rate, static_cast<int32_t>(MODEL_SAMPLE_RATE));
     float min_freq =
-        std::min<int32_t>(sampling_rate, model_sample_rate);
+        std::min<int32_t>(sampling_rate, MODEL_SAMPLE_RATE);
     float lowpass_cutoff = 0.99 * 0.5 * min_freq;
 
     int32_t lowpass_filter_width = 6;
@@ -222,7 +213,7 @@
     //auto resampler = new LinearResample(
     //      sampling_rate, model_sample_rate, lowpass_cutoff, lowpass_filter_width);
     auto resampler = std::make_unique<LinearResample>(
-          sampling_rate, model_sample_rate, lowpass_cutoff, lowpass_filter_width);
+          sampling_rate, MODEL_SAMPLE_RATE, lowpass_cutoff, lowpass_filter_width);
     std::vector<float> samples;
     resampler->Resample(waveform, n, true, &samples);
     //reset speech_data
@@ -235,7 +226,7 @@
     copy(samples.begin(), samples.end(), speech_data);
 }
 
-bool Audio::loadwav(const char *filename, int32_t* sampling_rate)
+bool Audio::LoadWav(const char *filename, int32_t* sampling_rate)
 {
     WaveHeader header;
     if (speech_data != NULL) {
@@ -279,8 +270,8 @@
         }
 
         //resample
-        if(*sampling_rate != model_sample_rate){
-            wavResample(*sampling_rate, speech_data, speech_len);
+        if(*sampling_rate != MODEL_SAMPLE_RATE){
+            WavResample(*sampling_rate, speech_data, speech_len);
         }
 
         AudioFrame* frame = new AudioFrame(speech_len);
@@ -292,7 +283,7 @@
         return false;
 }
 
-bool Audio::loadwav(const char* buf, int nFileLen, int32_t* sampling_rate)
+bool Audio::LoadWav(const char* buf, int n_file_len, int32_t* sampling_rate)
 {
     WaveHeader header;
     if (speech_data != NULL) {
@@ -326,8 +317,8 @@
         }
         
         //resample
-        if(*sampling_rate != model_sample_rate){
-            wavResample(*sampling_rate, speech_data, speech_len);
+        if(*sampling_rate != MODEL_SAMPLE_RATE){
+            WavResample(*sampling_rate, speech_data, speech_len);
         }
 
         AudioFrame* frame = new AudioFrame(speech_len);
@@ -339,7 +330,7 @@
         return false;
 }
 
-bool Audio::loadpcmwav(const char* buf, int nBufLen, int32_t* sampling_rate)
+bool Audio::LoadPcmwav(const char* buf, int n_buf_len, int32_t* sampling_rate)
 {
     if (speech_data != NULL) {
         free(speech_data);
@@ -349,7 +340,7 @@
     }
     offset = 0;
 
-    speech_len = nBufLen / 2;
+    speech_len = n_buf_len / 2;
     speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
     if (speech_buff)
     {
@@ -369,8 +360,8 @@
         }
         
         //resample
-        if(*sampling_rate != model_sample_rate){
-            wavResample(*sampling_rate, speech_data, speech_len);
+        if(*sampling_rate != MODEL_SAMPLE_RATE){
+            WavResample(*sampling_rate, speech_data, speech_len);
         }
 
         AudioFrame* frame = new AudioFrame(speech_len);
@@ -382,7 +373,7 @@
         return false;
 }
 
-bool Audio::loadpcmwav(const char* filename, int32_t* sampling_rate)
+bool Audio::LoadPcmwav(const char* filename, int32_t* sampling_rate)
 {
     if (speech_data != NULL) {
         free(speech_data);
@@ -397,10 +388,10 @@
     if (fp == nullptr)
         return false;
     fseek(fp, 0, SEEK_END);
-    uint32_t nFileLen = ftell(fp);
+    uint32_t n_file_len = ftell(fp);
     fseek(fp, 0, SEEK_SET);
 
-    speech_len = (nFileLen) / 2;
+    speech_len = (n_file_len) / 2;
     speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
     if (speech_buff)
     {
@@ -420,8 +411,8 @@
         }
 
         //resample
-        if(*sampling_rate != model_sample_rate){
-            wavResample(*sampling_rate, speech_data, speech_len);
+        if(*sampling_rate != MODEL_SAMPLE_RATE){
+            WavResample(*sampling_rate, speech_data, speech_len);
         }
 
         AudioFrame* frame = new AudioFrame(speech_len);
@@ -434,7 +425,7 @@
 
 }
 
-int Audio::fetch_chunck(float *&dout, int len)
+int Audio::FetchChunck(float *&dout, int len)
 {
     if (offset >= speech_align_len) {
         dout = NULL;
@@ -455,14 +446,14 @@
     }
 }
 
-int Audio::fetch(float *&dout, int &len, int &flag)
+int Audio::Fetch(float *&dout, int &len, int &flag)
 {
     if (frame_queue.size() > 0) {
         AudioFrame *frame = frame_queue.front();
         frame_queue.pop();
 
-        dout = speech_data + frame->get_start();
-        len = frame->get_len();
+        dout = speech_data + frame->GetStart();
+        len = frame->GetLen();
         delete frame;
         flag = S_END;
         return 1;
@@ -471,9 +462,8 @@
     }
 }
 
-void Audio::padding()
+void Audio::Padding()
 {
-
     float num_samples = speech_len;
     float frame_length = 400;
     float frame_shift = 160;
@@ -509,71 +499,27 @@
     delete frame;
 }
 
-#define UNTRIGGERED 0
-#define TRIGGERED   1
-
-#define SPEECH_LEN_5S  (16000 * 5)
-#define SPEECH_LEN_10S (16000 * 10)
-#define SPEECH_LEN_20S (16000 * 20)
-#define SPEECH_LEN_30S (16000 * 30)
-
-/*
-void Audio::split()
+void Audio::Split(Model* recog_obj)
 {
-    VadInst *handle = WebRtcVad_Create();
-    WebRtcVad_Init(handle);
-    WebRtcVad_set_mode(handle, 2);
-    int window_size = 10;
-    AudioWindow audiowindow(window_size);
-    int status = UNTRIGGERED;
-    int offset = 0;
-    int fs = 16000;
-    int step = 480;
-
     AudioFrame *frame;
 
     frame = frame_queue.front();
     frame_queue.pop();
+    int sp_len = frame->GetLen();
     delete frame;
     frame = NULL;
 
-    while (offset < speech_len - step) {
-        int n = WebRtcVad_Process(handle, fs, speech_buff + offset, step);
-        if (status == UNTRIGGERED && audiowindow.put(n) >= window_size - 1) {
-            frame = new AudioFrame();
-            int start = offset - step * (window_size - 1);
-            frame->set_start(start);
-            status = TRIGGERED;
-        } else if (status == TRIGGERED) {
-            int win_weight = audiowindow.put(n);
-            int voice_len = (offset - frame->get_start());
-            int gap = 0;
-            if (voice_len < SPEECH_LEN_5S) {
-                offset += step;
-                continue;
-            } else if (voice_len < SPEECH_LEN_10S) {
-                gap = 1;
-            } else if (voice_len < SPEECH_LEN_20S) {
-                gap = window_size / 5;
-            } else {
-                gap = window_size / 2;
-            }
-
-            if (win_weight < gap) {
-                status = UNTRIGGERED;
-                offset = frame->set_end(offset, speech_align_len);
-                frame_queue.push(frame);
-                frame = NULL;
-            }
-        }
-        offset += step;
-    }
-
-    if (frame != NULL) {
-        frame->set_end(speech_len, speech_align_len);
+    std::vector<float> pcm_data(speech_data, speech_data+sp_len);
+    vector<std::vector<int>> vad_segments = recog_obj->VadSeg(pcm_data);
+    int seg_sample = MODEL_SAMPLE_RATE/1000;
+    for(vector<int> segment:vad_segments)
+    {
+        frame = new AudioFrame();
+        int start = segment[0]*seg_sample;
+        int end = segment[1]*seg_sample;
+        frame->SetStart(start);
+        frame->SetEnd(end);
         frame_queue.push(frame);
         frame = NULL;
     }
-    WebRtcVad_Free(handle);
 }
-*/
\ No newline at end of file

--
Gitblit v1.9.1