From b454a1054fadbff0ee963944ff42f66b98317582 Mon Sep 17 00:00:00 2001
From: Yabin Li <wucong.lyb@alibaba-inc.com>
Date: 星期二, 08 八月 2023 11:17:43 +0800
Subject: [PATCH] update online runtime, including vad-online, paraformer-online, punc-online,2pass (#815)
---
funasr/runtime/onnxruntime/src/audio.cpp | 282 +++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 files changed, 255 insertions(+), 27 deletions(-)
diff --git a/funasr/runtime/onnxruntime/src/audio.cpp b/funasr/runtime/onnxruntime/src/audio.cpp
index 85633b7..2ba9c30 100644
--- a/funasr/runtime/onnxruntime/src/audio.cpp
+++ b/funasr/runtime/onnxruntime/src/audio.cpp
@@ -132,40 +132,54 @@
};
};
-AudioFrame::AudioFrame(){};
+AudioFrame::AudioFrame(){}
AudioFrame::AudioFrame(int len) : len(len)
{
start = 0;
-};
-AudioFrame::~AudioFrame(){};
+}
+AudioFrame::AudioFrame(const AudioFrame &other)
+{
+ start = other.start;
+ end = other.end;
+ len = other.len;
+ is_final = other.is_final;
+}
+AudioFrame::AudioFrame(int start, int end, bool is_final):start(start),end(end),is_final(is_final){
+ len = end - start;
+}
+AudioFrame::~AudioFrame(){
+ if(data != NULL){
+ free(data);
+ }
+}
int AudioFrame::SetStart(int val)
{
start = val < 0 ? 0 : val;
return start;
-};
+}
int AudioFrame::SetEnd(int val)
{
end = val;
len = end - start;
return end;
-};
+}
int AudioFrame::GetStart()
{
return start;
-};
+}
int AudioFrame::GetLen()
{
return len;
-};
+}
int AudioFrame::Disp()
{
LOG(ERROR) << "Not imp!!!!";
return 0;
-};
+}
Audio::Audio(int data_type) : data_type(data_type)
{
@@ -230,7 +244,7 @@
copy(samples.begin(), samples.end(), speech_data);
}
-bool Audio::FfmpegLoad(const char *filename){
+bool Audio::FfmpegLoad(const char *filename, bool copy2char){
// from file
AVFormatContext* formatContext = avformat_alloc_context();
if (avformat_open_input(&formatContext, filename, NULL, NULL) != 0) {
@@ -353,8 +367,17 @@
if (speech_buff != NULL) {
free(speech_buff);
}
+ if (speech_char != NULL) {
+ free(speech_char);
+ }
offset = 0;
+ if(copy2char){
+ speech_char = (char *)malloc(resampled_buffers.size());
+ memset(speech_char, 0, resampled_buffers.size());
+ memcpy((void*)speech_char, (const void*)resampled_buffers.data(), resampled_buffers.size());
+ }
+
speech_len = (resampled_buffers.size()) / 2;
speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
if (speech_buff)
@@ -762,6 +785,55 @@
return false;
}
+bool Audio::LoadPcmwavOnline(const char* buf, int n_buf_len, int32_t* sampling_rate)
+{
+ if (speech_data != NULL) {
+ free(speech_data);
+ }
+ if (speech_buff != NULL) {
+ free(speech_buff);
+ }
+ if (speech_char != NULL) {
+ free(speech_char);
+ }
+
+ speech_len = n_buf_len / 2;
+ speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
+ if (speech_buff)
+ {
+ memset(speech_buff, 0, sizeof(int16_t) * speech_len);
+ memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t));
+
+ speech_data = (float*)malloc(sizeof(float) * speech_len);
+ memset(speech_data, 0, sizeof(float) * speech_len);
+
+ float scale = 1;
+ if (data_type == 1) {
+ scale = 32768;
+ }
+
+ for (int32_t i = 0; i != speech_len; ++i) {
+ speech_data[i] = (float)speech_buff[i] / scale;
+ }
+
+ //resample
+ if(*sampling_rate != MODEL_SAMPLE_RATE){
+ WavResample(*sampling_rate, speech_data, speech_len);
+ }
+
+ for (int32_t i = 0; i != speech_len; ++i) {
+ all_samples.emplace_back(speech_data[i]);
+ }
+
+ AudioFrame* frame = new AudioFrame(speech_len);
+ frame_queue.push(frame);
+ return true;
+
+ }
+ else
+ return false;
+}
+
bool Audio::LoadPcmwav(const char* filename, int32_t* sampling_rate)
{
if (speech_data != NULL) {
@@ -870,24 +942,25 @@
return true;
}
-int Audio::FetchChunck(float *&dout, int len)
+int Audio::FetchTpass(AudioFrame *&frame)
{
- if (offset >= speech_align_len) {
- dout = NULL;
- return S_ERR;
- } else if (offset == speech_align_len - len) {
- dout = speech_data + offset;
- offset = speech_align_len;
- // 涓存椂瑙e喅
- AudioFrame *frame = frame_queue.front();
- frame_queue.pop();
- delete frame;
-
- return S_END;
+ if (asr_offline_queue.size() > 0) {
+ frame = asr_offline_queue.front();
+ asr_offline_queue.pop();
+ return 1;
} else {
- dout = speech_data + offset;
- offset += len;
- return S_MIDDLE;
+ return 0;
+ }
+}
+
+int Audio::FetchChunck(AudioFrame *&frame)
+{
+ if (asr_online_queue.size() > 0) {
+ frame = asr_online_queue.front();
+ asr_online_queue.pop();
+ return 1;
+ } else {
+ return 0;
}
}
@@ -956,7 +1029,6 @@
std::vector<float> pcm_data(speech_data, speech_data+sp_len);
vector<std::vector<int>> vad_segments = (offline_stream->vad_handle)->Infer(pcm_data);
- int seg_sample = MODEL_SAMPLE_RATE/1000;
for(vector<int> segment:vad_segments)
{
frame = new AudioFrame();
@@ -968,7 +1040,6 @@
frame = NULL;
}
}
-
void Audio::Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments, bool input_finished)
{
@@ -984,4 +1055,161 @@
vad_segments = vad_obj->Infer(pcm_data, input_finished);
}
+// 2pass
+void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYPE asr_mode)
+{
+ AudioFrame *frame;
+
+ frame = frame_queue.front();
+ frame_queue.pop();
+ int sp_len = frame->GetLen();
+ delete frame;
+ frame = NULL;
+
+ std::vector<float> pcm_data(speech_data, speech_data+sp_len);
+ vector<std::vector<int>> vad_segments = vad_obj->Infer(pcm_data, input_finished);
+
+ speech_end += sp_len/seg_sample;
+ if(vad_segments.size() == 0){
+ if(speech_start != -1){
+ int start = speech_start*seg_sample;
+ int end = speech_end*seg_sample;
+ int buff_len = end-start;
+ int step = chunk_len;
+
+ if(asr_mode != ASR_OFFLINE){
+ if(buff_len >= step){
+ frame = new AudioFrame(step);
+ frame->data = (float*)malloc(sizeof(float) * step);
+ memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
+ asr_online_queue.push(frame);
+ frame = NULL;
+ speech_start += step/seg_sample;
+ }
+ }
+ }
+ }else{
+ for(auto vad_segment: vad_segments){
+ int speech_start_i=-1, speech_end_i=-1;
+ if(vad_segment[0] != -1){
+ speech_start_i = vad_segment[0];
+ }
+ if(vad_segment[1] != -1){
+ speech_end_i = vad_segment[1];
+ }
+
+ // [1, 100]
+ if(speech_start_i != -1 && speech_end_i != -1){
+ int start = speech_start_i*seg_sample;
+ int end = speech_end_i*seg_sample;
+
+ if(asr_mode != ASR_OFFLINE){
+ frame = new AudioFrame(end-start);
+ frame->is_final = true;
+ frame->data = (float*)malloc(sizeof(float) * (end-start));
+ memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
+ asr_online_queue.push(frame);
+ frame = NULL;
+ }
+
+ if(asr_mode != ASR_ONLINE){
+ frame = new AudioFrame(end-start);
+ frame->is_final = true;
+ frame->data = (float*)malloc(sizeof(float) * (end-start));
+ memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
+ asr_offline_queue.push(frame);
+ frame = NULL;
+ }
+
+ speech_start = -1;
+ speech_offline_start = -1;
+ // [70, -1]
+ }else if(speech_start_i != -1){
+ speech_start = speech_start_i;
+ speech_offline_start = speech_start_i;
+
+ int start = speech_start*seg_sample;
+ int end = speech_end*seg_sample;
+ int buff_len = end-start;
+ int step = chunk_len;
+
+ if(asr_mode != ASR_OFFLINE){
+ if(buff_len >= step){
+ frame = new AudioFrame(step);
+ frame->data = (float*)malloc(sizeof(float) * step);
+ memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
+ asr_online_queue.push(frame);
+ frame = NULL;
+ speech_start += step/seg_sample;
+ }
+ }
+
+ }else if(speech_end_i != -1){ // [-1,100]
+ if(speech_start == -1 or speech_offline_start == -1){
+ LOG(ERROR) <<"Vad start is null while vad end is available." ;
+ exit(-1);
+ }
+
+ int start = speech_start*seg_sample;
+ int offline_start = speech_offline_start*seg_sample;
+ int end = speech_end_i*seg_sample;
+ int buff_len = end-start;
+ int step = chunk_len;
+
+ if(asr_mode != ASR_ONLINE){
+ frame = new AudioFrame(end-offline_start);
+ frame->is_final = true;
+ frame->data = (float*)malloc(sizeof(float) * (end-offline_start));
+ memcpy(frame->data, all_samples.data()+offline_start-offset, (end-offline_start)*sizeof(float));
+ asr_offline_queue.push(frame);
+ frame = NULL;
+ }
+
+ if(asr_mode != ASR_OFFLINE){
+ if(buff_len > 0){
+ for (int sample_offset = 0; sample_offset < buff_len; sample_offset += std::min(step, buff_len - sample_offset)) {
+ bool is_final = false;
+ if (sample_offset + step >= buff_len - 1) {
+ step = buff_len - sample_offset;
+ is_final = true;
+ }
+ frame = new AudioFrame(step);
+ frame->is_final = is_final;
+ frame->data = (float*)malloc(sizeof(float) * step);
+ memcpy(frame->data, all_samples.data()+start-offset+sample_offset, step*sizeof(float));
+ asr_online_queue.push(frame);
+ frame = NULL;
+ }
+ }else{
+ frame = new AudioFrame(0);
+ frame->is_final = true;
+ asr_online_queue.push(frame);
+ frame = NULL;
+ }
+ }
+ speech_start = -1;
+ speech_offline_start = -1;
+ }
+ }
+ }
+
+ // erase all_samples
+ int vector_cache = MODEL_SAMPLE_RATE*2;
+ if(speech_offline_start == -1){
+ if(all_samples.size() > vector_cache){
+ int erase_num = all_samples.size() - vector_cache;
+ all_samples.erase(all_samples.begin(), all_samples.begin()+erase_num);
+ offset += erase_num;
+ }
+ }else{
+ int offline_start = speech_offline_start*seg_sample;
+ if(offline_start-offset > vector_cache){
+ int erase_num = offline_start-offset - vector_cache;
+ all_samples.erase(all_samples.begin(), all_samples.begin()+erase_num);
+ offset += erase_num;
+ }
+ }
+
+}
+
} // namespace funasr
\ No newline at end of file
--
Gitblit v1.9.1