From b9bcf1f093c3053fdc4e2cf4a1d38e27bbf429fb Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期四, 19 十月 2023 14:03:48 +0800
Subject: [PATCH] docs
---
funasr/runtime/onnxruntime/src/audio.cpp | 376 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 329 insertions(+), 47 deletions(-)
diff --git a/funasr/runtime/onnxruntime/src/audio.cpp b/funasr/runtime/onnxruntime/src/audio.cpp
index 85633b7..76a01f9 100644
--- a/funasr/runtime/onnxruntime/src/audio.cpp
+++ b/funasr/runtime/onnxruntime/src/audio.cpp
@@ -9,6 +9,14 @@
#include "audio.h"
#include "precomp.h"
+#ifdef _MSC_VER
+#pragma warning(disable:4996)
+#endif
+
+#if defined(__APPLE__)
+#include <string.h>
+#else
+
extern "C" {
#include <libavutil/opt.h>
#include <libavcodec/avcodec.h>
@@ -17,6 +25,10 @@
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}
+
+#endif
+
+
using namespace std;
@@ -132,40 +144,54 @@
};
};
// ---- AudioFrame special members ----

// Default frame; all fields keep their in-class defaults from audio.h
// (presumably start/end/len zeroed and data null — TODO confirm).
AudioFrame::AudioFrame(){}

// Frame of `len` samples anchored at offset 0 in the audio buffer.
AudioFrame::AudioFrame(int len) : len(len)
{
    start = 0;
}

// Copy constructor: duplicates bookkeeping fields only.
// NOTE(review): `data`, `global_start` and `global_end` are NOT copied —
// the copy keeps its in-class defaults. Leaving `data` unshared avoids a
// double free in the destructor, but confirm the header default-initializes
// `data` to NULL, otherwise a copied frame destructs an indeterminate pointer.
AudioFrame::AudioFrame(const AudioFrame &other)
{
    start = other.start;
    end = other.end;
    len = other.len;
    is_final = other.is_final;
}

// Frame spanning [start, end) samples; length derived from the bounds.
AudioFrame::AudioFrame(int start, int end, bool is_final):start(start),end(end),is_final(is_final){
    len = end - start;
}

// Releases the owned sample buffer, if any.
// NOTE(review): with a user-defined destructor and copy constructor, the
// implicitly-defaulted copy *assignment* would copy `data` and double-free
// (rule of three) — verify assignment is never used on AudioFrame.
AudioFrame::~AudioFrame(){
    if(data != NULL){
        free(data);
    }
}
// Set the frame start offset, clamping negative values to 0.
// Returns the value actually stored.
int AudioFrame::SetStart(int val)
{
    if (val < 0) {
        val = 0;
    }
    start = val;
    return start;
}
// Record the frame end offset and refresh the derived length.
// Returns the stored end offset.
int AudioFrame::SetEnd(int val)
{
    len = val - start;
    end = val;
    return val;
}
// Accessor: start offset of this frame within the audio buffer.
int AudioFrame::GetStart() { return start; }
// Accessor: number of samples covered by this frame.
int AudioFrame::GetLen() { return len; }
// Debug dump — intentionally unimplemented; logs an error and returns 0.
int AudioFrame::Disp()
{
    LOG(ERROR) << "Not imp!!!!";
    return 0;
}
Audio::Audio(int data_type) : data_type(data_type)
{
@@ -230,18 +256,21 @@
copy(samples.begin(), samples.end(), speech_data);
}
-bool Audio::FfmpegLoad(const char *filename){
+bool Audio::FfmpegLoad(const char *filename, bool copy2char){
+#if defined(__APPLE__)
+ return false;
+#else
// from file
AVFormatContext* formatContext = avformat_alloc_context();
if (avformat_open_input(&formatContext, filename, NULL, NULL) != 0) {
- printf("Error: Could not open input file.");
+ LOG(ERROR) << "Error: Could not open input file.";
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
return false;
}
if (avformat_find_stream_info(formatContext, NULL) < 0) {
- printf("Error: Could not find stream information.");
+ LOG(ERROR) << "Error: Could not open input file.";
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
return false;
@@ -251,23 +280,28 @@
int audioStreamIndex = av_find_best_stream(formatContext, AVMEDIA_TYPE_AUDIO, -1, -1, &codec, 0);
if (audioStreamIndex >= 0) {
codecParameters = formatContext->streams[audioStreamIndex]->codecpar;
+ }else {
+ LOG(ERROR) << "Error: Could not open input file.";
+ avformat_close_input(&formatContext);
+ avformat_free_context(formatContext);
+ return false;
}
AVCodecContext* codecContext = avcodec_alloc_context3(codec);
if (!codecContext) {
- fprintf(stderr, "Failed to allocate codec context\n");
+ LOG(ERROR) << "Failed to allocate codec context";
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
return false;
}
if (avcodec_parameters_to_context(codecContext, codecParameters) != 0) {
- printf("Error: Could not copy codec parameters to codec context.");
+ LOG(ERROR) << "Error: Could not copy codec parameters to codec context.";
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
avcodec_free_context(&codecContext);
return false;
}
if (avcodec_open2(codecContext, codec, NULL) < 0) {
- printf("Error: Could not open audio decoder.");
+ LOG(ERROR) << "Error: Could not open audio decoder.";
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
avcodec_free_context(&codecContext);
@@ -285,14 +319,14 @@
nullptr // parent context
);
if (swr_ctx == nullptr) {
- std::cerr << "Could not initialize resampler" << std::endl;
+ LOG(ERROR) << "Could not initialize resampler";
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
avcodec_free_context(&codecContext);
return false;
}
if (swr_init(swr_ctx) != 0) {
- std::cerr << "Could not initialize resampler" << std::endl;
+ LOG(ERROR) << "Could not initialize resampler";
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
avcodec_free_context(&codecContext);
@@ -330,7 +364,7 @@
in_samples // input buffer size
);
if (ret < 0) {
- std::cerr << "Error resampling audio" << std::endl;
+ LOG(ERROR) << "Error resampling audio";
break;
}
std::copy(resampled_buffer.begin(), resampled_buffer.end(), std::back_inserter(resampled_buffers));
@@ -353,8 +387,17 @@
if (speech_buff != NULL) {
free(speech_buff);
}
+ if (speech_char != NULL) {
+ free(speech_char);
+ }
offset = 0;
+ if(copy2char){
+ speech_char = (char *)malloc(resampled_buffers.size());
+ memset(speech_char, 0, resampled_buffers.size());
+ memcpy((void*)speech_char, (const void*)resampled_buffers.data(), resampled_buffers.size());
+ }
+
speech_len = (resampled_buffers.size()) / 2;
speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
if (speech_buff)
@@ -380,12 +423,15 @@
}
else
return false;
-
+#endif
}
bool Audio::FfmpegLoad(const char* buf, int n_file_len){
+#if defined(__APPLE__)
+ return false;
+#else
// from buf
- char* buf_copy = (char *)malloc(n_file_len);
+ void* buf_copy = av_malloc(n_file_len);
memcpy(buf_copy, buf, n_file_len);
AVIOContext* avio_ctx = avio_alloc_context(
@@ -400,7 +446,7 @@
AVFormatContext* formatContext = avformat_alloc_context();
formatContext->pb = avio_ctx;
if (avformat_open_input(&formatContext, "", NULL, NULL) != 0) {
- printf("Error: Could not open input file.");
+ LOG(ERROR) << "Error: Could not open input file.";
avio_context_free(&avio_ctx);
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
@@ -408,7 +454,7 @@
}
if (avformat_find_stream_info(formatContext, NULL) < 0) {
- printf("Error: Could not find stream information.");
+ LOG(ERROR) << "Error: Could not find stream information.";
avio_context_free(&avio_ctx);
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
@@ -422,14 +468,14 @@
}
AVCodecContext* codecContext = avcodec_alloc_context3(codec);
if (!codecContext) {
- fprintf(stderr, "Failed to allocate codec context\n");
+ LOG(ERROR) << "Failed to allocate codec context";
avio_context_free(&avio_ctx);
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
return false;
}
if (avcodec_parameters_to_context(codecContext, codecParameters) != 0) {
- printf("Error: Could not copy codec parameters to codec context.");
+ LOG(ERROR) << "Error: Could not copy codec parameters to codec context.";
avio_context_free(&avio_ctx);
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
@@ -437,7 +483,7 @@
return false;
}
if (avcodec_open2(codecContext, codec, NULL) < 0) {
- printf("Error: Could not open audio decoder.");
+ LOG(ERROR) << "Error: Could not open audio decoder.";
avio_context_free(&avio_ctx);
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
@@ -456,7 +502,7 @@
nullptr // parent context
);
if (swr_ctx == nullptr) {
- std::cerr << "Could not initialize resampler" << std::endl;
+ LOG(ERROR) << "Could not initialize resampler";
avio_context_free(&avio_ctx);
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
@@ -464,7 +510,7 @@
return false;
}
if (swr_init(swr_ctx) != 0) {
- std::cerr << "Could not initialize resampler" << std::endl;
+ LOG(ERROR) << "Could not initialize resampler";
avio_context_free(&avio_ctx);
avformat_close_input(&formatContext);
avformat_free_context(formatContext);
@@ -503,7 +549,7 @@
in_samples // input buffer size
);
if (ret < 0) {
- std::cerr << "Error resampling audio" << std::endl;
+ LOG(ERROR) << "Error resampling audio";
break;
}
std::copy(resampled_buffer.begin(), resampled_buffer.end(), std::back_inserter(resampled_buffers));
@@ -554,7 +600,7 @@
}
else
return false;
-
+#endif
}
@@ -762,6 +808,55 @@
return false;
}
+bool Audio::LoadPcmwavOnline(const char* buf, int n_buf_len, int32_t* sampling_rate)
+{
+ if (speech_data != NULL) {
+ free(speech_data);
+ }
+ if (speech_buff != NULL) {
+ free(speech_buff);
+ }
+ if (speech_char != NULL) {
+ free(speech_char);
+ }
+
+ speech_len = n_buf_len / 2;
+ speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
+ if (speech_buff)
+ {
+ memset(speech_buff, 0, sizeof(int16_t) * speech_len);
+ memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t));
+
+ speech_data = (float*)malloc(sizeof(float) * speech_len);
+ memset(speech_data, 0, sizeof(float) * speech_len);
+
+ float scale = 1;
+ if (data_type == 1) {
+ scale = 32768;
+ }
+
+ for (int32_t i = 0; i != speech_len; ++i) {
+ speech_data[i] = (float)speech_buff[i] / scale;
+ }
+
+ //resample
+ if(*sampling_rate != MODEL_SAMPLE_RATE){
+ WavResample(*sampling_rate, speech_data, speech_len);
+ }
+
+ for (int32_t i = 0; i != speech_len; ++i) {
+ all_samples.emplace_back(speech_data[i]);
+ }
+
+ AudioFrame* frame = new AudioFrame(speech_len);
+ frame_queue.push(frame);
+ return true;
+
+ }
+ else
+ return false;
+}
+
bool Audio::LoadPcmwav(const char* filename, int32_t* sampling_rate)
{
if (speech_data != NULL) {
@@ -870,24 +965,25 @@
return true;
}
-int Audio::FetchChunck(float *&dout, int len)
+int Audio::FetchTpass(AudioFrame *&frame)
{
- if (offset >= speech_align_len) {
- dout = NULL;
- return S_ERR;
- } else if (offset == speech_align_len - len) {
- dout = speech_data + offset;
- offset = speech_align_len;
- // 涓存椂瑙e喅
- AudioFrame *frame = frame_queue.front();
- frame_queue.pop();
- delete frame;
-
- return S_END;
+ if (asr_offline_queue.size() > 0) {
+ frame = asr_offline_queue.front();
+ asr_offline_queue.pop();
+ return 1;
} else {
- dout = speech_data + offset;
- offset += len;
- return S_MIDDLE;
+ return 0;
+ }
+}
+
+int Audio::FetchChunck(AudioFrame *&frame)
+{
+ if (asr_online_queue.size() > 0) {
+ frame = asr_online_queue.front();
+ asr_online_queue.pop();
+ return 1;
+ } else {
+ return 0;
}
}
@@ -897,6 +993,23 @@
AudioFrame *frame = frame_queue.front();
frame_queue.pop();
+ dout = speech_data + frame->GetStart();
+ len = frame->GetLen();
+ delete frame;
+ flag = S_END;
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+int Audio::Fetch(float *&dout, int &len, int &flag, float &start_time)
+{
+ if (frame_queue.size() > 0) {
+ AudioFrame *frame = frame_queue.front();
+ frame_queue.pop();
+
+ start_time = (float)(frame->GetStart())/MODEL_SAMPLE_RATE;
dout = speech_data + frame->GetStart();
len = frame->GetLen();
delete frame;
@@ -956,7 +1069,6 @@
std::vector<float> pcm_data(speech_data, speech_data+sp_len);
vector<std::vector<int>> vad_segments = (offline_stream->vad_handle)->Infer(pcm_data);
- int seg_sample = MODEL_SAMPLE_RATE/1000;
for(vector<int> segment:vad_segments)
{
frame = new AudioFrame();
@@ -968,7 +1080,6 @@
frame = NULL;
}
}
-
void Audio::Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments, bool input_finished)
{
@@ -984,4 +1095,175 @@
vad_segments = vad_obj->Infer(pcm_data, input_finished);
}
-} // namespace funasr
\ No newline at end of file
// 2-pass splitter: consume the newest chunk from frame_queue, run streaming
// VAD on it, and fan the detected speech out to two consumers:
//   - asr_online_queue : fixed-size chunks (chunk_len samples) for the
//     streaming recognizer,
//   - asr_offline_queue: whole utterances for the second-pass recognizer.
// speech_start / speech_end / speech_offline_start appear to be tracked in
// milliseconds, with seg_sample = samples per ms — TODO confirm against the
// member declarations. `offset` is the count of samples already erased from
// the front of all_samples, so absolute positions are indexed as (pos - offset).
void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYPE asr_mode)
{
    AudioFrame *frame;

    // The queued frame only conveys the chunk length; the samples themselves
    // live in speech_data / all_samples.
    frame = frame_queue.front();
    frame_queue.pop();
    int sp_len = frame->GetLen();
    delete frame;
    frame = NULL;

    std::vector<float> pcm_data(speech_data, speech_data+sp_len);
    // Each segment is a [start_ms, end_ms] pair; -1 marks an open boundary.
    vector<std::vector<int>> vad_segments = vad_obj->Infer(pcm_data, input_finished);

    // Advance the global end-of-audio position by this chunk.
    speech_end += sp_len/seg_sample;
    if(vad_segments.size() == 0){
        // No new VAD event: if we are inside speech, keep feeding full-size
        // chunks to the online recognizer.
        if(speech_start != -1){
            int start = speech_start*seg_sample;
            int end = speech_end*seg_sample;
            int buff_len = end-start;
            int step = chunk_len;

            if(asr_mode != ASR_OFFLINE){
                if(buff_len >= step){
                    frame = new AudioFrame(step);
                    frame->global_start = speech_start;
                    frame->global_end = speech_start + step/seg_sample;
                    // Frame takes ownership of this buffer (freed in ~AudioFrame).
                    frame->data = (float*)malloc(sizeof(float) * step);
                    memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
                    asr_online_queue.push(frame);
                    frame = NULL;
                    speech_start += step/seg_sample;
                }
            }
        }
    }else{
        for(auto vad_segment: vad_segments){
            int speech_start_i=-1, speech_end_i=-1;
            if(vad_segment[0] != -1){
                speech_start_i = vad_segment[0];
            }
            if(vad_segment[1] != -1){
                speech_end_i = vad_segment[1];
            }

            // [1, 100]: a complete segment, fully contained in this chunk —
            // emit it whole to both consumers.
            if(speech_start_i != -1 && speech_end_i != -1){
                int start = speech_start_i*seg_sample;
                int end = speech_end_i*seg_sample;

                if(asr_mode != ASR_OFFLINE){
                    frame = new AudioFrame(end-start);
                    frame->is_final = true;
                    frame->global_start = speech_start_i;
                    frame->global_end = speech_end_i;
                    frame->data = (float*)malloc(sizeof(float) * (end-start));
                    memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
                    asr_online_queue.push(frame);
                    frame = NULL;
                }

                if(asr_mode != ASR_ONLINE){
                    frame = new AudioFrame(end-start);
                    frame->is_final = true;
                    frame->global_start = speech_start_i;
                    frame->global_end = speech_end_i;
                    frame->data = (float*)malloc(sizeof(float) * (end-start));
                    memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
                    asr_offline_queue.push(frame);
                    frame = NULL;
                }

                // Segment closed: back to "no speech" state.
                speech_start = -1;
                speech_offline_start = -1;
            // [70, -1]: speech began but has not ended — remember where, and
            // stream any already-buffered audio as full-size online chunks.
            }else if(speech_start_i != -1){
                speech_start = speech_start_i;
                speech_offline_start = speech_start_i;

                int start = speech_start*seg_sample;
                int end = speech_end*seg_sample;
                int buff_len = end-start;
                int step = chunk_len;

                if(asr_mode != ASR_OFFLINE){
                    if(buff_len >= step){
                        frame = new AudioFrame(step);
                        frame->global_start = speech_start;
                        frame->global_end = speech_start + step/seg_sample;
                        frame->data = (float*)malloc(sizeof(float) * step);
                        memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
                        asr_online_queue.push(frame);
                        frame = NULL;
                        speech_start += step/seg_sample;
                    }
                }

            }else if(speech_end_i != -1){ // [-1,100]: a segment that started in an earlier chunk ends here.
                if(speech_start == -1 or speech_offline_start == -1){
                    // Should not happen: recover by assuming speech began at 0.
                    // NOTE(review): only speech_start is repaired here — if
                    // speech_offline_start is still -1, offline_start below
                    // becomes negative and the offline memcpy reads out of
                    // bounds. Confirm whether speech_offline_start should be
                    // reset to 0 as well.
                    LOG(ERROR) <<"Vad start is null while vad end is available. Set vad start 0" ;
                    speech_start = 0;
                }

                int start = speech_start*seg_sample;
                int offline_start = speech_offline_start*seg_sample;
                int end = speech_end_i*seg_sample;
                int buff_len = end-start;
                int step = chunk_len;

                // Offline pass gets the whole utterance from its true start.
                if(asr_mode != ASR_ONLINE){
                    frame = new AudioFrame(end-offline_start);
                    frame->is_final = true;
                    frame->global_start = speech_offline_start;
                    frame->global_end = speech_end_i;
                    frame->data = (float*)malloc(sizeof(float) * (end-offline_start));
                    memcpy(frame->data, all_samples.data()+offline_start-offset, (end-offline_start)*sizeof(float));
                    asr_offline_queue.push(frame);
                    frame = NULL;
                }

                if(asr_mode != ASR_OFFLINE){
                    if(buff_len > 0){
                        // Flush the remaining tail in chunk_len pieces; the last
                        // piece is shortened and marked is_final. Note that
                        // `step` is deliberately mutated inside the loop, which
                        // also shrinks the loop increment via std::min.
                        for (int sample_offset = 0; sample_offset < buff_len; sample_offset += std::min(step, buff_len - sample_offset)) {
                            bool is_final = false;
                            if (sample_offset + step >= buff_len - 1) {
                                step = buff_len - sample_offset;
                                is_final = true;
                            }
                            frame = new AudioFrame(step);
                            frame->is_final = is_final;
                            frame->global_start = (int)((start+sample_offset)/seg_sample);
                            frame->global_end = frame->global_start + step/seg_sample;
                            frame->data = (float*)malloc(sizeof(float) * step);
                            memcpy(frame->data, all_samples.data()+start-offset+sample_offset, step*sizeof(float));
                            asr_online_queue.push(frame);
                            frame = NULL;
                        }
                    }else{
                        // Nothing left to stream: emit an empty finalizer frame
                        // so the online consumer still sees end-of-utterance.
                        frame = new AudioFrame(0);
                        frame->is_final = true;
                        frame->global_start = speech_start; // in this case start >= end
                        frame->global_end = speech_end_i;
                        asr_online_queue.push(frame);
                        frame = NULL;
                    }
                }
                speech_start = -1;
                speech_offline_start = -1;
            }
        }
    }

    // Trim all_samples, keeping ~2 s of history (plus anything after the
    // current utterance start) so earlier offsets stay addressable.
    // NOTE(review): `all_samples.size() > vector_cache` compares unsigned to
    // signed int — fine for these magnitudes, but worth silencing.
    int vector_cache = MODEL_SAMPLE_RATE*2;
    if(speech_offline_start == -1){
        if(all_samples.size() > vector_cache){
            int erase_num = all_samples.size() - vector_cache;
            all_samples.erase(all_samples.begin(), all_samples.begin()+erase_num);
            offset += erase_num;
        }
    }else{
        int offline_start = speech_offline_start*seg_sample;
        if(offline_start-offset > vector_cache){
            int erase_num = offline_start-offset - vector_cache;
            all_samples.erase(all_samples.begin(), all_samples.begin()+erase_num);
            offset += erase_num;
        }
    }

}
+
+} // namespace funasr
--
Gitblit v1.9.1