From 8dab6d184a034ca86eafa644ea0d2100aadfe27d Mon Sep 17 00:00:00 2001
From: jmwang66 <wangjiaming.wjm@alibaba-inc.com>
Date: Tue, 09 May 2023 10:58:33 +0800
Subject: [PATCH] Merge pull request #473 from alibaba-damo-academy/dev_smohan
---
funasr/runtime/onnxruntime/src/audio.cpp | 65 ++++++++++++++++++++++++--------
1 files changed, 49 insertions(+), 16 deletions(-)
diff --git a/funasr/runtime/onnxruntime/src/audio.cpp b/funasr/runtime/onnxruntime/src/audio.cpp
index ef48fa1..6d63d67 100644
--- a/funasr/runtime/onnxruntime/src/audio.cpp
+++ b/funasr/runtime/onnxruntime/src/audio.cpp
@@ -11,6 +11,7 @@
using namespace std;
+namespace funasr {
// see http://soundfile.sapp.org/doc/WaveFormat/
// Note: We assume little endian here
struct WaveHeader {
@@ -153,8 +154,7 @@
int AudioFrame::Disp()
{
- printf("not imp!!!!\n");
-
+ LOG(ERROR) << "Not imp!!!!";
return 0;
};
@@ -187,8 +187,7 @@
void Audio::Disp()
{
- printf("Audio time is %f s. len is %d\n", (float)speech_len / MODEL_SAMPLE_RATE,
- speech_len);
+ LOG(INFO) << "Audio time is " << (float)speech_len / MODEL_SAMPLE_RATE << " s. len is " << speech_len;
}
float Audio::GetTimeLen()
@@ -199,19 +198,15 @@
void Audio::WavResample(int32_t sampling_rate, const float *waveform,
int32_t n)
{
- printf(
- "Creating a resampler:\n"
- " in_sample_rate: %d\n"
- " output_sample_rate: %d\n",
- sampling_rate, static_cast<int32_t>(MODEL_SAMPLE_RATE));
+ LOG(INFO) << "Creating a resampler:\n"
+ << " in_sample_rate: "<< sampling_rate << "\n"
+ << " output_sample_rate: " << static_cast<int32_t>(MODEL_SAMPLE_RATE);
float min_freq =
std::min<int32_t>(sampling_rate, MODEL_SAMPLE_RATE);
float lowpass_cutoff = 0.99 * 0.5 * min_freq;
int32_t lowpass_filter_width = 6;
- //FIXME
- //auto resampler = new LinearResample(
- // sampling_rate, model_sample_rate, lowpass_cutoff, lowpass_filter_width);
+
auto resampler = std::make_unique<LinearResample>(
sampling_rate, MODEL_SAMPLE_RATE, lowpass_cutoff, lowpass_filter_width);
std::vector<float> samples;
@@ -240,7 +235,25 @@
std::ifstream is(filename, std::ifstream::binary);
is.read(reinterpret_cast<char *>(&header), sizeof(header));
if(!is){
- fprintf(stderr, "Failed to read %s\n", filename);
+ LOG(ERROR) << "Failed to read " << filename;
+ return false;
+ }
+
+ if (!header.Validate()) {
+ return false;
+ }
+
+ header.SeekToDataChunk(is);
+ if (!is) {
+ return false;
+ }
+
+ if (!header.Validate()) {
+ return false;
+ }
+
+ header.SeekToDataChunk(is);
+ if (!is) {
return false;
}
@@ -255,7 +268,7 @@
memset(speech_buff, 0, sizeof(int16_t) * speech_len);
is.read(reinterpret_cast<char *>(speech_buff), header.subchunk2_size);
if (!is) {
- fprintf(stderr, "Failed to read %s\n", filename);
+ LOG(ERROR) << "Failed to read " << filename;
return false;
}
speech_data = (float*)malloc(sizeof(float) * speech_len);
@@ -386,7 +399,10 @@
FILE* fp;
fp = fopen(filename, "rb");
if (fp == nullptr)
+ {
+ LOG(ERROR) << "Failed to read " << filename;
return false;
+ }
fseek(fp, 0, SEEK_END);
uint32_t n_file_len = ftell(fp);
fseek(fp, 0, SEEK_SET);
@@ -499,7 +515,7 @@
delete frame;
}
-void Audio::Split(Model* recog_obj)
+void Audio::Split(OfflineStream* offline_stream)
{
AudioFrame *frame;
@@ -510,7 +526,7 @@
frame = NULL;
std::vector<float> pcm_data(speech_data, speech_data+sp_len);
- vector<std::vector<int>> vad_segments = recog_obj->VadSeg(pcm_data);
+ vector<std::vector<int>> vad_segments = (offline_stream->vad_handle)->Infer(pcm_data);
int seg_sample = MODEL_SAMPLE_RATE/1000;
for(vector<int> segment:vad_segments)
{
@@ -523,3 +539,20 @@
frame = NULL;
}
}
+
+// Pops the oldest queued frame and stores vad_obj->Infer() over the first GetLen() samples of speech_data in vad_segments; on an empty queue it logs an error and leaves vad_segments untouched.
+void Audio::Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments)
+{
+    if (frame_queue.empty()) {  // front()/pop() on an empty queue is undefined behaviour
+        LOG(ERROR) << "Audio::Split called with an empty frame queue";
+        return;
+    }
+    AudioFrame *frame = frame_queue.front();
+    frame_queue.pop();
+    const int sp_len = frame->GetLen();
+    delete frame;  // only the length is used; the samples themselves come from speech_data
+    std::vector<float> pcm_data(speech_data, speech_data + sp_len);
+    vad_segments = vad_obj->Infer(pcm_data);
+}
+
+} // namespace funasr
\ No newline at end of file
--
Gitblit v1.9.1