From b9bcf1f093c3053fdc4e2cf4a1d38e27bbf429fb Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Thu, 19 Oct 2023 14:03:48 +0800
Subject: [PATCH] docs

---
 funasr/runtime/onnxruntime/src/audio.cpp |   81 +++++++++++++++++++++++++++++-----------
 1 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/funasr/runtime/onnxruntime/src/audio.cpp b/funasr/runtime/onnxruntime/src/audio.cpp
index a882078..76a01f9 100644
--- a/funasr/runtime/onnxruntime/src/audio.cpp
+++ b/funasr/runtime/onnxruntime/src/audio.cpp
@@ -9,6 +9,14 @@
 #include "audio.h"
 #include "precomp.h"
 
+#ifdef _MSC_VER
+#pragma warning(disable:4996)
+#endif
+
+#if defined(__APPLE__)
+#include <string.h>
+#else
+
 extern "C" {
 #include <libavutil/opt.h>
 #include <libavcodec/avcodec.h>
@@ -17,6 +25,10 @@
 #include <libavutil/samplefmt.h>
 #include <libswresample/swresample.h>
 }
+
+#endif
+
+
 
 using namespace std;
 
@@ -245,17 +257,20 @@
 }
 
 bool Audio::FfmpegLoad(const char *filename, bool copy2char){
+#if defined(__APPLE__)
+    return false;
+#else
     // from file
     AVFormatContext* formatContext = avformat_alloc_context();
     if (avformat_open_input(&formatContext, filename, NULL, NULL) != 0) {
-        printf("Error: Could not open input file.");
+        LOG(ERROR) << "Error: Could not open input file.";
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
         return false;
     }
 
     if (avformat_find_stream_info(formatContext, NULL) < 0) {
-        printf("Error: Could not find stream information.");
+        LOG(ERROR) << "Error: Could not find stream information.";
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
         return false;
@@ -265,23 +280,28 @@
     int audioStreamIndex = av_find_best_stream(formatContext, AVMEDIA_TYPE_AUDIO, -1, -1, &codec, 0);
     if (audioStreamIndex >= 0) {
         codecParameters = formatContext->streams[audioStreamIndex]->codecpar;
+    }else {
+        LOG(ERROR) << "Error: Could not find an audio stream in input file.";
+        avformat_close_input(&formatContext);
+        avformat_free_context(formatContext);
+        return false;
     }
     AVCodecContext* codecContext = avcodec_alloc_context3(codec);
     if (!codecContext) {
-        fprintf(stderr, "Failed to allocate codec context\n");
+        LOG(ERROR) << "Failed to allocate codec context";
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
         return false;
     }
     if (avcodec_parameters_to_context(codecContext, codecParameters) != 0) {
-        printf("Error: Could not copy codec parameters to codec context.");
+        LOG(ERROR) << "Error: Could not copy codec parameters to codec context.";
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
         avcodec_free_context(&codecContext);
         return false;
     }
     if (avcodec_open2(codecContext, codec, NULL) < 0) {
-        printf("Error: Could not open audio decoder.");
+        LOG(ERROR) << "Error: Could not open audio decoder.";
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
         avcodec_free_context(&codecContext);
@@ -299,14 +319,14 @@
         nullptr // parent context
     );
     if (swr_ctx == nullptr) {
-        std::cerr << "Could not initialize resampler" << std::endl;
+        LOG(ERROR) << "Could not initialize resampler";
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
         avcodec_free_context(&codecContext);
         return false;
     }
     if (swr_init(swr_ctx) != 0) {
-        std::cerr << "Could not initialize resampler" << std::endl;
+        LOG(ERROR) << "Could not initialize resampler";
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
         avcodec_free_context(&codecContext);
@@ -344,7 +364,7 @@
                         in_samples // input buffer size
                     );
                     if (ret < 0) {
-                        std::cerr << "Error resampling audio" << std::endl;
+                        LOG(ERROR) << "Error resampling audio";
                         break;
                     }
                     std::copy(resampled_buffer.begin(), resampled_buffer.end(), std::back_inserter(resampled_buffers));
@@ -403,12 +423,15 @@
     }
     else
         return false;
-    
+#endif
 }
 
 bool Audio::FfmpegLoad(const char* buf, int n_file_len){
+#if defined(__APPLE__)
+    return false;
+#else
     // from buf
-    char* buf_copy = (char *)malloc(n_file_len);
+    void* buf_copy = av_malloc(n_file_len);
     memcpy(buf_copy, buf, n_file_len);
 
     AVIOContext* avio_ctx = avio_alloc_context(
@@ -423,7 +446,7 @@
     AVFormatContext* formatContext = avformat_alloc_context();
     formatContext->pb = avio_ctx;
     if (avformat_open_input(&formatContext, "", NULL, NULL) != 0) {
-        printf("Error: Could not open input file.");
+        LOG(ERROR) << "Error: Could not open input file.";
         avio_context_free(&avio_ctx);
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
@@ -431,7 +454,7 @@
     }
 
     if (avformat_find_stream_info(formatContext, NULL) < 0) {
-        printf("Error: Could not find stream information.");
+        LOG(ERROR) << "Error: Could not find stream information.";
         avio_context_free(&avio_ctx);
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
@@ -445,14 +468,14 @@
     }
     AVCodecContext* codecContext = avcodec_alloc_context3(codec);
     if (!codecContext) {
-        fprintf(stderr, "Failed to allocate codec context\n");
+        LOG(ERROR) << "Failed to allocate codec context";
         avio_context_free(&avio_ctx);
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
         return false;
     }
     if (avcodec_parameters_to_context(codecContext, codecParameters) != 0) {
-        printf("Error: Could not copy codec parameters to codec context.");
+        LOG(ERROR) << "Error: Could not copy codec parameters to codec context.";
         avio_context_free(&avio_ctx);
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
@@ -460,7 +483,7 @@
         return false;
     }
     if (avcodec_open2(codecContext, codec, NULL) < 0) {
-        printf("Error: Could not open audio decoder.");
+        LOG(ERROR) << "Error: Could not open audio decoder.";
         avio_context_free(&avio_ctx);
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
@@ -479,7 +502,7 @@
         nullptr // parent context
     );
     if (swr_ctx == nullptr) {
-        std::cerr << "Could not initialize resampler" << std::endl;
+        LOG(ERROR) << "Could not initialize resampler";
         avio_context_free(&avio_ctx);
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
@@ -487,7 +510,7 @@
         return false;
     }
     if (swr_init(swr_ctx) != 0) {
-        std::cerr << "Could not initialize resampler" << std::endl;
+        LOG(ERROR) << "Could not initialize resampler";
         avio_context_free(&avio_ctx);
         avformat_close_input(&formatContext);
         avformat_free_context(formatContext);
@@ -526,7 +549,7 @@
                         in_samples // input buffer size
                     );
                     if (ret < 0) {
-                        std::cerr << "Error resampling audio" << std::endl;
+                        LOG(ERROR) << "Error resampling audio";
                         break;
                     }
                     std::copy(resampled_buffer.begin(), resampled_buffer.end(), std::back_inserter(resampled_buffers));
@@ -577,7 +600,7 @@
     }
     else
         return false;
-    
+#endif
 }
 
 
@@ -1097,6 +1120,8 @@
             if(asr_mode != ASR_OFFLINE){
                 if(buff_len >= step){
                     frame = new AudioFrame(step);
+                    frame->global_start = speech_start;
+                    frame->global_end = speech_start + step/seg_sample;
                     frame->data = (float*)malloc(sizeof(float) * step);
                     memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
                     asr_online_queue.push(frame);
@@ -1123,6 +1148,8 @@
                 if(asr_mode != ASR_OFFLINE){
                     frame = new AudioFrame(end-start);
                     frame->is_final = true;
+                    frame->global_start = speech_start_i;
+                    frame->global_end = speech_end_i;
                     frame->data = (float*)malloc(sizeof(float) * (end-start));
                     memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
                     asr_online_queue.push(frame);
@@ -1132,6 +1159,8 @@
                 if(asr_mode != ASR_ONLINE){
                     frame = new AudioFrame(end-start);
                     frame->is_final = true;
+                    frame->global_start = speech_start_i;
+                    frame->global_end = speech_end_i;
                     frame->data = (float*)malloc(sizeof(float) * (end-start));
                     memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
                     asr_offline_queue.push(frame);
@@ -1153,6 +1182,8 @@
                 if(asr_mode != ASR_OFFLINE){
                     if(buff_len >= step){
                         frame = new AudioFrame(step);
+                        frame->global_start = speech_start;
+                        frame->global_end = speech_start + step/seg_sample;
                         frame->data = (float*)malloc(sizeof(float) * step);
                         memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
                         asr_online_queue.push(frame);
@@ -1163,8 +1194,9 @@
 
             }else if(speech_end_i != -1){ // [-1,100]
                 if(speech_start == -1 or speech_offline_start == -1){
-                    LOG(ERROR) <<"Vad start is null while vad end is available." ;
-                    exit(-1);
+                    LOG(ERROR) <<"Vad start is null while vad end is available. Set vad start 0" ;
+                    if(speech_start == -1) speech_start = 0;
+                    if(speech_offline_start == -1) speech_offline_start = 0;
                 }
 
                 int start = speech_start*seg_sample;
@@ -1176,6 +1207,8 @@
                 if(asr_mode != ASR_ONLINE){
                     frame = new AudioFrame(end-offline_start);
                     frame->is_final = true;
+                    frame->global_start = speech_offline_start;
+                    frame->global_end = speech_end_i;
                     frame->data = (float*)malloc(sizeof(float) * (end-offline_start));
                     memcpy(frame->data, all_samples.data()+offline_start-offset, (end-offline_start)*sizeof(float));
                     asr_offline_queue.push(frame);
@@ -1192,6 +1225,8 @@
                             }
                             frame = new AudioFrame(step);
                             frame->is_final = is_final;
+                            frame->global_start = (int)((start+sample_offset)/seg_sample);
+                            frame->global_end = frame->global_start + step/seg_sample;
                             frame->data = (float*)malloc(sizeof(float) * step);
                             memcpy(frame->data, all_samples.data()+start-offset+sample_offset, step*sizeof(float));
                             asr_online_queue.push(frame);
@@ -1200,6 +1235,8 @@
                     }else{
                         frame = new AudioFrame(0);
                         frame->is_final = true;
+                        frame->global_start = speech_start;   // in this case start >= end
+                        frame->global_end = speech_end_i;
                         asr_online_queue.push(frame);
                         frame = NULL;
                     }
@@ -1229,4 +1266,4 @@
     
 }
 
-} // namespace funasr
\ No newline at end of file
+} // namespace funasr

--
Gitblit v1.9.1