Yabin Li
2023-08-08 b454a1054fadbff0ee963944ff42f66b98317582
funasr/runtime/onnxruntime/src/audio.cpp
@@ -132,40 +132,54 @@
    };
};
// Default constructor: the body is empty, so no member is assigned here.
// NOTE(review): the definition appears twice in this span (with and without a
// trailing ';') — looks like old/new diff lines; confirm only one survives.
AudioFrame::AudioFrame(){};
AudioFrame::AudioFrame(){}
// Constructs a frame that spans `len` samples and begins at position 0.
AudioFrame::AudioFrame(int len)
{
    this->len = len;
    this->start = 0;
};
AudioFrame::~AudioFrame(){};
}
// Copy constructor: duplicates the position fields, the length and the
// final-frame flag of `other`. The `data` pointer of `other` is not copied here.
AudioFrame::AudioFrame(const AudioFrame &other)
    : start(other.start), end(other.end), is_final(other.is_final)
{
    len = other.len;
}
// Constructs a frame covering [start, end); its length is derived as end - start.
AudioFrame::AudioFrame(int start, int end, bool is_final)
{
    this->start = start;
    this->end = end;
    this->is_final = is_final;
    this->len = end - start;
}
// Releases the sample buffer held in `data`.
AudioFrame::~AudioFrame()
{
    // free() is a no-op for a null pointer, so no explicit guard is required.
    free(data);
}
int AudioFrame::SetStart(int val)
{
    start = val < 0 ? 0 : val;
    return start;
};
}
int AudioFrame::SetEnd(int val)
{
    end = val;
    len = end - start;
    return end;
};
}
int AudioFrame::GetStart()
{
    return start;
};
}
int AudioFrame::GetLen()
{
    return len;
};
}
// Placeholder for displaying the frame: nothing is shown. The call only logs
// an error message and always returns 0.
int AudioFrame::Disp()
{
    LOG(ERROR) << "Not imp!!!!";
    return 0;
};
}
Audio::Audio(int data_type) : data_type(data_type)
{
@@ -230,7 +244,7 @@
    copy(samples.begin(), samples.end(), speech_data);
}
bool Audio::FfmpegLoad(const char *filename){
bool Audio::FfmpegLoad(const char *filename, bool copy2char){
    // from file
    AVFormatContext* formatContext = avformat_alloc_context();
    if (avformat_open_input(&formatContext, filename, NULL, NULL) != 0) {
@@ -353,8 +367,17 @@
    if (speech_buff != NULL) {
        free(speech_buff);
    }
    if (speech_char != NULL) {
        free(speech_char);
    }
    offset = 0;
    
    if(copy2char){
        speech_char = (char *)malloc(resampled_buffers.size());
        memset(speech_char, 0, resampled_buffers.size());
        memcpy((void*)speech_char, (const void*)resampled_buffers.data(), resampled_buffers.size());
    }
    speech_len = (resampled_buffers.size()) / 2;
    speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
    if (speech_buff)
@@ -762,6 +785,55 @@
        return false;
}
bool Audio::LoadPcmwavOnline(const char* buf, int n_buf_len, int32_t* sampling_rate)
{
    if (speech_data != NULL) {
        free(speech_data);
    }
    if (speech_buff != NULL) {
        free(speech_buff);
    }
    if (speech_char != NULL) {
        free(speech_char);
    }
    speech_len = n_buf_len / 2;
    speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
    if (speech_buff)
    {
        memset(speech_buff, 0, sizeof(int16_t) * speech_len);
        memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t));
        speech_data = (float*)malloc(sizeof(float) * speech_len);
        memset(speech_data, 0, sizeof(float) * speech_len);
        float scale = 1;
        if (data_type == 1) {
            scale = 32768;
        }
        for (int32_t i = 0; i != speech_len; ++i) {
            speech_data[i] = (float)speech_buff[i] / scale;
        }
        //resample
        if(*sampling_rate != MODEL_SAMPLE_RATE){
            WavResample(*sampling_rate, speech_data, speech_len);
        }
        for (int32_t i = 0; i != speech_len; ++i) {
            all_samples.emplace_back(speech_data[i]);
        }
        AudioFrame* frame = new AudioFrame(speech_len);
        frame_queue.push(frame);
        return true;
    }
    else
        return false;
}
bool Audio::LoadPcmwav(const char* filename, int32_t* sampling_rate)
{
    if (speech_data != NULL) {
@@ -870,24 +942,25 @@
    return true;
}
// FetchTpass: hands out the next frame waiting in asr_offline_queue.
// When the queue is non-empty, `frame` receives its front element, that element
// is popped, and 1 is returned; otherwise 0 is returned.
// NOTE(review): this span interleaves lines of the earlier
// FetchChunck(float *&, int) body (offset / speech_align_len / dout handling)
// with the FetchTpass body, and the braces do not balance as shown — confirm
// against the real file before relying on it.
int Audio::FetchChunck(float *&dout, int len)
int Audio::FetchTpass(AudioFrame *&frame)
{
    if (offset >= speech_align_len) {
        dout = NULL;
        return S_ERR;
    } else if (offset == speech_align_len - len) {
        dout = speech_data + offset;
        offset = speech_align_len;
        // Temporary workaround
        AudioFrame *frame = frame_queue.front();
        frame_queue.pop();
        delete frame;
        return S_END;
    if (asr_offline_queue.size() > 0) {
        frame = asr_offline_queue.front();
        asr_offline_queue.pop();
        return 1;
    } else {
        dout = speech_data + offset;
        offset += len;
        return S_MIDDLE;
        return 0;
    }
}
int Audio::FetchChunck(AudioFrame *&frame)
{
    if (asr_online_queue.size() > 0) {
        frame = asr_online_queue.front();
        asr_online_queue.pop();
        return 1;
    } else {
        return 0;
    }
}
@@ -956,7 +1029,6 @@
    std::vector<float> pcm_data(speech_data, speech_data+sp_len);
    vector<std::vector<int>> vad_segments = (offline_stream->vad_handle)->Infer(pcm_data);
    int seg_sample = MODEL_SAMPLE_RATE/1000;
    for(vector<int> segment:vad_segments)
    {
        frame = new AudioFrame();
@@ -968,7 +1040,6 @@
        frame = NULL;
    }
}
void Audio::Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments, bool input_finished)
{
@@ -984,4 +1055,161 @@
    vad_segments = vad_obj->Infer(pcm_data, input_finished);
}
// 2pass
void Audio::Split(VadModel* vad_obj, int chunk_len, bool input_finished, ASR_TYPE asr_mode)
{
    AudioFrame *frame;
    frame = frame_queue.front();
    frame_queue.pop();
    int sp_len = frame->GetLen();
    delete frame;
    frame = NULL;
    std::vector<float> pcm_data(speech_data, speech_data+sp_len);
    vector<std::vector<int>> vad_segments = vad_obj->Infer(pcm_data, input_finished);
    speech_end += sp_len/seg_sample;
    if(vad_segments.size() == 0){
        if(speech_start != -1){
            int start = speech_start*seg_sample;
            int end = speech_end*seg_sample;
            int buff_len = end-start;
            int step = chunk_len;
            if(asr_mode != ASR_OFFLINE){
                if(buff_len >= step){
                    frame = new AudioFrame(step);
                    frame->data = (float*)malloc(sizeof(float) * step);
                    memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
                    asr_online_queue.push(frame);
                    frame = NULL;
                    speech_start += step/seg_sample;
                }
            }
        }
    }else{
        for(auto vad_segment: vad_segments){
            int speech_start_i=-1, speech_end_i=-1;
            if(vad_segment[0] != -1){
                speech_start_i = vad_segment[0];
            }
            if(vad_segment[1] != -1){
                speech_end_i = vad_segment[1];
            }
            // [1, 100]
            if(speech_start_i != -1 && speech_end_i != -1){
                int start = speech_start_i*seg_sample;
                int end = speech_end_i*seg_sample;
                if(asr_mode != ASR_OFFLINE){
                    frame = new AudioFrame(end-start);
                    frame->is_final = true;
                    frame->data = (float*)malloc(sizeof(float) * (end-start));
                    memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
                    asr_online_queue.push(frame);
                    frame = NULL;
                }
                if(asr_mode != ASR_ONLINE){
                    frame = new AudioFrame(end-start);
                    frame->is_final = true;
                    frame->data = (float*)malloc(sizeof(float) * (end-start));
                    memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float));
                    asr_offline_queue.push(frame);
                    frame = NULL;
                }
                speech_start = -1;
                speech_offline_start = -1;
            // [70, -1]
            }else if(speech_start_i != -1){
                speech_start = speech_start_i;
                speech_offline_start = speech_start_i;
                int start = speech_start*seg_sample;
                int end = speech_end*seg_sample;
                int buff_len = end-start;
                int step = chunk_len;
                if(asr_mode != ASR_OFFLINE){
                    if(buff_len >= step){
                        frame = new AudioFrame(step);
                        frame->data = (float*)malloc(sizeof(float) * step);
                        memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float));
                        asr_online_queue.push(frame);
                        frame = NULL;
                        speech_start += step/seg_sample;
                    }
                }
            }else if(speech_end_i != -1){ // [-1,100]
                if(speech_start == -1 or speech_offline_start == -1){
                    LOG(ERROR) <<"Vad start is null while vad end is available." ;
                    exit(-1);
                }
                int start = speech_start*seg_sample;
                int offline_start = speech_offline_start*seg_sample;
                int end = speech_end_i*seg_sample;
                int buff_len = end-start;
                int step = chunk_len;
                if(asr_mode != ASR_ONLINE){
                    frame = new AudioFrame(end-offline_start);
                    frame->is_final = true;
                    frame->data = (float*)malloc(sizeof(float) * (end-offline_start));
                    memcpy(frame->data, all_samples.data()+offline_start-offset, (end-offline_start)*sizeof(float));
                    asr_offline_queue.push(frame);
                    frame = NULL;
                }
                if(asr_mode != ASR_OFFLINE){
                    if(buff_len > 0){
                        for (int sample_offset = 0; sample_offset < buff_len; sample_offset += std::min(step, buff_len - sample_offset)) {
                            bool is_final = false;
                            if (sample_offset + step >= buff_len - 1) {
                                step = buff_len - sample_offset;
                                is_final = true;
                            }
                            frame = new AudioFrame(step);
                            frame->is_final = is_final;
                            frame->data = (float*)malloc(sizeof(float) * step);
                            memcpy(frame->data, all_samples.data()+start-offset+sample_offset, step*sizeof(float));
                            asr_online_queue.push(frame);
                            frame = NULL;
                        }
                    }else{
                        frame = new AudioFrame(0);
                        frame->is_final = true;
                        asr_online_queue.push(frame);
                        frame = NULL;
                    }
                }
                speech_start = -1;
                speech_offline_start = -1;
            }
        }
    }
    // erase all_samples
    int vector_cache = MODEL_SAMPLE_RATE*2;
    if(speech_offline_start == -1){
        if(all_samples.size() > vector_cache){
            int erase_num = all_samples.size() - vector_cache;
            all_samples.erase(all_samples.begin(), all_samples.begin()+erase_num);
            offset += erase_num;
        }
    }else{
        int offline_start = speech_offline_start*seg_sample;
         if(offline_start-offset > vector_cache){
            int erase_num = offline_start-offset - vector_cache;
            all_samples.erase(all_samples.begin(), all_samples.begin()+erase_num);
            offset += erase_num;
        }
    }
}
} // namespace funasr