Merge branch 'main' of github.com:alibaba-damo-academy/FunASR
add
| | |
| | | |
| | | project(FunASRonnx) |
| | | |
| | | set(CMAKE_CXX_STANDARD 11) |
| | | # set(CMAKE_CXX_STANDARD 11) |
| | | set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.") |
| | | set(CMAKE_POSITION_INDEPENDENT_CODE ON) |
| | | |
| | | include(TestBigEndian) |
| | | test_big_endian(BIG_ENDIAN) |
| | | if(BIG_ENDIAN) |
| | | message("Big endian system") |
| | | else() |
| | | message("Little endian system") |
| | | endif() |
| | | |
| | | # for onnxruntime |
| | | |
| | | IF(WIN32) |
| | | |
| | | |
| | | if(CMAKE_CL_64) |
| | | link_directories(${ONNXRUNTIME_DIR}\\lib) |
| | | else() |
| | | add_definitions(-D_WIN_X86) |
| | | endif() |
| | | ELSE() |
| | | |
| | | |
| | | link_directories(${ONNXRUNTIME_DIR}/lib) |
| | | |
| | | link_directories(${ONNXRUNTIME_DIR}/lib) |
| | | endif() |
| | | |
| | | add_subdirectory("./third_party/yaml-cpp") |
| | |
| | | #include <queue> |
| | | #include <stdint.h> |
| | | |
| | | #ifndef model_sample_rate |
| | | #define model_sample_rate 16000 |
| | | #endif |
| | | #ifndef WAV_HEADER_SIZE |
| | | #define WAV_HEADER_SIZE 44 |
| | | #endif |
| | | |
| | | using namespace std; |
| | | |
| | | class AudioFrame { |
| | |
| | | int16_t *speech_buff; |
| | | int speech_len; |
| | | int speech_align_len; |
| | | int16_t sample_rate; |
| | | int offset; |
| | | float align_size; |
| | | int data_type; |
| | |
| | | Audio(int data_type, int size); |
| | | ~Audio(); |
| | | void disp(); |
| | | bool loadwav(const char* filename); |
| | | bool loadwav(const char* buf, int nLen); |
| | | bool loadpcmwav(const char* buf, int nFileLen); |
| | | bool loadpcmwav(const char* filename); |
| | | bool loadwav(const char* filename, int32_t* sampling_rate); |
| | | void wavResample(int32_t sampling_rate, const float *waveform, int32_t n); |
| | | bool loadwav(const char* buf, int nLen, int32_t* sampling_rate); |
| | | bool loadpcmwav(const char* buf, int nFileLen, int32_t* sampling_rate); |
| | | bool loadpcmwav(const char* filename, int32_t* sampling_rate); |
| | | int fetch_chunck(float *&dout, int len); |
| | | int fetch(float *&dout, int &len, int &flag); |
| | | void padding(); |
| | |
| | | // if not give a fnCallback ,it should be NULL |
| | | _FUNASRAPI FUNASR_RESULT FunASRRecogBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, FUNASR_MODE Mode, QM_CALLBACK fnCallback); |
| | | |
| | | _FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, FUNASR_MODE Mode, QM_CALLBACK fnCallback); |
| | | _FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback); |
| | | |
| | | _FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, FUNASR_MODE Mode, QM_CALLBACK fnCallback); |
| | | _FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback); |
| | | |
| | | _FUNASRAPI FUNASR_RESULT FunASRRecogFile(FUNASR_HANDLE handle, const char* szWavfile, FUNASR_MODE Mode, QM_CALLBACK fnCallback); |
| | | |
| | |
| | | #include <stdio.h> |
| | | #include <stdlib.h> |
| | | #include <string.h> |
| | | #include <fstream> |
| | | #include <assert.h> |
| | | |
| | | #include "Audio.h" |
| | | #include "precomp.h" |
| | | |
| | | using namespace std; |
| | | |
| | | // see http://soundfile.sapp.org/doc/WaveFormat/ |
| | | // Note: We assume little endian here |
/// In-memory image of the canonical 44-byte RIFF/WAVE header, read directly
/// from the start of a file or buffer. All multi-byte fields are interpreted
/// in host byte order, so a little-endian host is assumed.
struct WaveHeader {
  /// Checks that the header describes mono, 16-bit, uncompressed PCM.
  ///
  /// @return true when every field has the expected value; otherwise prints
  ///         the first mismatch to stdout and returns false.
  bool Validate() const {
    //                 F F I R
    if (chunk_id != 0x46464952) {
      printf("Expected chunk_id RIFF. Given: 0x%08x\n", chunk_id);
      return false;
    }
    //               E V A W
    if (format != 0x45564157) {
      printf("Expected format WAVE. Given: 0x%08x\n", format);
      return false;
    }

    if (subchunk1_id != 0x20746d66) {
      printf("Expected subchunk1_id 0x20746d66. Given: 0x%08x\n",
             subchunk1_id);
      return false;
    }

    if (subchunk1_size != 16) {  // 16 for PCM
      printf("Expected subchunk1_size 16. Given: %d\n", subchunk1_size);
      return false;
    }

    if (audio_format != 1) {  // 1 for PCM
      printf("Expected audio_format 1. Given: %d\n", audio_format);
      return false;
    }

    if (num_channels != 1) {  // we support only single channel for now
      printf("Expected single channel. Given: %d\n", num_channels);
      return false;
    }

    // Computed in 64 bits so that a corrupt sample_rate cannot overflow the
    // signed 32-bit product.
    const int64_t expected_byte_rate =
        static_cast<int64_t>(sample_rate) * num_channels * bits_per_sample / 8;
    if (byte_rate != expected_byte_rate) {
      printf("Expected byte_rate %lld. Given: %d\n",
             static_cast<long long>(expected_byte_rate), byte_rate);
      return false;
    }

    const int expected_block_align = num_channels * bits_per_sample / 8;
    if (block_align != expected_block_align) {
      printf("Expected block_align %d. Given: %d\n", expected_block_align,
             block_align);
      return false;
    }

    if (bits_per_sample != 16) {  // we support only 16 bits per sample
      printf("Expected bits_per_sample 16. Given: %d\n", bits_per_sample);
      return false;
    }
    return true;
  }

  /// Skips chunks until subchunk2_id is the "data" tag.
  ///
  /// On entry `is` must be positioned just after this header. Each skipped
  /// chunk's id and size are read into subchunk2_id / subchunk2_size. On
  /// return either subchunk2_id == "data" and `is` is positioned at the first
  /// payload byte, or `is` is in a failed state.
  ///
  /// See https://en.wikipedia.org/wiki/WAV#Metadata and
  /// https://www.robotplanet.dk/audio/wav_meta_data/riff_mci.pdf
  void SeekToDataChunk(std::istream &is) {
    //                              a t a d
    while (is && subchunk2_id != 0x61746164) {
      if (subchunk2_size < 0) {
        // A negative size would seek backwards and could re-read the same
        // chunk header forever; treat it as a malformed file.
        is.setstate(std::ios_base::failbit);
        break;
      }
      // RIFF chunks are word aligned: an odd-sized payload is followed by one
      // pad byte that is not counted in the chunk size.
      const std::streamoff skip =
          static_cast<std::streamoff>(subchunk2_size) + (subchunk2_size & 1);
      is.seekg(skip, std::istream::cur);
      is.read(reinterpret_cast<char *>(&subchunk2_id), sizeof(int32_t));
      is.read(reinterpret_cast<char *>(&subchunk2_size), sizeof(int32_t));
    }
  }

  int32_t chunk_id;        // "RIFF"
  int32_t chunk_size;
  int32_t format;          // "WAVE"
  int32_t subchunk1_id;    // "fmt "
  int32_t subchunk1_size;  // 16 for PCM
  int16_t audio_format;    // 1 for PCM
  int16_t num_channels;
  int32_t sample_rate;
  int32_t byte_rate;
  int16_t block_align;
  int16_t bits_per_sample;
  int32_t subchunk2_id;    // a tag of this chunk
  int32_t subchunk2_size;  // size of subchunk2
};
| | | static_assert(sizeof(WaveHeader) == WAV_HEADER_SIZE, ""); |
| | | |
| | | class AudioWindow { |
| | | private: |
| | |
| | | float frame_length = 400; |
| | | float frame_shift = 160; |
| | | float num_new_samples = |
| | | ceil((num_samples - 400) / frame_shift) * frame_shift + frame_length; |
| | | ceil((num_samples - frame_length) / frame_shift) * frame_shift + frame_length; |
| | | |
| | | end = start + num_new_samples; |
| | | len = (int)num_new_samples; |
| | |
| | | |
| | | void Audio::disp() |
| | | { |
| | | printf("Audio time is %f s. len is %d\n", (float)speech_len / 16000, |
| | | printf("Audio time is %f s. len is %d\n", (float)speech_len / model_sample_rate, |
| | | speech_len); |
| | | } |
| | | |
| | | float Audio::get_time_len() |
| | | { |
| | | return (float)speech_len / 16000; |
| | | //speech_len); |
| | | return (float)speech_len / model_sample_rate; |
| | | } |
| | | |
| | | bool Audio::loadwav(const char *filename) |
| | | void Audio::wavResample(int32_t sampling_rate, const float *waveform, |
| | | int32_t n) |
| | | { |
| | | printf( |
| | | "Creating a resampler:\n" |
| | | " in_sample_rate: %d\n" |
| | | " output_sample_rate: %d\n", |
| | | sampling_rate, static_cast<int32_t>(model_sample_rate)); |
| | | float min_freq = |
| | | std::min<int32_t>(sampling_rate, model_sample_rate); |
| | | float lowpass_cutoff = 0.99 * 0.5 * min_freq; |
| | | |
| | | int32_t lowpass_filter_width = 6; |
| | | //FIXME |
| | | //auto resampler = new LinearResample( |
| | | // sampling_rate, model_sample_rate, lowpass_cutoff, lowpass_filter_width); |
| | | auto resampler = std::make_unique<LinearResample>( |
| | | sampling_rate, model_sample_rate, lowpass_cutoff, lowpass_filter_width); |
| | | std::vector<float> samples; |
| | | resampler->Resample(waveform, n, true, &samples); |
| | | //reset speech_data |
| | | speech_len = samples.size(); |
| | | if (speech_data != NULL) { |
| | | free(speech_data); |
| | | } |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | copy(samples.begin(), samples.end(), speech_data); |
| | | } |
| | | |
| | | bool Audio::loadwav(const char *filename, int32_t* sampling_rate) |
| | | { |
| | | WaveHeader header; |
| | | if (speech_data != NULL) { |
| | | free(speech_data); |
| | | } |
| | | if (speech_buff != NULL) { |
| | | free(speech_buff); |
| | | } |
| | | |
| | | |
| | | offset = 0; |
| | | |
| | | FILE *fp; |
| | | fp = fopen(filename, "rb"); |
| | | if (fp == nullptr) |
| | | std::ifstream is(filename, std::ifstream::binary); |
| | | is.read(reinterpret_cast<char *>(&header), sizeof(header)); |
| | | if(!is){ |
| | | fprintf(stderr, "Failed to read %s\n", filename); |
| | | return false; |
| | | fseek(fp, 0, SEEK_END); /*定位到文件末尾*/ |
| | | uint32_t nFileLen = ftell(fp); /*得到文件大小*/ |
| | | fseek(fp, 44, SEEK_SET); /*跳过wav文件头*/ |
| | | |
| | | speech_len = (nFileLen - 44) / 2; |
| | | speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size); |
| | | speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_align_len); |
| | | } |
| | | |
| | | *sampling_rate = header.sample_rate; |
| | | // header.subchunk2_size contains the number of bytes in the data. |
| | | // As we assume each sample contains two bytes, so it is divided by 2 here |
| | | speech_len = header.subchunk2_size / 2; |
| | | speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_len); |
| | | |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_align_len); |
| | | int ret = fread(speech_buff, sizeof(int16_t), speech_len, fp); |
| | | fclose(fp); |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | is.read(reinterpret_cast<char *>(speech_buff), header.subchunk2_size); |
| | | if (!is) { |
| | | fprintf(stderr, "Failed to read %s\n", filename); |
| | | return false; |
| | | } |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_align_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_align_len); |
| | | int i; |
| | | float scale = 1; |
| | | |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | } |
| | | |
| | | for (i = 0; i < speech_len; i++) { |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | } |
| | | |
| | | //resample |
| | | if(*sampling_rate != model_sample_rate){ |
| | | wavResample(*sampling_rate, speech_data, speech_len); |
| | | } |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | | frame_queue.push(frame); |
| | | |
| | | |
| | | return true; |
| | | } |
| | |
| | | return false; |
| | | } |
| | | |
| | | |
| | | bool Audio::loadwav(const char* buf, int nFileLen) |
| | | bool Audio::loadwav(const char* buf, int nFileLen, int32_t* sampling_rate) |
| | | { |
| | | |
| | | |
| | | |
| | | WaveHeader header; |
| | | if (speech_data != NULL) { |
| | | free(speech_data); |
| | | } |
| | | if (speech_buff != NULL) { |
| | | free(speech_buff); |
| | | } |
| | | |
| | | offset = 0; |
| | | |
| | | size_t nOffset = 0; |
| | | std::memcpy(&header, buf, sizeof(header)); |
| | | |
| | | #define WAV_HEADER_SIZE 44 |
| | | |
| | | speech_len = (nFileLen - WAV_HEADER_SIZE) / 2; |
| | | speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size); |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len); |
| | | *sampling_rate = header.sample_rate; |
| | | speech_len = header.subchunk2_size / 2; |
| | | speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_len); |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_align_len); |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | memcpy((void*)speech_buff, (const void*)(buf + WAV_HEADER_SIZE), speech_len * sizeof(int16_t)); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_align_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_align_len); |
| | | int i; |
| | | float scale = 1; |
| | | |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | } |
| | | |
| | | for (i = 0; i < speech_len; i++) { |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | } |
| | | |
| | | //resample |
| | | if(*sampling_rate != model_sample_rate){ |
| | | wavResample(*sampling_rate, speech_data, speech_len); |
| | | } |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | | frame_queue.push(frame); |
| | | |
| | | return true; |
| | | } |
| | | else |
| | | return false; |
| | | |
| | | } |
| | | |
| | | |
| | | bool Audio::loadpcmwav(const char* buf, int nBufLen) |
| | | bool Audio::loadpcmwav(const char* buf, int nBufLen, int32_t* sampling_rate) |
| | | { |
| | | if (speech_data != NULL) { |
| | | free(speech_data); |
| | |
| | | } |
| | | offset = 0; |
| | | |
| | | size_t nOffset = 0; |
| | | |
| | | |
| | | |
| | | speech_len = nBufLen / 2; |
| | | speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size); |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len); |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_align_len); |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t)); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_align_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_align_len); |
| | | |
| | | |
| | | int i; |
| | | float scale = 1; |
| | | |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | } |
| | | |
| | | for (i = 0; i < speech_len; i++) { |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | } |
| | | |
| | | //resample |
| | | if(*sampling_rate != model_sample_rate){ |
| | | wavResample(*sampling_rate, speech_data, speech_len); |
| | | } |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | |
| | | } |
| | | else |
| | | return false; |
| | | |
| | | |
| | | } |
| | | |
| | | bool Audio::loadpcmwav(const char* filename) |
| | | bool Audio::loadpcmwav(const char* filename, int32_t* sampling_rate) |
| | | { |
| | | |
| | | if (speech_data != NULL) { |
| | | free(speech_data); |
| | | } |
| | |
| | | fseek(fp, 0, SEEK_SET); |
| | | |
| | | speech_len = (nFileLen) / 2; |
| | | speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size); |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len); |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_align_len); |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | int ret = fread(speech_buff, sizeof(int16_t), speech_len, fp); |
| | | fclose(fp); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_align_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_align_len); |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | |
| | | |
| | | int i; |
| | | float scale = 1; |
| | | |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | } |
| | | |
| | | for (i = 0; i < speech_len; i++) { |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | } |
| | | |
| | | //resample |
| | | if(*sampling_rate != model_sample_rate){ |
| | | wavResample(*sampling_rate, speech_data, speech_len); |
| | | } |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | | frame_queue.push(frame); |
| | | |
| | | |
| | | return true; |
| | | } |
| | |
| | | return false; |
| | | |
| | | } |
| | | |
| | | |
| | | int Audio::fetch_chunck(float *&dout, int len) |
| | | { |
| | |
| | | |
| | | file(GLOB files1 "*.cpp") |
| | | file(GLOB files2 "*.cc") |
| | | file(GLOB files4 "paraformer/*.cpp") |
| | | |
| | | set(files ${files1} ${files2} ${files3} ${files4}) |
| | |
| | | { |
| | | ifstream in(filename); |
| | | loadVocabFromYaml(filename); |
| | | |
| | | /* |
| | | string line; |
| | | if (in) // 有该文件 |
| | | { |
| | | while (getline(in, line)) // line中不包括每行的换行符 |
| | | { |
| | | vocab.push_back(line); |
| | | } |
| | | } |
| | | else{ |
| | | printf("Cannot load vocab from: %s, there must be file vocab.txt", filename); |
| | | exit(-1); |
| | | } |
| | | */ |
| | | } |
| | | Vocab::~Vocab() |
| | | { |
| | |
| | | if (!pRecogObj) |
| | | return nullptr; |
| | | |
| | | int32_t sampling_rate = -1; |
| | | Audio audio(1); |
| | | if (!audio.loadwav(szBuf, nLen)) |
| | | if (!audio.loadwav(szBuf, nLen, &sampling_rate)) |
| | | return nullptr; |
| | | //audio.split(); |
| | | |
| | |
| | | return pResult; |
| | | } |
| | | |
| | | _FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, FUNASR_MODE Mode, QM_CALLBACK fnCallback) |
| | | _FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback) |
| | | { |
| | | Model* pRecogObj = (Model*)handle; |
| | | if (!pRecogObj) |
| | | return nullptr; |
| | | |
| | | Audio audio(1); |
| | | if (!audio.loadpcmwav(szBuf, nLen)) |
| | | if (!audio.loadpcmwav(szBuf, nLen, &sampling_rate)) |
| | | return nullptr; |
| | | //audio.split(); |
| | | |
| | |
| | | return pResult; |
| | | } |
| | | |
| | | _FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, FUNASR_MODE Mode, QM_CALLBACK fnCallback) |
| | | _FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback) |
| | | { |
| | | Model* pRecogObj = (Model*)handle; |
| | | if (!pRecogObj) |
| | | return nullptr; |
| | | |
| | | Audio audio(1); |
| | | if (!audio.loadpcmwav(szFileName)) |
| | | if (!audio.loadpcmwav(szFileName, &sampling_rate)) |
| | | return nullptr; |
| | | //audio.split(); |
| | | |
| | |
| | | Model* pRecogObj = (Model*)handle; |
| | | if (!pRecogObj) |
| | | return nullptr; |
| | | |
| | | |
| | | int32_t sampling_rate = -1; |
| | | Audio audio(1); |
| | | if(!audio.loadwav(szWavfile)) |
| | | if(!audio.loadwav(szWavfile, &sampling_rate)) |
| | | return nullptr; |
| | | //audio.split(); |
| | | |
| | |
| | | |
| | | void ModelImp::reset() |
| | | { |
| | | printf("Not Imp!!!!!!\n"); |
| | | } |
| | | |
| | | void ModelImp::apply_lfr(Tensor<float>*& din) |
| | |
| | | #include "FeatureQueue.h" |
| | | #include "SpeechWrap.h" |
| | | #include <Audio.h> |
| | | #include "resample.h" |
| | | #include "Model.h" |
| | | #include "paraformer_onnx.h" |
| | | #include "libfunasrapi.h" |
| New file |
| | |
| | | /** |
| | | * Copyright 2013 Pegah Ghahremani |
| | | * 2014 IMSL, PKU-HKUST (author: Wei Shi) |
| | | * 2014 Yanqing Sun, Junjie Wang |
| | | * 2014 Johns Hopkins University (author: Daniel Povey) |
| | | * Copyright 2023 Xiaomi Corporation (authors: Fangjun Kuang) |
| | | * |
| | | * See LICENSE for clarification regarding multiple authors |
| | | * |
| | | * Licensed under the Apache License, Version 2.0 (the "License"); |
| | | * you may not use this file except in compliance with the License. |
| | | * You may obtain a copy of the License at |
| | | * |
| | | * http://www.apache.org/licenses/LICENSE-2.0 |
| | | * |
| | | * Unless required by applicable law or agreed to in writing, software |
| | | * distributed under the License is distributed on an "AS IS" BASIS, |
| | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| | | * See the License for the specific language governing permissions and |
| | | * limitations under the License. |
| | | */ |
| | | // this file is copied and modified from |
| | | // kaldi/src/feat/resample.cc |
| | | |
| | | #include "resample.h" |
| | | |
| | | #include <assert.h> |
| | | #include <math.h> |
| | | #include <stdio.h> |
| | | |
| | | #include <cstdlib> |
| | | #include <type_traits> |
| | | |
| | | #ifndef M_2PI |
| | | #define M_2PI 6.283185307179586476925286766559005 |
| | | #endif |
| | | |
| | | #ifndef M_PI |
| | | #define M_PI 3.1415926535897932384626433832795 |
| | | #endif |
| | | |
/// Greatest common divisor of two integers, always returned as a
/// non-negative value. Adapted from kaldi/src/base/kaldi-math.h.
///
/// If both arguments are zero the GCD is undefined: an error is written to
/// stderr and the process exits with status -1.
template <class I>
I Gcd(I m, I n) {
  static_assert(std::is_integral<I>::value, "");

  if (m == 0 && n == 0) {  // every integer divides zero, so no GCD exists
    fprintf(stderr, "Undefined GCD since m = 0, n = 0.\n");
    exit(-1);
  }

  // Euclid's algorithm; when exactly one argument is zero this falls straight
  // through to the magnitude of the other one.
  while (n != 0) {
    const I remainder = m % n;
    m = n;
    n = remainder;
  }
  return m > 0 ? m : -m;
}
| | | |
/// Least common multiple of two strictly positive integers.
/// Adapted from kaldi/src/base/kaldi-math.h. Asserts that both inputs are
/// positive.
template <class I>
I Lcm(I m, I n) {
  assert(m > 0 && n > 0);
  const I divisor = Gcd(m, n);
  const I m_reduced = m / divisor;
  const I n_reduced = n / divisor;
  return divisor * m_reduced * n_reduced;
}
| | | |
// Inner product of the first n elements of a and b, accumulated left to
// right in single precision.
static float DotProduct(const float *a, const float *b, int32_t n) {
  float acc = 0;
  const float *const a_end = a + n;
  const float *pb = b;
  for (const float *pa = a; pa != a_end; ++pa, ++pb) {
    acc += *pa * *pb;
  }
  return acc;
}
| | | |
| | | LinearResample::LinearResample(int32_t samp_rate_in_hz, |
| | | int32_t samp_rate_out_hz, float filter_cutoff_hz, |
| | | int32_t num_zeros) |
| | | : samp_rate_in_(samp_rate_in_hz), |
| | | samp_rate_out_(samp_rate_out_hz), |
| | | filter_cutoff_(filter_cutoff_hz), |
| | | num_zeros_(num_zeros) { |
| | | assert(samp_rate_in_hz > 0.0 && samp_rate_out_hz > 0.0 && |
| | | filter_cutoff_hz > 0.0 && filter_cutoff_hz * 2 <= samp_rate_in_hz && |
| | | filter_cutoff_hz * 2 <= samp_rate_out_hz && num_zeros > 0); |
| | | |
| | | // base_freq is the frequency of the repeating unit, which is the gcd |
| | | // of the input frequencies. |
| | | int32_t base_freq = Gcd(samp_rate_in_, samp_rate_out_); |
| | | input_samples_in_unit_ = samp_rate_in_ / base_freq; |
| | | output_samples_in_unit_ = samp_rate_out_ / base_freq; |
| | | |
| | | SetIndexesAndWeights(); |
| | | Reset(); |
| | | } |
| | | |
| | | void LinearResample::SetIndexesAndWeights() { |
| | | first_index_.resize(output_samples_in_unit_); |
| | | weights_.resize(output_samples_in_unit_); |
| | | |
| | | double window_width = num_zeros_ / (2.0 * filter_cutoff_); |
| | | |
| | | for (int32_t i = 0; i < output_samples_in_unit_; i++) { |
| | | double output_t = i / static_cast<double>(samp_rate_out_); |
| | | double min_t = output_t - window_width, max_t = output_t + window_width; |
| | | // we do ceil on the min and floor on the max, because if we did it |
| | | // the other way around we would unnecessarily include indexes just |
| | | // outside the window, with zero coefficients. It's possible |
| | | // if the arguments to the ceil and floor expressions are integers |
| | | // (e.g. if filter_cutoff_ has an exact ratio with the sample rates), |
| | | // that we unnecessarily include something with a zero coefficient, |
| | | // but this is only a slight efficiency issue. |
| | | int32_t min_input_index = ceil(min_t * samp_rate_in_), |
| | | max_input_index = floor(max_t * samp_rate_in_), |
| | | num_indices = max_input_index - min_input_index + 1; |
| | | first_index_[i] = min_input_index; |
| | | weights_[i].resize(num_indices); |
| | | for (int32_t j = 0; j < num_indices; j++) { |
| | | int32_t input_index = min_input_index + j; |
| | | double input_t = input_index / static_cast<double>(samp_rate_in_), |
| | | delta_t = input_t - output_t; |
| | | // sign of delta_t doesn't matter. |
| | | weights_[i][j] = FilterFunc(delta_t) / samp_rate_in_; |
| | | } |
| | | } |
| | | } |
| | | |
| | | /** Here, t is a time in seconds representing an offset from |
| | | the center of the windowed filter function, and FilterFunction(t) |
| | | returns the windowed filter function, described |
| | | in the header as h(t) = f(t)g(t), evaluated at t. |
| | | */ |
| | | float LinearResample::FilterFunc(float t) const { |
| | | float window, // raised-cosine (Hanning) window of width |
| | | // num_zeros_/2*filter_cutoff_ |
| | | filter; // sinc filter function |
| | | if (fabs(t) < num_zeros_ / (2.0 * filter_cutoff_)) |
| | | window = 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t)); |
| | | else |
| | | window = 0.0; // outside support of window function |
| | | if (t != 0) |
| | | filter = sin(M_2PI * filter_cutoff_ * t) / (M_PI * t); |
| | | else |
| | | filter = 2 * filter_cutoff_; // limit of the function at t = 0 |
| | | return filter * window; |
| | | } |
| | | |
| | | void LinearResample::Reset() { |
| | | input_sample_offset_ = 0; |
| | | output_sample_offset_ = 0; |
| | | input_remainder_.resize(0); |
| | | } |
| | | |
| | | void LinearResample::Resample(const float *input, int32_t input_dim, bool flush, |
| | | std::vector<float> *output) { |
| | | int64_t tot_input_samp = input_sample_offset_ + input_dim, |
| | | tot_output_samp = GetNumOutputSamples(tot_input_samp, flush); |
| | | |
| | | assert(tot_output_samp >= output_sample_offset_); |
| | | |
| | | output->resize(tot_output_samp - output_sample_offset_); |
| | | |
| | | // samp_out is the index into the total output signal, not just the part |
| | | // of it we are producing here. |
| | | for (int64_t samp_out = output_sample_offset_; samp_out < tot_output_samp; |
| | | samp_out++) { |
| | | int64_t first_samp_in; |
| | | int32_t samp_out_wrapped; |
| | | GetIndexes(samp_out, &first_samp_in, &samp_out_wrapped); |
| | | const std::vector<float> &weights = weights_[samp_out_wrapped]; |
| | | // first_input_index is the first index into "input" that we have a weight |
| | | // for. |
| | | int32_t first_input_index = |
| | | static_cast<int32_t>(first_samp_in - input_sample_offset_); |
| | | float this_output; |
| | | if (first_input_index >= 0 && |
| | | first_input_index + static_cast<int32_t>(weights.size()) <= input_dim) { |
| | | this_output = |
| | | DotProduct(input + first_input_index, weights.data(), weights.size()); |
| | | } else { // Handle edge cases. |
| | | this_output = 0.0; |
| | | for (int32_t i = 0; i < static_cast<int32_t>(weights.size()); i++) { |
| | | float weight = weights[i]; |
| | | int32_t input_index = first_input_index + i; |
| | | if (input_index < 0 && |
| | | static_cast<int32_t>(input_remainder_.size()) + input_index >= 0) { |
| | | this_output += |
| | | weight * input_remainder_[input_remainder_.size() + input_index]; |
| | | } else if (input_index >= 0 && input_index < input_dim) { |
| | | this_output += weight * input[input_index]; |
| | | } else if (input_index >= input_dim) { |
| | | // We're past the end of the input and are adding zero; should only |
| | | // happen if the user specified flush == true, or else we would not |
| | | // be trying to output this sample. |
| | | assert(flush); |
| | | } |
| | | } |
| | | } |
| | | int32_t output_index = |
| | | static_cast<int32_t>(samp_out - output_sample_offset_); |
| | | (*output)[output_index] = this_output; |
| | | } |
| | | |
| | | if (flush) { |
| | | Reset(); // Reset the internal state. |
| | | } else { |
| | | SetRemainder(input, input_dim); |
| | | input_sample_offset_ = tot_input_samp; |
| | | output_sample_offset_ = tot_output_samp; |
| | | } |
| | | } |
| | | |
| | | int64_t LinearResample::GetNumOutputSamples(int64_t input_num_samp, |
| | | bool flush) const { |
| | | // For exact computation, we measure time in "ticks" of 1.0 / tick_freq, |
| | | // where tick_freq is the least common multiple of samp_rate_in_ and |
| | | // samp_rate_out_. |
| | | int32_t tick_freq = Lcm(samp_rate_in_, samp_rate_out_); |
| | | int32_t ticks_per_input_period = tick_freq / samp_rate_in_; |
| | | |
| | | // work out the number of ticks in the time interval |
| | | // [ 0, input_num_samp/samp_rate_in_ ). |
| | | int64_t interval_length_in_ticks = input_num_samp * ticks_per_input_period; |
| | | if (!flush) { |
| | | float window_width = num_zeros_ / (2.0 * filter_cutoff_); |
| | | // To count the window-width in ticks we take the floor. This |
| | | // is because since we're looking for the largest integer num-out-samp |
| | | // that fits in the interval, which is open on the right, a reduction |
| | | // in interval length of less than a tick will never make a difference. |
| | | // For example, the largest integer in the interval [ 0, 2 ) and the |
| | | // largest integer in the interval [ 0, 2 - 0.9 ) are the same (both one). |
| | | // So when we're subtracting the window-width we can ignore the fractional |
| | | // part. |
| | | int32_t window_width_ticks = floor(window_width * tick_freq); |
| | | // The time-period of the output that we can sample gets reduced |
| | | // by the window-width (which is actually the distance from the |
| | | // center to the edge of the windowing function) if we're not |
| | | // "flushing the output". |
| | | interval_length_in_ticks -= window_width_ticks; |
| | | } |
| | | if (interval_length_in_ticks <= 0) return 0; |
| | | |
| | | int32_t ticks_per_output_period = tick_freq / samp_rate_out_; |
| | | // Get the last output-sample in the closed interval, i.e. replacing [ ) with |
| | | // [ ]. Note: integer division rounds down. See |
| | | // http://en.wikipedia.org/wiki/Interval_(mathematics) for an explanation of |
| | | // the notation. |
| | | int64_t last_output_samp = interval_length_in_ticks / ticks_per_output_period; |
| | | // We need the last output-sample in the open interval, so if it takes us to |
| | | // the end of the interval exactly, subtract one. |
| | | if (last_output_samp * ticks_per_output_period == interval_length_in_ticks) |
| | | last_output_samp--; |
| | | |
| | | // First output-sample index is zero, so the number of output samples |
| | | // is the last output-sample plus one. |
| | | int64_t num_output_samp = last_output_samp + 1; |
| | | return num_output_samp; |
| | | } |
| | | |
| | | // inline |
| | | void LinearResample::GetIndexes(int64_t samp_out, int64_t *first_samp_in, |
| | | int32_t *samp_out_wrapped) const { |
| | | // A unit is the smallest nonzero amount of time that is an exact |
| | | // multiple of the input and output sample periods. The unit index |
| | | // is the answer to "which numbered unit we are in". |
| | | int64_t unit_index = samp_out / output_samples_in_unit_; |
| | | // samp_out_wrapped is equal to samp_out % output_samples_in_unit_ |
| | | *samp_out_wrapped = |
| | | static_cast<int32_t>(samp_out - unit_index * output_samples_in_unit_); |
| | | *first_samp_in = |
| | | first_index_[*samp_out_wrapped] + unit_index * input_samples_in_unit_; |
| | | } |
| | | |
| | | void LinearResample::SetRemainder(const float *input, int32_t input_dim) { |
| | | std::vector<float> old_remainder(input_remainder_); |
| | | // max_remainder_needed is the width of the filter from side to side, |
| | | // measured in input samples. you might think it should be half that, |
| | | // but you have to consider that you might be wanting to output samples |
| | | // that are "in the past" relative to the beginning of the latest |
| | | // input... anyway, storing more remainder than needed is not harmful. |
| | | int32_t max_remainder_needed = |
| | | ceil(samp_rate_in_ * num_zeros_ / filter_cutoff_); |
| | | input_remainder_.resize(max_remainder_needed); |
| | | for (int32_t index = -static_cast<int32_t>(input_remainder_.size()); |
| | | index < 0; index++) { |
| | | // we interpret "index" as an offset from the end of "input" and |
| | | // from the end of input_remainder_. |
| | | int32_t input_index = index + input_dim; |
| | | if (input_index >= 0) { |
| | | input_remainder_[index + static_cast<int32_t>(input_remainder_.size())] = |
| | | input[input_index]; |
| | | } else if (input_index + static_cast<int32_t>(old_remainder.size()) >= 0) { |
| | | input_remainder_[index + static_cast<int32_t>(input_remainder_.size())] = |
| | | old_remainder[input_index + |
| | | static_cast<int32_t>(old_remainder.size())]; |
| | | // else leave it at zero. |
| | | } |
| | | } |
| | | } |
| New file |
| | |
| | | /** |
| | | * Copyright 2013 Pegah Ghahremani |
| | | * 2014 IMSL, PKU-HKUST (author: Wei Shi) |
| | | * 2014 Yanqing Sun, Junjie Wang |
| | | * 2014 Johns Hopkins University (author: Daniel Povey) |
| | | * Copyright 2023 Xiaomi Corporation (authors: Fangjun Kuang) |
| | | * |
| | | * See LICENSE for clarification regarding multiple authors |
| | | * |
| | | * Licensed under the Apache License, Version 2.0 (the "License"); |
| | | * you may not use this file except in compliance with the License. |
| | | * You may obtain a copy of the License at |
| | | * |
| | | * http://www.apache.org/licenses/LICENSE-2.0 |
| | | * |
| | | * Unless required by applicable law or agreed to in writing, software |
| | | * distributed under the License is distributed on an "AS IS" BASIS, |
| | | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| | | * See the License for the specific language governing permissions and |
| | | * limitations under the License. |
| | | */ |
| | | // this file is copied and modified from |
| | | // kaldi/src/feat/resample.h |
| | | |
| | | #include <cstdint> |
| | | #include <vector> |
| | | |
| | | |
| | | /* |
| | | We require that the input and output sampling rate be specified as |
| | | integers, as this is an easy way to specify that their ratio be rational. |
| | | */ |
| | | |
/// Resampler between two integer sampling rates.  A signal can be converted
/// in one call or fed piece by piece (see Resample()).
class LinearResample {
 public:
  /// Constructor.  Both sample rates are integers because a common divisor
  /// of the two has to be found.
  ///
  /// @param samp_rate_in_hz   Input sampling rate.
  /// @param samp_rate_out_hz  Output sampling rate.
  /// @param filter_cutoff_hz  Filter cutoff; needs to be less than
  ///                          samp_rate_in_hz/2 and less than
  ///                          samp_rate_out_hz/2.
  /// @param num_zeros         Controls the sharpness of the filter: more is
  ///                          sharper but less efficient.  Around 4 to 10 is
  ///                          suggested for normal use.
  LinearResample(int32_t samp_rate_in_hz, int32_t samp_rate_out_hz,
                 float filter_cutoff_hz, int32_t num_zeros);

  /// Resets the state of the object prior to processing a new signal.  This
  /// is only necessary if Resample(x, x_size, false, y) was called for some
  /// signal (which leaves a remainder of that signal stored in the object)
  /// and processing was then abandoned before Resample(x, x_size, true, y)
  /// was called for the last piece.  Calling it unnecessarily between
  /// signals does no harm.
  void Reset();

  /// Does the resampling.  When called with flush == true without any
  /// earlier call with flush == false, it simply resamples the input signal
  /// (the output is resized to a suitable number of samples).
  ///
  /// A signal may also be processed a piece at a time.  Given pieces
  /// piece1, piece2, ... pieceN, call
  /// \code{.cc}
  ///      Resample(piece1, piece1_size, false, &output1);
  ///      Resample(piece2, piece2_size, false, &output2);
  ///      Resample(piece3, piece3_size, true, &output3);
  /// \endcode
  /// With flush == false the last few samples are not output but are
  /// remembered, so that a following piece of the input signal can be
  /// processed correctly.  After such a call the object carries internal
  /// state, which Reset() removes.
  /// Empty input is acceptable.
  void Resample(const float *input, int32_t input_dim, bool flush,
                std::vector<float> *output);

  /// Accessors for the input and output sampling rates (for checks, for
  /// example).
  int32_t GetInputSamplingRate() const { return samp_rate_in_; }
  int32_t GetOutputSamplingRate() const { return samp_rate_out_; }

 private:
  void SetIndexesAndWeights();

  float FilterFunc(float) const;

  /// Number of output samples produced for a signal with "input_num_samp"
  /// input samples.
  ///  - flush == true: the largest n such that (n/samp_rate_out_) lies in the
  ///    half-open interval [ 0, input_num_samp/samp_rate_in_ ).
  ///  - flush == false: with window_width defined as
  ///    num_zeros / (2.0 * filter_cutoff_), the largest n such that
  ///    (n/samp_rate_out_) lies in the interval
  ///    [ 0, input_num_samp/samp_rate_in_ - window_width ).
  int64_t GetNumOutputSamples(int64_t input_num_samp, bool flush) const;

  /// For an output-sample index, writes to *first_samp_in the first
  /// input-sample index that has a weight on it (may be negative), and to
  /// *samp_out_wrapped the index into weights_ holding the corresponding
  /// weights on the input.
  inline void GetIndexes(int64_t samp_out, int64_t *first_samp_in,
                         int32_t *samp_out_wrapped) const;

  void SetRemainder(const float *input, int32_t input_dim);

 private:
  // ---- Values provided by the user. ----
  int32_t samp_rate_in_;
  int32_t samp_rate_out_;
  float filter_cutoff_;
  int32_t num_zeros_;

  /// Number of input samples in the smallest repeating unit:
  /// samp_rate_in_hz / Gcd(samp_rate_in_hz, samp_rate_out_hz).
  int32_t input_samples_in_unit_;

  /// Number of output samples in the smallest repeating unit:
  /// samp_rate_out_hz / Gcd(samp_rate_in_hz, samp_rate_out_hz).
  int32_t output_samples_in_unit_;

  /// Per output-sample index: the first input-sample index that is summed
  /// over.  May be negative; any truncation at the beginning is handled
  /// separately.  Only the first few output samples are stored, but the
  /// correct input-sample index can be extrapolated for arbitrary output
  /// samples.
  std::vector<int32_t> first_index_;

  /// Per output-sample index: the weights on the input samples.
  std::vector<std::vector<float>> weights_;

  // ---- Position inside the current signal, when that signal is supplied
  // ---- over multiple calls to Resample().

  /// Number of input samples already received for this signal (including
  /// anything in input_remainder_).
  int64_t input_sample_offset_;
  /// Number of samples already output for this signal.
  int64_t output_sample_offset_;
  /// A small trailing part of the previously seen input signal.
  std::vector<float> input_remainder_;
};