From 1a6242fd4fb9e3b3827908520c876ee541b60af5 Mon Sep 17 00:00:00 2001
From: lyblsgo <lyblsgo@163.com>
Date: Fri, 14 Apr 2023 13:08:33 +0800
Subject: [PATCH] support arbitrary sampling rate
---
funasr/runtime/onnxruntime/src/CMakeLists.txt | 1
funasr/runtime/onnxruntime/src/Audio.cpp | 262 ++++++++++++++------
funasr/runtime/onnxruntime/src/libfunasrapi.cpp | 16
funasr/runtime/onnxruntime/CMakeLists.txt | 19
funasr/runtime/onnxruntime/src/paraformer_onnx.cpp | 1
funasr/runtime/onnxruntime/src/precomp.h | 1
funasr/runtime/onnxruntime/src/resample.h | 137 ++++++++++
funasr/runtime/onnxruntime/src/resample.cc | 305 +++++++++++++++++++++++
funasr/runtime/onnxruntime/include/libfunasrapi.h | 4
funasr/runtime/onnxruntime/include/Audio.h | 17
funasr/runtime/onnxruntime/src/Vocab.cpp | 15 -
11 files changed, 661 insertions(+), 117 deletions(-)
diff --git a/funasr/runtime/onnxruntime/CMakeLists.txt b/funasr/runtime/onnxruntime/CMakeLists.txt
index 4ffe0f3..6feef92 100644
--- a/funasr/runtime/onnxruntime/CMakeLists.txt
+++ b/funasr/runtime/onnxruntime/CMakeLists.txt
@@ -2,24 +2,27 @@
project(FunASRonnx)
-set(CMAKE_CXX_STANDARD 11)
+# set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.")
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+include(TestBigEndian)
+test_big_endian(BIG_ENDIAN)
+if(BIG_ENDIAN)
+ message("Big endian system")
+else()
+ message("Little endian system")
+endif()
+
# for onnxruntime
-
IF(WIN32)
-
-
if(CMAKE_CL_64)
link_directories(${ONNXRUNTIME_DIR}\\lib)
else()
add_definitions(-D_WIN_X86)
endif()
ELSE()
-
-
-link_directories(${ONNXRUNTIME_DIR}/lib)
-
+ link_directories(${ONNXRUNTIME_DIR}/lib)
endif()
add_subdirectory("./third_party/yaml-cpp")
diff --git a/funasr/runtime/onnxruntime/include/Audio.h b/funasr/runtime/onnxruntime/include/Audio.h
index da5e82c..ec49a9f 100644
--- a/funasr/runtime/onnxruntime/include/Audio.h
+++ b/funasr/runtime/onnxruntime/include/Audio.h
@@ -6,6 +6,13 @@
#include <queue>
#include <stdint.h>
+#ifndef model_sample_rate
+#define model_sample_rate 16000
+#endif
+#ifndef WAV_HEADER_SIZE
+#define WAV_HEADER_SIZE 44
+#endif
+
using namespace std;
class AudioFrame {
@@ -32,7 +39,6 @@
int16_t *speech_buff;
int speech_len;
int speech_align_len;
- int16_t sample_rate;
int offset;
float align_size;
int data_type;
@@ -43,10 +49,11 @@
Audio(int data_type, int size);
~Audio();
void disp();
- bool loadwav(const char* filename);
- bool loadwav(const char* buf, int nLen);
- bool loadpcmwav(const char* buf, int nFileLen);
- bool loadpcmwav(const char* filename);
+ bool loadwav(const char* filename, int32_t* sampling_rate);
+ void wavResample(int32_t sampling_rate, const float *waveform, int32_t n);
+ bool loadwav(const char* buf, int nLen, int32_t* sampling_rate);
+ bool loadpcmwav(const char* buf, int nFileLen, int32_t* sampling_rate);
+ bool loadpcmwav(const char* filename, int32_t* sampling_rate);
int fetch_chunck(float *&dout, int len);
int fetch(float *&dout, int &len, int &flag);
void padding();
diff --git a/funasr/runtime/onnxruntime/include/libfunasrapi.h b/funasr/runtime/onnxruntime/include/libfunasrapi.h
index 6e81fa9..9bc37e7 100644
--- a/funasr/runtime/onnxruntime/include/libfunasrapi.h
+++ b/funasr/runtime/onnxruntime/include/libfunasrapi.h
@@ -55,9 +55,9 @@
// if not give a fnCallback ,it should be NULL
_FUNASRAPI FUNASR_RESULT FunASRRecogBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, FUNASR_MODE Mode, QM_CALLBACK fnCallback);
-_FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, FUNASR_MODE Mode, QM_CALLBACK fnCallback);
+_FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback);
-_FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, FUNASR_MODE Mode, QM_CALLBACK fnCallback);
+_FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback);
_FUNASRAPI FUNASR_RESULT FunASRRecogFile(FUNASR_HANDLE handle, const char* szWavfile, FUNASR_MODE Mode, QM_CALLBACK fnCallback);
diff --git a/funasr/runtime/onnxruntime/src/Audio.cpp b/funasr/runtime/onnxruntime/src/Audio.cpp
index bce3a90..38b6de8 100644
--- a/funasr/runtime/onnxruntime/src/Audio.cpp
+++ b/funasr/runtime/onnxruntime/src/Audio.cpp
@@ -3,10 +3,95 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <fstream>
+#include <assert.h>
#include "Audio.h"
+#include "precomp.h"
using namespace std;
+
+// In-memory image of the canonical 44-byte RIFF/WAVE header for PCM audio.
+// The members are declared in on-disk order so the struct can be filled with
+// a single read()/memcpy() of the first bytes of a file.
+// See http://soundfile.sapp.org/doc/WaveFormat/
+// Note: the magic numbers below assume a little-endian host.
+struct WaveHeader {
+  // Checks that the header describes mono, 16-bit, uncompressed PCM.
+  // Returns true when every field has the expected value; otherwise prints
+  // the offending field and returns false.
+  bool Validate() const {
+    // "RIFF" read as a little-endian 32-bit integer.
+    if (chunk_id != 0x46464952) {
+      printf("Expected chunk_id RIFF. Given: 0x%08x\n", chunk_id);
+      return false;
+    }
+    // "WAVE" read as a little-endian 32-bit integer.
+    if (format != 0x45564157) {
+      printf("Expected format WAVE. Given: 0x%08x\n", format);
+      return false;
+    }
+
+    // "fmt " read as a little-endian 32-bit integer.
+    if (subchunk1_id != 0x20746d66) {
+      printf("Expected subchunk1_id 0x20746d66. Given: 0x%08x\n",
+             subchunk1_id);
+      return false;
+    }
+
+    if (subchunk1_size != 16) {  // 16 for PCM
+      printf("Expected subchunk1_size 16. Given: %d\n",
+             subchunk1_size);
+      return false;
+    }
+
+    if (audio_format != 1) {  // 1 for PCM
+      printf("Expected audio_format 1. Given: %d\n", audio_format);
+      return false;
+    }
+
+    if (num_channels != 1) {  // we support only single channel for now
+      printf("Expected single channel. Given: %d\n", num_channels);
+      return false;
+    }
+
+    // Computed in 64 bits so that a hostile sample_rate cannot overflow the
+    // product before it is compared.
+    const int64_t expected_byte_rate =
+        static_cast<int64_t>(sample_rate) * num_channels * bits_per_sample / 8;
+    if (byte_rate != expected_byte_rate) {
+      printf("Expected byte_rate %lld. Given: %d\n",
+             static_cast<long long>(expected_byte_rate), byte_rate);
+      return false;
+    }
+
+    const int expected_block_align = num_channels * bits_per_sample / 8;
+    if (block_align != expected_block_align) {
+      printf("Expected block_align %d. Given: %d\n", expected_block_align,
+             block_align);
+      return false;
+    }
+
+    if (bits_per_sample != 16) {  // we support only 16 bits per sample
+      printf("Expected bits_per_sample 16. Given: %d\n",
+             bits_per_sample);
+      return false;
+    }
+    return true;
+  }
+
+  // Skips chunks that follow the "fmt " chunk until the current chunk is
+  // "data" or the stream fails.  On each step the payload of the current
+  // chunk is skipped and the next chunk's id and size are read into
+  // subchunk2_id / subchunk2_size.  Callers must test the stream afterwards.
+  // See https://en.wikipedia.org/wiki/WAV#Metadata and
+  // https://www.robotplanet.dk/audio/wav_meta_data/riff_mci.pdf
+  void SeekToDataChunk(std::istream &is) {
+    // "data" read as a little-endian 32-bit integer.
+    while (is && subchunk2_id != 0x61746164) {
+      // A negative size would seek backwards and re-read the same chunk
+      // header forever; treat it as a corrupt stream instead.
+      if (subchunk2_size < 0) {
+        is.setstate(std::istream::failbit);
+        return;
+      }
+      is.seekg(subchunk2_size, std::istream::cur);
+      is.read(reinterpret_cast<char *>(&subchunk2_id), sizeof(int32_t));
+      is.read(reinterpret_cast<char *>(&subchunk2_size), sizeof(int32_t));
+    }
+  }
+
+  int32_t chunk_id;
+  int32_t chunk_size;
+  int32_t format;
+  int32_t subchunk1_id;
+  int32_t subchunk1_size;
+  int16_t audio_format;
+  int16_t num_channels;
+  int32_t sample_rate;
+  int32_t byte_rate;
+  int16_t block_align;
+  int16_t bits_per_sample;
+  int32_t subchunk2_id;    // a tag of this chunk
+  int32_t subchunk2_size;  // size of subchunk2
+};
+static_assert(sizeof(WaveHeader) == WAV_HEADER_SIZE, "");
class AudioWindow {
private:
@@ -56,7 +141,7 @@
float frame_length = 400;
float frame_shift = 160;
float num_new_samples =
- ceil((num_samples - 400) / frame_shift) * frame_shift + frame_length;
+ ceil((num_samples - frame_length) / frame_shift) * frame_shift + frame_length;
end = start + num_new_samples;
len = (int)num_new_samples;
@@ -111,62 +196,95 @@
void Audio::disp()
{
- printf("Audio time is %f s. len is %d\n", (float)speech_len / 16000,
+ printf("Audio time is %f s. len is %d\n", (float)speech_len / model_sample_rate,
speech_len);
}
float Audio::get_time_len()
{
- return (float)speech_len / 16000;
- //speech_len);
+ return (float)speech_len / model_sample_rate;
}
-bool Audio::loadwav(const char *filename)
+// Resamples `waveform` (`n` samples at `sampling_rate` Hz) to
+// model_sample_rate and stores the result in speech_data / speech_len,
+// releasing whatever speech_data pointed to before.
+// `waveform` may be speech_data itself: it is fully consumed before the old
+// buffer is freed.  On invalid input or allocation failure an error is
+// printed and speech_data / speech_len are left untouched.
+void Audio::wavResample(int32_t sampling_rate, const float *waveform,
+                        int32_t n)
{
+    // The rate usually comes straight from a file header, so it cannot be
+    // trusted; LinearResample only asserts on it.
+    if (sampling_rate <= 0 || waveform == NULL || n < 0) {
+        fprintf(stderr, "Cannot resample: invalid input (sampling rate %d, %d samples)\n",
+                sampling_rate, n);
+        return;
+    }
+    printf(
+        "Creating a resampler:\n"
+        " in_sample_rate: %d\n"
+        " output_sample_rate: %d\n",
+        sampling_rate, static_cast<int32_t>(model_sample_rate));
+    // Cut off just below the lower of the two Nyquist frequencies.
+    float min_freq =
+        std::min<int32_t>(sampling_rate, model_sample_rate);
+    float lowpass_cutoff = 0.99 * 0.5 * min_freq;
+    int32_t lowpass_filter_width = 6;
+    LinearResample resampler(
+        sampling_rate, model_sample_rate, lowpass_cutoff, lowpass_filter_width);
+    std::vector<float> samples;
+    // flush == true: the whole signal is available, emit every output sample.
+    resampler.Resample(waveform, n, true, &samples);
+
+    // Allocate the new buffer before releasing the old one so a failed
+    // allocation cannot leave speech_data dangling.
+    float *resampled = (float*)malloc(sizeof(float) * samples.size());
+    if (resampled == NULL && !samples.empty()) {
+        fprintf(stderr, "Failed to allocate %zu resampled samples\n", samples.size());
+        return;
+    }
+    if (resampled != NULL) {
+        std::copy(samples.begin(), samples.end(), resampled);
+    }
+    if (speech_data != NULL) {
+        free(speech_data);
+    }
+    speech_data = resampled;
+    speech_len = static_cast<int>(samples.size());
+}
+
+bool Audio::loadwav(const char *filename, int32_t* sampling_rate)
+{
+ WaveHeader header;
if (speech_data != NULL) {
free(speech_data);
}
if (speech_buff != NULL) {
free(speech_buff);
}
-
+
offset = 0;
-
- FILE *fp;
- fp = fopen(filename, "rb");
- if (fp == nullptr)
+ std::ifstream is(filename, std::ifstream::binary);
+ is.read(reinterpret_cast<char *>(&header), sizeof(header));
+ if(!is){
+ fprintf(stderr, "Failed to read %s\n", filename);
return false;
- fseek(fp, 0, SEEK_END); /*瀹氫綅鍒版枃浠舵湯灏�*/
- uint32_t nFileLen = ftell(fp); /*寰楀埌鏂囦欢澶у皬*/
- fseek(fp, 44, SEEK_SET); /*璺宠繃wav鏂囦欢澶�*/
-
- speech_len = (nFileLen - 44) / 2;
- speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
- speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_align_len);
+ }
+
+ *sampling_rate = header.sample_rate;
+ // header.subchunk2_size contains the number of bytes in the data.
+ // As we assume each sample contains two bytes, so it is divided by 2 here
+ speech_len = header.subchunk2_size / 2;
+ speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_len);
if (speech_buff)
{
- memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
- int ret = fread(speech_buff, sizeof(int16_t), speech_len, fp);
- fclose(fp);
+ memset(speech_buff, 0, sizeof(int16_t) * speech_len);
+ is.read(reinterpret_cast<char *>(speech_buff), header.subchunk2_size);
+ if (!is) {
+ fprintf(stderr, "Failed to read %s\n", filename);
+ return false;
+ }
+ speech_data = (float*)malloc(sizeof(float) * speech_len);
+ memset(speech_data, 0, sizeof(float) * speech_len);
- speech_data = (float*)malloc(sizeof(float) * speech_align_len);
- memset(speech_data, 0, sizeof(float) * speech_align_len);
- int i;
float scale = 1;
-
if (data_type == 1) {
scale = 32768;
}
-
- for (i = 0; i < speech_len; i++) {
+ for (int32_t i = 0; i != speech_len; ++i) {
speech_data[i] = (float)speech_buff[i] / scale;
+ }
+
+ //resample
+ if(*sampling_rate != model_sample_rate){
+ wavResample(*sampling_rate, speech_data, speech_len);
}
AudioFrame* frame = new AudioFrame(speech_len);
frame_queue.push(frame);
-
return true;
}
@@ -174,57 +292,54 @@
return false;
}
-
-bool Audio::loadwav(const char* buf, int nFileLen)
+// Parses an in-memory WAV image: copies the 44-byte header out of `buf`,
+// converts the 16-bit PCM payload that follows it to float and resamples it
+// to model_sample_rate when the header reports a different rate.
+// On success *sampling_rate holds the rate stored in the header.
+// Returns false when `buf` cannot hold a header or an allocation fails.
+bool Audio::loadwav(const char* buf, int nFileLen, int32_t* sampling_rate)
{
-
-
-
+    WaveHeader header;
+    // Reject buffers that cannot hold the header before touching any state.
+    if (buf == NULL || nFileLen < WAV_HEADER_SIZE) {
+        fprintf(stderr, "WAV buffer too short: %d bytes\n", nFileLen);
+        return false;
+    }
if (speech_data != NULL) {
free(speech_data);
}
if (speech_buff != NULL) {
free(speech_buff);
}
-
offset = 0;
- size_t nOffset = 0;
+    std::memcpy(&header, buf, sizeof(header));
-#define WAV_HEADER_SIZE 44
-
- speech_len = (nFileLen - WAV_HEADER_SIZE) / 2;
- speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
- speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len);
+    *sampling_rate = header.sample_rate;
+    speech_len = header.subchunk2_size / 2;
+    // The header is untrusted: never copy more samples than buf really holds.
+    const int available_samples = (nFileLen - WAV_HEADER_SIZE) / 2;
+    if (speech_len < 0 || speech_len > available_samples) {
+        speech_len = available_samples;
+    }
+    speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_len);
if (speech_buff)
{
- memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
+        memset(speech_buff, 0, sizeof(int16_t) * speech_len);
memcpy((void*)speech_buff, (const void*)(buf + WAV_HEADER_SIZE), speech_len * sizeof(int16_t));
+        speech_data = (float*)malloc(sizeof(float) * speech_len);
+        if (speech_data == NULL) {
+            return false;
+        }
+        memset(speech_data, 0, sizeof(float) * speech_len);
- speech_data = (float*)malloc(sizeof(float) * speech_align_len);
- memset(speech_data, 0, sizeof(float) * speech_align_len);
- int i;
float scale = 1;
-
if (data_type == 1) {
scale = 32768;
}
- for (i = 0; i < speech_len; i++) {
+        for (int32_t i = 0; i != speech_len; ++i) {
speech_data[i] = (float)speech_buff[i] / scale;
}
+
+        // Bring the signal to the rate the model expects.
+        if(*sampling_rate != model_sample_rate){
+            wavResample(*sampling_rate, speech_data, speech_len);
+        }
+        AudioFrame* frame = new AudioFrame(speech_len);
+        frame_queue.push(frame);
return true;
}
else
return false;
-
}
-
-bool Audio::loadpcmwav(const char* buf, int nBufLen)
+bool Audio::loadpcmwav(const char* buf, int nBufLen, int32_t* sampling_rate)
{
if (speech_data != NULL) {
free(speech_data);
@@ -234,32 +349,28 @@
}
offset = 0;
- size_t nOffset = 0;
-
-
-
speech_len = nBufLen / 2;
- speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
- speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len);
+ speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
if (speech_buff)
{
- memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
+ memset(speech_buff, 0, sizeof(int16_t) * speech_len);
memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t));
+ speech_data = (float*)malloc(sizeof(float) * speech_len);
+ memset(speech_data, 0, sizeof(float) * speech_len);
- speech_data = (float*)malloc(sizeof(float) * speech_align_len);
- memset(speech_data, 0, sizeof(float) * speech_align_len);
-
-
- int i;
float scale = 1;
-
if (data_type == 1) {
scale = 32768;
}
- for (i = 0; i < speech_len; i++) {
+ for (int32_t i = 0; i != speech_len; ++i) {
speech_data[i] = (float)speech_buff[i] / scale;
+ }
+
+ //resample
+ if(*sampling_rate != model_sample_rate){
+ wavResample(*sampling_rate, speech_data, speech_len);
}
AudioFrame* frame = new AudioFrame(speech_len);
@@ -269,13 +380,10 @@
}
else
return false;
-
-
}
-bool Audio::loadpcmwav(const char* filename)
+bool Audio::loadpcmwav(const char* filename, int32_t* sampling_rate)
{
-
if (speech_data != NULL) {
free(speech_data);
}
@@ -293,34 +401,31 @@
fseek(fp, 0, SEEK_SET);
speech_len = (nFileLen) / 2;
- speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
- speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len);
+ speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len);
if (speech_buff)
{
- memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
+ memset(speech_buff, 0, sizeof(int16_t) * speech_len);
int ret = fread(speech_buff, sizeof(int16_t), speech_len, fp);
fclose(fp);
- speech_data = (float*)malloc(sizeof(float) * speech_align_len);
- memset(speech_data, 0, sizeof(float) * speech_align_len);
+ speech_data = (float*)malloc(sizeof(float) * speech_len);
+ memset(speech_data, 0, sizeof(float) * speech_len);
-
-
- int i;
float scale = 1;
-
if (data_type == 1) {
scale = 32768;
}
-
- for (i = 0; i < speech_len; i++) {
+ for (int32_t i = 0; i != speech_len; ++i) {
speech_data[i] = (float)speech_buff[i] / scale;
}
+ //resample
+ if(*sampling_rate != model_sample_rate){
+ wavResample(*sampling_rate, speech_data, speech_len);
+ }
AudioFrame* frame = new AudioFrame(speech_len);
frame_queue.push(frame);
-
return true;
}
@@ -328,7 +433,6 @@
return false;
}
-
int Audio::fetch_chunck(float *&dout, int len)
{
diff --git a/funasr/runtime/onnxruntime/src/CMakeLists.txt b/funasr/runtime/onnxruntime/src/CMakeLists.txt
index c07aac5..d41fcd0 100644
--- a/funasr/runtime/onnxruntime/src/CMakeLists.txt
+++ b/funasr/runtime/onnxruntime/src/CMakeLists.txt
@@ -1,5 +1,6 @@
file(GLOB files1 "*.cpp")
+file(GLOB files2 "*.cc")
file(GLOB files4 "paraformer/*.cpp")
set(files ${files1} ${files2} ${files3} ${files4})
diff --git a/funasr/runtime/onnxruntime/src/Vocab.cpp b/funasr/runtime/onnxruntime/src/Vocab.cpp
index af6312b..b54a6c6 100644
--- a/funasr/runtime/onnxruntime/src/Vocab.cpp
+++ b/funasr/runtime/onnxruntime/src/Vocab.cpp
@@ -13,21 +13,6 @@
{
ifstream in(filename);
loadVocabFromYaml(filename);
-
- /*
- string line;
- if (in) // 鏈夎鏂囦欢
- {
- while (getline(in, line)) // line涓笉鍖呮嫭姣忚鐨勬崲琛岀
- {
- vocab.push_back(line);
- }
- }
- else{
- printf("Cannot load vocab from: %s, there must be file vocab.txt", filename);
- exit(-1);
- }
- */
}
Vocab::~Vocab()
{
diff --git a/funasr/runtime/onnxruntime/src/libfunasrapi.cpp b/funasr/runtime/onnxruntime/src/libfunasrapi.cpp
index 0d77d20..a2ecf10 100644
--- a/funasr/runtime/onnxruntime/src/libfunasrapi.cpp
+++ b/funasr/runtime/onnxruntime/src/libfunasrapi.cpp
@@ -17,8 +17,9 @@
if (!pRecogObj)
return nullptr;
+ int32_t sampling_rate = -1;
Audio audio(1);
- if (!audio.loadwav(szBuf, nLen))
+ if (!audio.loadwav(szBuf, nLen, &sampling_rate))
return nullptr;
//audio.split();
@@ -41,14 +42,14 @@
return pResult;
}
- _FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, FUNASR_MODE Mode, QM_CALLBACK fnCallback)
+ _FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback)
{
Model* pRecogObj = (Model*)handle;
if (!pRecogObj)
return nullptr;
Audio audio(1);
- if (!audio.loadpcmwav(szBuf, nLen))
+ if (!audio.loadpcmwav(szBuf, nLen, &sampling_rate))
return nullptr;
//audio.split();
@@ -71,14 +72,14 @@
return pResult;
}
- _FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, FUNASR_MODE Mode, QM_CALLBACK fnCallback)
+ _FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback)
{
Model* pRecogObj = (Model*)handle;
if (!pRecogObj)
return nullptr;
Audio audio(1);
- if (!audio.loadpcmwav(szFileName))
+ if (!audio.loadpcmwav(szFileName, &sampling_rate))
return nullptr;
//audio.split();
@@ -106,9 +107,10 @@
Model* pRecogObj = (Model*)handle;
if (!pRecogObj)
return nullptr;
-
+
+ int32_t sampling_rate = -1;
Audio audio(1);
- if(!audio.loadwav(szWavfile))
+ if(!audio.loadwav(szWavfile, &sampling_rate))
return nullptr;
//audio.split();
diff --git a/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp b/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp
index 678cdf6..0d9c658 100644
--- a/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp
+++ b/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp
@@ -70,7 +70,6 @@
void ModelImp::reset()
{
- printf("Not Imp!!!!!!\n");
}
void ModelImp::apply_lfr(Tensor<float>*& din)
diff --git a/funasr/runtime/onnxruntime/src/precomp.h b/funasr/runtime/onnxruntime/src/precomp.h
index 678a3e4..3aeed14 100644
--- a/funasr/runtime/onnxruntime/src/precomp.h
+++ b/funasr/runtime/onnxruntime/src/precomp.h
@@ -44,6 +44,7 @@
#include "FeatureQueue.h"
#include "SpeechWrap.h"
#include <Audio.h>
+#include "resample.h"
#include "Model.h"
#include "paraformer_onnx.h"
#include "libfunasrapi.h"
diff --git a/funasr/runtime/onnxruntime/src/resample.cc b/funasr/runtime/onnxruntime/src/resample.cc
new file mode 100644
index 0000000..0238752
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/resample.cc
@@ -0,0 +1,305 @@
+/**
+ * Copyright 2013 Pegah Ghahremani
+ * 2014 IMSL, PKU-HKUST (author: Wei Shi)
+ * 2014 Yanqing Sun, Junjie Wang
+ * 2014 Johns Hopkins University (author: Daniel Povey)
+ * Copyright 2023 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// this file is copied and modified from
+// kaldi/src/feat/resample.cc
+
+#include "resample.h"
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include <cstdlib>
+#include <type_traits>
+
+#ifndef M_2PI
+#define M_2PI 6.283185307179586476925286766559005
+#endif
+
+#ifndef M_PI
+#define M_PI 3.1415926535897932384626433832795
+#endif
+
+// Greatest common divisor of two integers, returned as a non-negative value.
+// When exactly one argument is zero, the absolute value of the other one is
+// returned.  When both are zero the GCD is undefined: an error is printed and
+// the process exits with status -1.
+// Adapted from kaldi/src/base/kaldi-math.h.
+template <class I>
+I Gcd(I m, I n) {
+  static_assert(std::is_integral<I>::value, "");
+  if (m == 0 && n == 0) {
+    fprintf(stderr, "Undefined GCD since m = 0, n = 0.\n");
+    exit(-1);
+  }
+  // Euclid's algorithm; the sign of the result is fixed up at the end.
+  while (n != 0) {
+    const I remainder = m % n;
+    m = n;
+    n = remainder;
+  }
+  return m > 0 ? m : -m;
+}
+
+/// Least common multiple of two strictly positive integers.
+/// Asserts that both inputs are positive.
+/// Adapted from kaldi/src/base/kaldi-math.h.
+template <class I>
+I Lcm(I m, I n) {
+  assert(m > 0 && n > 0);
+  const I divisor = Gcd(m, n);
+  const I m_reduced = m / divisor;
+  const I n_reduced = n / divisor;
+  return divisor * m_reduced * n_reduced;
+}
+
+// Inner product of the first n elements of a and b, accumulated from the
+// first element to the last in single precision.
+static float DotProduct(const float *a, const float *b, int32_t n) {
+  float acc = 0;
+  for (const float *const a_end = a + n; a != a_end; ++a, ++b) {
+    acc += *a * *b;
+  }
+  return acc;
+}
+
+// Stores the conversion parameters, derives how many input and output
+// samples make up one repeating unit, precomputes the filter weights for
+// that unit and clears the streaming state.
+LinearResample::LinearResample(int32_t samp_rate_in_hz,
+                               int32_t samp_rate_out_hz, float filter_cutoff_hz,
+                               int32_t num_zeros)
+    : samp_rate_in_(samp_rate_in_hz),
+      samp_rate_out_(samp_rate_out_hz),
+      filter_cutoff_(filter_cutoff_hz),
+      num_zeros_(num_zeros) {
+  assert(samp_rate_in_hz > 0.0 && samp_rate_out_hz > 0.0 &&
+         filter_cutoff_hz > 0.0 && filter_cutoff_hz * 2 <= samp_rate_in_hz &&
+         filter_cutoff_hz * 2 <= samp_rate_out_hz && num_zeros > 0);
+
+  // The gcd of the two rates is the frequency at which the input and output
+  // sample grids line up again, i.e. the rate of the repeating unit.
+  const int32_t common_rate = Gcd(samp_rate_in_, samp_rate_out_);
+  output_samples_in_unit_ = samp_rate_out_ / common_rate;
+  input_samples_in_unit_ = samp_rate_in_ / common_rate;
+
+  SetIndexesAndWeights();
+  Reset();
+}
+
+// Precomputes, for every output sample of one repeating unit, the index of
+// the first contributing input sample (first_index_) and the filter weight
+// of each contributing input sample (weights_).
+void LinearResample::SetIndexesAndWeights() {
+  first_index_.resize(output_samples_in_unit_);
+  weights_.resize(output_samples_in_unit_);
+
+  // Distance in seconds from the filter centre to the edge of its support.
+  const double half_window = num_zeros_ / (2.0 * filter_cutoff_);
+
+  for (int32_t out = 0; out < output_samples_in_unit_; ++out) {
+    const double out_time = out / static_cast<double>(samp_rate_out_);
+    const double window_begin = out_time - half_window;
+    const double window_end = out_time + half_window;
+    // ceil on the lower edge and floor on the upper edge keep only the input
+    // samples that lie inside the window.  When an edge falls exactly on a
+    // sample, that sample is included with a zero coefficient, which costs a
+    // little efficiency but nothing else.
+    const int32_t first_in = ceil(window_begin * samp_rate_in_);
+    const int32_t last_in = floor(window_end * samp_rate_in_);
+    const int32_t tap_count = last_in - first_in + 1;
+
+    first_index_[out] = first_in;
+    std::vector<float> &taps = weights_[out];
+    taps.resize(tap_count);
+    for (int32_t k = 0; k < tap_count; ++k) {
+      const double in_time =
+          (first_in + k) / static_cast<double>(samp_rate_in_);
+      // The filter is symmetric, so the sign of the offset is irrelevant.
+      taps[k] = FilterFunc(in_time - out_time) / samp_rate_in_;
+    }
+  }
+}
+
+/** Evaluates the windowed low-pass filter at offset t.
+    t is a time in seconds measured from the centre of the filter.  The
+    result is the product of a sinc low-pass filter with cutoff
+    filter_cutoff_ and a raised-cosine (Hanning) window that is zero for
+    |t| >= num_zeros_ / (2 * filter_cutoff_).
+*/
+float LinearResample::FilterFunc(float t) const {
+  const bool inside_window = fabs(t) < num_zeros_ / (2.0 * filter_cutoff_);
+  // Raised-cosine window; zero outside its support.
+  const float window =
+      inside_window
+          ? 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t))
+          : 0.0;
+  // Sinc filter; at t == 0 the limit of the function is used.
+  const float sinc = (t != 0)
+                         ? sin(M_2PI * filter_cutoff_ * t) / (M_PI * t)
+                         : 2 * filter_cutoff_;
+  return sinc * window;
+}
+
+// Forgets all streaming state: the stored input tail and both running
+// sample offsets.
+void LinearResample::Reset() {
+  input_remainder_.clear();
+  output_sample_offset_ = 0;
+  input_sample_offset_ = 0;
+}
+
+// Resamples one chunk of input.
+//   input, input_dim: the new input samples and their count.
+//   flush: forwarded to GetNumOutputSamples(); when true the internal state
+//          is reset after this call, otherwise the tail of the input and the
+//          running sample offsets are kept for the next call.
+//   output: resized to the number of output samples produced by this call
+//           and filled with them.
+void LinearResample::Resample(const float *input, int32_t input_dim, bool flush,
+ std::vector<float> *output) {
+ // Totals are counted from the start of the signal, not of this chunk.
+ int64_t tot_input_samp = input_sample_offset_ + input_dim,
+ tot_output_samp = GetNumOutputSamples(tot_input_samp, flush);
+
+ assert(tot_output_samp >= output_sample_offset_);
+
+ output->resize(tot_output_samp - output_sample_offset_);
+
+ // samp_out is the index into the total output signal, not just the part
+ // of it we are producing here.
+ for (int64_t samp_out = output_sample_offset_; samp_out < tot_output_samp;
+ samp_out++) {
+ int64_t first_samp_in;
+ int32_t samp_out_wrapped;
+ GetIndexes(samp_out, &first_samp_in, &samp_out_wrapped);
+ const std::vector<float> &weights = weights_[samp_out_wrapped];
+ // first_input_index is the first index into "input" that we have a weight
+ // for.
+ int32_t first_input_index =
+ static_cast<int32_t>(first_samp_in - input_sample_offset_);
+ float this_output;
+ // Fast path: every weighted sample lies inside the current chunk.
+ if (first_input_index >= 0 &&
+ first_input_index + static_cast<int32_t>(weights.size()) <= input_dim) {
+ this_output =
+ DotProduct(input + first_input_index, weights.data(), weights.size());
+ } else { // Handle edge cases.
+ this_output = 0.0;
+ for (int32_t i = 0; i < static_cast<int32_t>(weights.size()); i++) {
+ float weight = weights[i];
+ int32_t input_index = first_input_index + i;
+ // Negative indices refer to samples from earlier chunks; they are read
+ // from the end of input_remainder_ when it reaches back far enough.
+ if (input_index < 0 &&
+ static_cast<int32_t>(input_remainder_.size()) + input_index >= 0) {
+ this_output +=
+ weight * input_remainder_[input_remainder_.size() + input_index];
+ } else if (input_index >= 0 && input_index < input_dim) {
+ this_output += weight * input[input_index];
+ } else if (input_index >= input_dim) {
+ // We're past the end of the input and are adding zero; should only
+ // happen if the user specified flush == true, or else we would not
+ // be trying to output this sample.
+ assert(flush);
+ }
+ }
+ }
+ int32_t output_index =
+ static_cast<int32_t>(samp_out - output_sample_offset_);
+ (*output)[output_index] = this_output;
+ }
+
+ if (flush) {
+ Reset(); // Reset the internal state.
+ } else {
+ // Keep the tail of this chunk and advance the running offsets so the
+ // next call continues where this one stopped.
+ SetRemainder(input, input_dim);
+ input_sample_offset_ = tot_input_samp;
+ output_sample_offset_ = tot_output_samp;
+ }
+}
+
+// Returns how many output samples can be produced from the first
+// input_num_samp input samples of a signal.  When flush is false the usable
+// interval is shortened by the filter's half-width, because samples near the
+// end still depend on input that has not arrived yet.
+int64_t LinearResample::GetNumOutputSamples(int64_t input_num_samp,
+                                            bool flush) const {
+  // For exact computation, we measure time in "ticks" of 1.0 / tick_freq,
+  // where tick_freq is the least common multiple of samp_rate_in_ and
+  // samp_rate_out_.  The LCM of two 32-bit rates does not always fit in 32
+  // bits, so all tick arithmetic is done in 64 bits.
+  const int64_t tick_freq = Lcm<int64_t>(samp_rate_in_, samp_rate_out_);
+  const int64_t ticks_per_input_period = tick_freq / samp_rate_in_;
+
+  // work out the number of ticks in the time interval
+  // [ 0, input_num_samp/samp_rate_in_ ).
+  int64_t interval_length_in_ticks = input_num_samp * ticks_per_input_period;
+  if (!flush) {
+    float window_width = num_zeros_ / (2.0 * filter_cutoff_);
+    // To count the window-width in ticks we take the floor.  We are looking
+    // for the largest integer num-out-samp that fits in an interval that is
+    // open on the right, so shortening the interval by less than a tick
+    // never makes a difference.  For example, the largest integer in
+    // [ 0, 2 ) and the largest integer in [ 0, 2 - 0.9 ) are both one.
+    int64_t window_width_ticks = floor(window_width * tick_freq);
+    // The window-width is the distance from the centre to the edge of the
+    // windowing function.
+    interval_length_in_ticks -= window_width_ticks;
+  }
+  if (interval_length_in_ticks <= 0) return 0;
+
+  const int64_t ticks_per_output_period = tick_freq / samp_rate_out_;
+  // Get the last output-sample in the closed interval, i.e. replacing [ ) with
+  // [ ]. Note: integer division rounds down. See
+  // http://en.wikipedia.org/wiki/Interval_(mathematics) for an explanation of
+  // the notation.
+  int64_t last_output_samp = interval_length_in_ticks / ticks_per_output_period;
+  // We need the last output-sample in the open interval, so if it takes us to
+  // the end of the interval exactly, subtract one.
+  if (last_output_samp * ticks_per_output_period == interval_length_in_ticks)
+    last_output_samp--;
+
+  // First output-sample index is zero, so the number of output samples
+  // is the last output-sample plus one.
+  return last_output_samp + 1;
+}
+
+// Maps an absolute output sample index to the index of the first input
+// sample that contributes to it (*first_samp_in) and to its position inside
+// the repeating unit (*samp_out_wrapped), which selects the weight set.
+void LinearResample::GetIndexes(int64_t samp_out, int64_t *first_samp_in,
+                                int32_t *samp_out_wrapped) const {
+  // A unit is the smallest nonzero amount of time that is an exact multiple
+  // of both the input and the output sample period.
+  const int64_t unit = samp_out / output_samples_in_unit_;
+  const int32_t phase =
+      static_cast<int32_t>(samp_out % output_samples_in_unit_);
+  *samp_out_wrapped = phase;
+  *first_samp_in = first_index_[phase] + unit * input_samples_in_unit_;
+}
+
+// Stores the most recent input history in input_remainder_ so that the next
+// Resample() call can read samples that precede its own chunk.  The history
+// is taken from the end of `input` and, where `input` is too short, from the
+// end of the previous remainder.
+void LinearResample::SetRemainder(const float *input, int32_t input_dim) {
+  const std::vector<float> previous(input_remainder_);
+  const int32_t previous_size = static_cast<int32_t>(previous.size());
+  // max_remainder_needed is the width of the filter from side to side,
+  // measured in input samples.  Half of that might seem enough, but output
+  // samples can lie "in the past" relative to the start of the latest
+  // input, and storing more remainder than needed is not harmful.
+  int32_t max_remainder_needed =
+      ceil(samp_rate_in_ * num_zeros_ / filter_cutoff_);
+  input_remainder_.resize(max_remainder_needed);
+  const int32_t keep = static_cast<int32_t>(input_remainder_.size());
+  for (int32_t slot = 0; slot < keep; ++slot) {
+    // Position of this history sample relative to the start of `input`;
+    // a negative value means it predates the current chunk.
+    const int32_t input_index = slot - keep + input_dim;
+    if (input_index >= 0) {
+      input_remainder_[slot] = input[input_index];
+    } else if (input_index + previous_size >= 0) {
+      input_remainder_[slot] = previous[input_index + previous_size];
+    }
+    // Otherwise the slot keeps whatever resize() left in it.
+  }
+}
diff --git a/funasr/runtime/onnxruntime/src/resample.h b/funasr/runtime/onnxruntime/src/resample.h
new file mode 100644
index 0000000..b9a283a
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/resample.h
@@ -0,0 +1,137 @@
+/**
+ * Copyright 2013 Pegah Ghahremani
+ * 2014 IMSL, PKU-HKUST (author: Wei Shi)
+ * 2014 Yanqing Sun, Junjie Wang
+ * 2014 Johns Hopkins University (author: Daniel Povey)
+ * Copyright 2023 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// this file is copied and modified from
+// kaldi/src/feat/resample.h
+
+#include <cstdint>
+#include <vector>
+
+
+/*
+ We require that the input and output sampling rate be specified as
+ integers, as this is an easy way to specify that their ratio be rational.
+*/
+
+class LinearResample {
+ public:
+ /// Constructor. We make the input and output sample rates integers, because
+ /// we are going to need to find a common divisor. This should just remind
+ /// you that they need to be integers. The filter cutoff needs to be less
+ /// than samp_rate_in_hz/2 and less than samp_rate_out_hz/2. num_zeros
+ /// controls the sharpness of the filter, more == sharper but less efficient.
+ /// We suggest around 4 to 10 for normal use.
+ LinearResample(int32_t samp_rate_in_hz, int32_t samp_rate_out_hz,
+ float filter_cutoff_hz, int32_t num_zeros);
+
+ /// Calling the function Reset() resets the state of the object prior to
+ /// processing a new signal; it is only necessary if you have called
+ /// Resample(x, x_size, false, y) for some signal, leading to a remainder of
+ /// the signal being stored, but then abandoned processing the signal before
+ /// calling Resample(x, x_size, true, y) for the last piece. Calling it
+ /// unnecessarily between signals will not do any harm.
+ void Reset();
+
+ /// This function does the resampling. If you call it with flush == true and
+ /// you have never called it with flush == false, it just resamples the input
+ /// signal (it resizes the output to a suitable number of samples).
+ ///
+ /// You can also use this function to process a signal a piece at a time.
+ /// Suppose you break it into piece1, piece2, ... pieceN. You can call
+ /// \code{.cc}
+ /// Resample(piece1, piece1_size, false, &output1);
+ /// Resample(piece2, piece2_size, false, &output2);
+ /// Resample(piece3, piece3_size, true, &output3);
+ /// \endcode
+ /// If you call it with flush == false, it won't output the last few samples
+ /// but will remember them, so that if you later give it a second piece of
+ /// the input signal it can process it correctly.
+ /// If your most recent call to the object was with flush == false, it will
+ /// have internal state; you can remove this by calling Reset().
+ /// Empty input is acceptable.
+ void Resample(const float *input, int32_t input_dim, bool flush,
+ std::vector<float> *output);
+
+ /// Return the input and output sampling rates (for checks, for example).
+ int32_t GetInputSamplingRate() const { return samp_rate_in_; }
+ int32_t GetOutputSamplingRate() const { return samp_rate_out_; }
+
+ private:
+ void SetIndexesAndWeights();
+ /// NOTE(review): presumably the filter kernel behind weights_ -- confirm.
+ float FilterFunc(float) const;
+
+ /// This function outputs the number of output samples we will output
+ /// for a signal with "input_num_samp" input samples. If flush == true,
+ /// we return the largest n such that
+ /// (n/samp_rate_out_) is in the interval [ 0, input_num_samp/samp_rate_in_ ),
+ /// and note that the interval is half-open. If flush == false,
+ /// define window_width as num_zeros / (2.0 * filter_cutoff_);
+ /// we return the largest n such that (n/samp_rate_out_) is in the interval
+ /// [ 0, input_num_samp/samp_rate_in_ - window_width ).
+ int64_t GetNumOutputSamples(int64_t input_num_samp, bool flush) const;
+
+ /// Given an output-sample index, this function outputs to *first_samp_in the
+ /// first input-sample index that we have a weight on (may be negative),
+ /// and to *samp_out_wrapped the index into weights_ where we can get the
+ /// corresponding weights on the input.
+ inline void GetIndexes(int64_t samp_out, int64_t *first_samp_in,
+ int32_t *samp_out_wrapped) const;
+
+ void SetRemainder(const float *input, int32_t input_dim);
+
+ private:
+ // The following variables are provided by the user.
+ int32_t samp_rate_in_;
+ int32_t samp_rate_out_;
+ float filter_cutoff_;
+ int32_t num_zeros_;
+
+ int32_t input_samples_in_unit_; ///< The number of input samples in the
+ ///< smallest repeating unit: input_samples_in_unit_ =
+ ///< samp_rate_in_hz / Gcd(samp_rate_in_hz,
+ ///< samp_rate_out_hz)
+
+ int32_t output_samples_in_unit_; ///< The number of output samples in the
+ ///< smallest repeating unit: output_samples_in_unit_
+ ///< = samp_rate_out_hz /
+ ///< Gcd(samp_rate_in_hz, samp_rate_out_hz)
+
+ /// The first input-sample index that we sum over, for this output-sample
+ /// index. May be negative; any truncation at the beginning is handled
+ /// separately. This is just for the first few output samples, but we can
+ /// extrapolate the correct input-sample index for arbitrary output samples.
+ std::vector<int32_t> first_index_;
+
+ /// Weights on the input samples, for this output-sample index.
+ std::vector<std::vector<float>> weights_;
+
+ // The following variables keep track of where we are in a particular signal,
+ // if it is being provided over multiple calls to Resample().
+
+ int64_t input_sample_offset_; ///< The number of input samples we have
+ ///< already received for this signal
+ ///< (including anything in input_remainder_)
+ int64_t output_sample_offset_; ///< The number of samples we have already
+ ///< output for this signal.
+ std::vector<float> input_remainder_; ///< A small trailing part of the
+ ///< previously seen input signal.
+};
--
Gitblit v1.9.1