python/FunASR-XL.git

			@@ -193,18 +193,28 @@
			return 0;
			}

			// Audio constructor that takes only a data type.
			// NOTE(review): this excerpt looks like a diff whose +/- markers were lost. The
			// first signature line below appears to be the pre-change form and the second
			// its replacement, which additionally initializes dest_sample_rate from
			// MODEL_SAMPLE_RATE. Confirm against the real source file.
			Audio::Audio(int data_type) : data_type(data_type)
			Audio::Audio(int data_type) : dest_sample_rate(MODEL_SAMPLE_RATE), data_type(data_type)
			{
			// Start with speech_buff and speech_data set to NULL.
			speech_buff = NULL;
			speech_data = NULL;
			// Fixed alignment value used by this overload (its unit is not visible here).
			align_size = 1360;
			// Presumably samples per millisecond, assuming dest_sample_rate is in Hz - TODO confirm.
			seg_sample = dest_sample_rate / 1000;
			}

			// Audio constructor that takes the target sample rate and a data type.
			// NOTE(review): this excerpt looks like a diff whose +/- markers were lost. The
			// first signature line below (data_type, size) appears to be a pre-change line
			// and the second (model_sample_rate, data_type) the line that replaced it.
			// Confirm against the real source file.
			Audio::Audio(int data_type, int size) : data_type(data_type)
			Audio::Audio(int model_sample_rate, int data_type) : dest_sample_rate(model_sample_rate), data_type(data_type)
			{
			// Start with speech_buff and speech_data set to NULL.
			speech_buff = NULL;
			speech_data = NULL;
			// Fixed alignment value used by this overload (its unit is not visible here).
			align_size = 1360;
			// Presumably samples per millisecond, assuming dest_sample_rate is in Hz - TODO confirm.
			seg_sample = dest_sample_rate / 1000;
			}

			// Constructs an Audio object for a caller-chosen target sample rate and data
			// type, taking the alignment size from the caller instead of a built-in value.
			//   model_sample_rate: stored as dest_sample_rate.
			//   data_type:         stored as data_type.
			//   size:              converted to float and stored as align_size.
			Audio::Audio(int model_sample_rate, int data_type, int size)
				: dest_sample_rate(model_sample_rate),
				  data_type(data_type)
			{
				// Derived from the target rate; integer division by 1000.
				seg_sample = dest_sample_rate / 1000;
				align_size = static_cast<float>(size);

				// Both buffers start out unset.
				speech_data = nullptr;
				speech_buff = nullptr;
			}

			Audio::~Audio()
			@@ -222,12 +232,12 @@

			// Logs, at INFO level, the audio duration in seconds (speech_len divided by the
			// sample rate) together with the raw speech_len value.
			// NOTE(review): this excerpt looks like a diff whose +/- markers were lost. The
			// first LOG line (dividing by MODEL_SAMPLE_RATE) appears to be the pre-change
			// statement and the second (dividing by dest_sample_rate) its replacement.
			// Confirm against the real source file.
			void Audio::Disp()
			{
			LOG(INFO) << "Audio time is " << (float)speech_len / MODEL_SAMPLE_RATE << " s. len is " << speech_len;
			LOG(INFO) << "Audio time is " << (float)speech_len / dest_sample_rate << " s. len is " << speech_len;
			}

			// Returns speech_len divided by the sample rate as a float, i.e. the audio
			// length in seconds if speech_len counts samples - TODO confirm.
			// NOTE(review): this excerpt looks like a diff whose +/- markers were lost. The
			// first return (dividing by MODEL_SAMPLE_RATE) appears to be the pre-change
			// statement and the second (dividing by dest_sample_rate) its replacement.
			// Read literally, the second return would be unreachable. Confirm against the
			// real source file.
			float Audio::GetTimeLen()
			{
			return (float)speech_len / MODEL_SAMPLE_RATE;
			return (float)speech_len / dest_sample_rate;
			}

			void Audio::WavResample(int32_t sampling_rate, const float *waveform,
			@@ -235,15 +245,15 @@
			{
			LOG(INFO) << "Creating a resampler:\n"
			<< " in_sample_rate: "<< sampling_rate << "\n"
			<< " output_sample_rate: " << static_cast<int32_t>(MODEL_SAMPLE_RATE);
			<< " output_sample_rate: " << static_cast<int32_t>(dest_sample_rate);
			float min_freq =
			std::min<int32_t>(sampling_rate, MODEL_SAMPLE_RATE);
			std::min<int32_t>(sampling_rate, dest_sample_rate);
			float lowpass_cutoff = 0.99 * 0.5 * min_freq;

			int32_t lowpass_filter_width = 6;

			auto resampler = std::make_unique<LinearResample>(
			sampling_rate, MODEL_SAMPLE_RATE, lowpass_cutoff, lowpass_filter_width);
			sampling_rate, dest_sample_rate, lowpass_cutoff, lowpass_filter_width);
			std::vector<float> samples;
			resampler->Resample(waveform, n, true, &samples);
			//reset speech_data
			@@ -311,7 +321,7 @@
			nullptr, // allocate a new context
			AV_CH_LAYOUT_MONO, // output channel layout (stereo)
			AV_SAMPLE_FMT_S16, // output sample format (signed 16-bit)
			16000, // output sample rate (same as input)
			dest_sample_rate, // output sample rate (same as input)
			av_get_default_channel_layout(codecContext->channels), // input channel layout
			codecContext->sample_fmt, // input sample format
			codecContext->sample_rate, // input sample rate
			@@ -344,30 +354,28 @@
			while (avcodec_receive_frame(codecContext, frame) >= 0) {
			// Resample audio if necessary
			std::vector<uint8_t> resampled_buffer;
			int in_samples = frame->nb_samples;
			uint8_t **in_data = frame->extended_data;
			int out_samples = av_rescale_rnd(in_samples,
			16000,
			int out_samples = av_rescale_rnd(swr_get_delay(swr_ctx, codecContext->sample_rate) + frame->nb_samples,
			dest_sample_rate,
			codecContext->sample_rate,
			AV_ROUND_DOWN);

			int resampled_size = out_samples * av_get_bytes_per_sample(AV_SAMPLE_FMT_S16);
			if (resampled_buffer.size() < resampled_size) {
			resampled_buffer.resize(resampled_size);
			}
			}
			uint8_t *resampled_data = resampled_buffer.data();
			int ret = swr_convert(
			swr_ctx,
			&resampled_data, // output buffer
			resampled_size, // output buffer size
			(const uint8_t )(frame->data), //(const uint8_t )(frame->extended_data)
			in_samples // input buffer size
			out_samples, // output buffer size
			(const uint8_t **)(frame->data), // choose channel
			frame->nb_samples // input buffer size
			);
			if (ret < 0) {
			LOG(ERROR) << "Error resampling audio";
			break;
			}
			std::copy(resampled_buffer.begin(), resampled_buffer.end(), std::back_inserter(resampled_buffers));
			resampled_buffers.insert(resampled_buffers.end(), resampled_buffer.begin(), resampled_buffer.begin() + resampled_size);
			}
			}
			}
			@@ -494,7 +502,7 @@
			nullptr, // allocate a new context
			AV_CH_LAYOUT_MONO, // output channel layout (stereo)
			AV_SAMPLE_FMT_S16, // output sample format (signed 16-bit)
			16000, // output sample rate (same as input)
			dest_sample_rate, // output sample rate (same as input)
			av_get_default_channel_layout(codecContext->channels), // input channel layout
			codecContext->sample_fmt, // input sample format
			codecContext->sample_rate, // input sample rate
			@@ -529,30 +537,28 @@
			while (avcodec_receive_frame(codecContext, frame) >= 0) {
			// Resample audio if necessary
			std::vector<uint8_t> resampled_buffer;
			int in_samples = frame->nb_samples;
			uint8_t **in_data = frame->extended_data;
			int out_samples = av_rescale_rnd(in_samples,
			16000,
			int out_samples = av_rescale_rnd(swr_get_delay(swr_ctx, codecContext->sample_rate) + frame->nb_samples,
			dest_sample_rate,
			codecContext->sample_rate,
			AV_ROUND_DOWN);

			int resampled_size = out_samples * av_get_bytes_per_sample(AV_SAMPLE_FMT_S16);
			if (resampled_buffer.size() < resampled_size) {
			resampled_buffer.resize(resampled_size);
			}
			}
			uint8_t *resampled_data = resampled_buffer.data();
			int ret = swr_convert(
			swr_ctx,
			&resampled_data, // output buffer
			resampled_size, // output buffer size
			(const uint8_t )(frame->data), //(const uint8_t )(frame->extended_data)
			in_samples // input buffer size
			out_samples, // output buffer size
			(const uint8_t **)(frame->data), // choose channel: channel_data
			frame->nb_samples // input buffer size
			);
			if (ret < 0) {
			LOG(ERROR) << "Error resampling audio";
			break;
			}
			std::copy(resampled_buffer.begin(), resampled_buffer.end(), std::back_inserter(resampled_buffers));
			resampled_buffers.insert(resampled_buffers.end(), resampled_buffer.begin(), resampled_buffer.begin() + resampled_size);
			}
			}
			}
			@@ -604,7 +610,7 @@
			}


			bool Audio::LoadWav(const char filename, int32_t sampling_rate)
			bool Audio::LoadWav(const char filename, int32_t sampling_rate, bool resample)
			{
			WaveHeader header;
			if (speech_data != NULL) {
			@@ -666,7 +672,7 @@
			}

			//resample
			if(*sampling_rate != MODEL_SAMPLE_RATE){
			if(resample && *sampling_rate != dest_sample_rate){
			WavResample(*sampling_rate, speech_data, speech_len);
			}

			@@ -752,7 +758,7 @@
			}

			//resample
			if(*sampling_rate != MODEL_SAMPLE_RATE){
			if(*sampling_rate != dest_sample_rate){
			WavResample(*sampling_rate, speech_data, speech_len);
			}

			@@ -795,7 +801,7 @@
			}

			//resample
			if(*sampling_rate != MODEL_SAMPLE_RATE){
			if(*sampling_rate != dest_sample_rate){
			WavResample(*sampling_rate, speech_data, speech_len);
			}

			@@ -840,7 +846,7 @@
			}

			//resample
			if(*sampling_rate != MODEL_SAMPLE_RATE){
			if(*sampling_rate != dest_sample_rate){
			WavResample(*sampling_rate, speech_data, speech_len);
			}

			@@ -857,7 +863,7 @@
			return false;
			}

			bool Audio::LoadPcmwav(const char* filename, int32_t* sampling_rate)
			bool Audio::LoadPcmwav(const char* filename, int32_t* sampling_rate, bool resample)
			{
			if (speech_data != NULL) {
			free(speech_data);
			@@ -898,7 +904,7 @@
			}

			//resample
			if(*sampling_rate != MODEL_SAMPLE_RATE){
			if(resample && *sampling_rate != dest_sample_rate){
			WavResample(*sampling_rate, speech_data, speech_len);
			}

			@@ -1009,7 +1015,7 @@
			AudioFrame *frame = frame_queue.front();
			frame_queue.pop();

			start_time = (float)(frame->GetStart())/MODEL_SAMPLE_RATE;
			start_time = (float)(frame->GetStart())/ dest_sample_rate;
			dout = speech_data + frame->GetStart();
			len = frame->GetLen();
			delete frame;
			@@ -1193,7 +1199,7 @@
			}

			}else if(speech_end_i != -1){ // [-1,100]
			if(speech_start == -1 or speech_offline_start == -1){
			if(speech_start == -1 \|\| speech_offline_start == -1){
			LOG(ERROR) <<"Vad start is null while vad end is available. Set vad start 0" ;
			speech_start = 0;
			}
			@@ -1248,7 +1254,7 @@
			}

			// erase all_samples
			int vector_cache = MODEL_SAMPLE_RATE*2;
			int vector_cache = dest_sample_rate*2;
			if(speech_offline_start == -1){
			if(all_samples.size() > vector_cache){
			int erase_num = all_samples.size() - vector_cache;