Merge pull request #1249 from alibaba-damo-academy/main
code sync
| | |
| | | | paraformer-zh-spk <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/summary) [🤗]() ) | speech recognition with speaker diarization, with timestamps, non-streaming | 60000 hours, Mandarin | 220M | |
| | | | <nobr>paraformer-zh-online <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗]() )</nobr> | speech recognition, streaming | 60000 hours, Mandarin | 220M | |
| | | | paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗]() ) | speech recognition, with timestamps, non-streaming | 50000 hours, English | 220M | |
| | | | paraformer-en-spk <br> ([⭐]() [🤗]() ) | speech recognition with speaker diarization, non-streaming | 50000 hours, English | 220M | |
| | | | conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗]() ) | speech recognition, non-streaming | 50000 hours, English | 220M | |
| | | | ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗]() ) | punctuation restoration | 100M, Mandarin and English | 1.1G | |
| | | | fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗]() ) | voice activity detection | 5000 hours, Mandarin and English | 0.4M | |
| | |
| | | punc_model="ct-punc-c", punc_model_revision="v2.0.2", \ |
| | | spk_model="cam++", spk_model_revision="v2.0.2") |
| | | res = model(input=f"{model.model_path}/example/asr_example.wav", |
| | | batch_size=64, |
| | | hotword='魔搭') |
| | | print(res) |
| | | ``` |
| | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="fsmn-vad", model_revision="v2.0.2") |
| | | |
| | | wav_file = f"{model.model_path}/example/asr_example.wav" |
| | | res = model(input=wav_file) |
| | | print(res) |
| | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="ct-punc", model_revision="v2.0.2") |
| | | |
| | | res = model(input="那今天的会就到这里吧 happy new year 明年见") |
| | | print(res) |
| | | ``` |
| | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="fa-zh", model_revision="v2.0.2") |
| | | |
| | | wav_file = f"{model.model_path}/example/asr_example.wav" |
| | | text_file = f"{model.model_path}/example/text.txt" |
| | | res = model(input=(wav_file, text_file), data_type=("sound", "text")) |
| | | print(res) |
| | | ``` |
| | |
| | | | 模型名字 | 任务详情 | 训练数据 | 参数量 | |
| | | |:------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------:|:------------:|:----:| |
| | | | paraformer-zh <br> ([⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) [🤗]() ) | 语音识别,带时间戳输出,非实时 | 60000小时,中文 | 220M | |
| | | | paraformer-zh-spk <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/summary) [🤗]() ) | 分角色语音识别,带时间戳输出,非实时 | 60000小时,中文 | 220M | |
| | | | paraformer-zh-streaming <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗]() ) | 语音识别,实时 | 60000小时,中文 | 220M | |
| | | | paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗]() ) | 语音识别,非实时 | 50000小时,英文 | 220M | |
| | | | paraformer-en-spk <br> ([⭐]() [🤗]() ) | 语音识别,非实时 | 50000小时,英文 | 220M | |
| | | | conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗]() ) | 语音识别,非实时 | 50000小时,英文 | 220M | |
| | | | ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗]() ) | 标点恢复 | 100M,中文与英文 | 1.1G | |
| | | | fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗]() ) | 语音端点检测,实时 | 5000小时,中文与英文 | 0.4M | |
| | | | fa-zh <br> ( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗]() ) | 字级别时间戳预测 | 50000小时,中文 | 38M | |
| | | | paraformer-zh-spk <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/summary) [🤗]() ) | 分角色语音识别,带时间戳输出,非实时 | 60000小时,中文 | 220M | |
| | | | paraformer-zh-streaming <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗]() ) | 语音识别,实时 | 60000小时,中文 | 220M | |
| | | | paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗]() ) | 语音识别,非实时 | 50000小时,英文 | 220M | |
| | | | conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗]() ) | 语音识别,非实时 | 50000小时,英文 | 220M | |
| | | | ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗]() ) | 标点恢复 | 100M,中文与英文 | 1.1G | |
| | | | fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗]() ) | 语音端点检测,实时 | 5000小时,中文与英文 | 0.4M | |
| | | | fa-zh <br> ( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗]() ) | 字级别时间戳预测 | 50000小时,中文 | 38M | |
| | | |
| | | |
| | | <a name="快速开始"></a> |
| | |
| | | ### 非实时语音识别 |
| | | ```python |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="paraformer-zh") |
| | | # for the long duration wav, you could add vad model |
| | | # model = AutoModel(model="paraformer-zh", vad_model="fsmn-vad", punc_model="ct-punc") |
| | | |
| | | res = model(input="asr_example_zh.wav", batch_size=64) |
| | | # paraformer-zh is a multi-functional asr model |
| | | # use vad, punc, spk or not as you need |
| | | model = AutoModel(model="paraformer-zh", model_revision="v2.0.2", \ |
| | | vad_model="fsmn-vad", vad_model_revision="v2.0.2", \ |
| | | punc_model="ct-punc-c", punc_model_revision="v2.0.2", \ |
| | | spk_model="cam++", spk_model_revision="v2.0.2") |
| | | res = model(input=f"{model.model_path}/example/asr_example.wav", |
| | | batch_size=64, |
| | | hotword='魔搭') |
| | | print(res) |
| | | ``` |
| | | 注:`model_hub`:表示模型仓库,`ms`为选择modelscope下载,`hf`为选择huggingface下载。 |
| | |
| | | encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention |
| | | decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention |
| | | |
| | | model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.2") |
| | | |
| | | import soundfile |
| | | import os |
| | |
| | | ```python |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="ct-punc", model_revision="v2.0.2") |
| | | |
| | | res = model(input="那今天的会就到这里吧 happy new year 明年见") |
| | | print(res) |
| | |
| | | model = AutoModel(model="fa-zh", model_revision="v2.0.0") |
| | | |
| | | wav_file = f"{model.model_path}/example/asr_example.wav" |
| | | text_file = f"{model.model_path}/example/text.txt" |
| | | res = model(input=(wav_file, text_file), data_type=("sound", "text")) |
| | | print(res) |
| | | ``` |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result = FunTpassInferBuffer(tpass_handle, tpass_online_handle, speech_buff+sample_offset, step, punc_cache, is_final, |
| | | sampling_rate_, "pcm", (ASR_TYPE)asr_mode_, hotwords_embedding, true, decoder_handle); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | n_total_time += taking_micros; |
| | |
| | | GetValue(asr_mode, ASR_MODE, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = onnx_thread.getValue(); |
| | | int asr_mode_ = -1; |
| | | if(model_path[ASR_MODE] == "offline"){ |
| | |
| | | am_sc = am_scale.getValue(); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | GetValue(asr_mode, ASR_MODE, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = onnx_thread.getValue(); |
| | | int asr_mode_ = -1; |
| | | if(model_path[ASR_MODE] == "offline"){ |
| | |
| | | // init wfst decoder |
| | | FUNASR_DEC_HANDLE decoder_handle = FunASRWfstDecoderInit(tpass_handle, ASR_TWO_PASS, glob_beam, lat_beam, am_sc); |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result = FunTpassInferBuffer(tpass_handle, tpass_online_handle, |
| | | speech_buff+sample_offset, step, punc_cache, is_final, sampling_rate_, "pcm", |
| | | (ASR_TYPE)asr_mode_, hotwords_embedding, true, decoder_handle); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | |
| | |
| | | GetValue(txt_path, TXT_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE punc_hanlde=CTTransformerInit(model_path, thread_num); |
| | | |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | |
| | | long taking_micros = 0; |
| | | for(auto& txt_str : txt_list){ |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result=CTTransformerInfer(punc_hanlde, txt_str.c_str(), RASR_NONE, NULL); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result=CTTransformerInfer(punc_hanlde, txt_str.c_str(), RASR_NONE, nullptr); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | string msg = FunASRGetResult(result, 0); |
| | |
| | | // warm up |
| | | for (size_t i = 0; i < 1; i++) |
| | | { |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_handle, wav_list[0].c_str(), RASR_NONE, NULL, hotwords_embedding, audio_fs, true, decoder_handle); |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_handle, wav_list[0].c_str(), RASR_NONE, nullptr, hotwords_embedding, audio_fs, true, decoder_handle); |
| | | if(result){ |
| | | FunASRFreeResult(result); |
| | | } |
| | |
| | | break; |
| | | } |
| | | |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_handle, wav_list[i].c_str(), RASR_NONE, NULL, hotwords_embedding, audio_fs, true, decoder_handle); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_handle, wav_list[i].c_str(), RASR_NONE, nullptr, hotwords_embedding, audio_fs, true, decoder_handle); |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | n_total_time += taking_micros; |
| | |
| | | |
| | | void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path) |
| | | { |
| | | if (value_arg.isSet()){ |
| | | model_path.insert({key, value_arg.getValue()}); |
| | | LOG(INFO)<< key << " : " << value_arg.getValue(); |
| | | } |
| | | model_path.insert({key, value_arg.getValue()}); |
| | | LOG(INFO)<< key << " : " << value_arg.getValue(); |
| | | } |
| | | |
| | | int main(int argc, char *argv[]) |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_HANDLE asr_handle=FunOfflineInit(model_path, 1); |
| | | |
| | | if (!asr_handle) |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE vad_hanlde=FsmnVadInit(model_path, thread_num); |
| | | |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | for (int i = 0; i < wav_list.size(); i++) { |
| | | auto& wav_file = wav_list[i]; |
| | | auto& wav_id = wav_ids[i]; |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result=FsmnVadInfer(vad_hanlde, wav_file.c_str(), NULL, audio_fs.getValue()); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result=FsmnVadInfer(vad_hanlde, wav_file.c_str(), nullptr, audio_fs.getValue()); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | |
| | |
| | | |
| | | void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path) |
| | | { |
| | | if (value_arg.isSet()){ |
| | | model_path.insert({key, value_arg.getValue()}); |
| | | LOG(INFO)<< key << " : " << value_arg.getValue(); |
| | | } |
| | | model_path.insert({key, value_arg.getValue()}); |
| | | LOG(INFO)<< key << " : " << value_arg.getValue(); |
| | | } |
| | | |
| | | int main(int argc, char** argv) |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE asr_hanlde=FunOfflineInit(model_path, thread_num); |
| | | |
| | |
| | | LOG(INFO) << "hotword path: " << hotword_path; |
| | | funasr::ExtractHws(hotword_path, hws_map, nn_hotwords_); |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | for (int i = 0; i < wav_list.size(); i++) { |
| | | auto& wav_file = wav_list[i]; |
| | | auto& wav_id = wav_ids[i]; |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_hanlde, wav_file.c_str(), RASR_NONE, NULL, hotwords_embedding, audio_fs.getValue(), true, decoder_handle); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_hanlde, wav_file.c_str(), RASR_NONE, nullptr, hotwords_embedding, audio_fs.getValue(), true, decoder_handle); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE asr_handle=FunASRInit(model_path, thread_num, ASR_ONLINE); |
| | | |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, NULL, is_final, sampling_rate_); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, nullptr, is_final, sampling_rate_); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | |
| | |
| | | GetValue(txt_path, TXT_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE punc_hanlde=CTTransformerInit(model_path, thread_num, PUNC_ONLINE); |
| | | |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | splitString(vad_strs, txt_str, "|"); |
| | | string str_out; |
| | | FUNASR_RESULT result = nullptr; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | for(auto& vad_str:vad_strs){ |
| | | result=CTTransformerInfer(punc_hanlde, vad_str.c_str(), RASR_NONE, NULL, PUNC_ONLINE, result); |
| | | result=CTTransformerInfer(punc_hanlde, vad_str.c_str(), RASR_NONE, nullptr, PUNC_ONLINE, result); |
| | | if(result){ |
| | | string msg = CTTransformerGetResult(result, 0); |
| | | str_out += msg; |
| | | LOG(INFO)<<"Online result: "<<msg; |
| | | } |
| | | } |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO)<<"Results: "<<str_out; |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, NULL, is_final, sampling_rate_); |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, nullptr, is_final, sampling_rate_); |
| | | if (result) |
| | | { |
| | | FunASRFreeResult(result); |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, NULL, is_final, sampling_rate_); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, nullptr, is_final, sampling_rate_); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | n_total_time += taking_micros; |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_HANDLE asr_handle=FunASRInit(model_path, 1, ASR_ONLINE); |
| | | |
| | | if (!asr_handle) |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE vad_hanlde=FsmnVadInit(model_path, thread_num); |
| | | |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result = FsmnVadInferBuffer(online_hanlde, speech_buff+sample_offset, step, NULL, is_final, sampling_rate_); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result = FsmnVadInferBuffer(online_hanlde, speech_buff+sample_offset, step, nullptr, is_final, sampling_rate_); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | |
| | |
| | | void *p1; // original block |
| | | void **p2; // aligned block |
| | | int offset = alignment - 1 + sizeof(void *); |
| | | if ((p1 = (void *)malloc(required_bytes + offset)) == NULL) { |
| | | return NULL; |
| | | if ((p1 = (void *)malloc(required_bytes + offset)) == nullptr) { |
| | | return nullptr; |
| | | } |
| | | p2 = (void **)(((size_t)(p1) + offset) & ~(alignment - 1)); |
| | | p2[-1] = p1; |
| | |
| | | }; |
| | | ~AudioWindow(){ |
| | | free(window); |
| | | window = nullptr; |
| | | }; |
| | | int put(int val) |
| | | { |
| | |
| | | len = end - start; |
| | | } |
| | | AudioFrame::~AudioFrame(){ |
| | | if(data != NULL){ |
| | | if(data != nullptr){ |
| | | free(data); |
| | | data = nullptr; |
| | | } |
| | | } |
| | | int AudioFrame::SetStart(int val) |
| | |
| | | |
| | | Audio::Audio(int data_type) : dest_sample_rate(MODEL_SAMPLE_RATE), data_type(data_type) |
| | | { |
| | | speech_buff = NULL; |
| | | speech_data = NULL; |
| | | speech_buff = nullptr; |
| | | speech_data = nullptr; |
| | | align_size = 1360; |
| | | seg_sample = dest_sample_rate / 1000; |
| | | } |
| | | |
| | | Audio::Audio(int model_sample_rate, int data_type) : dest_sample_rate(model_sample_rate), data_type(data_type) |
| | | { |
| | | speech_buff = NULL; |
| | | speech_data = NULL; |
| | | speech_buff = nullptr; |
| | | speech_data = nullptr; |
| | | align_size = 1360; |
| | | seg_sample = dest_sample_rate / 1000; |
| | | } |
| | | |
| | | Audio::Audio(int model_sample_rate, int data_type, int size) : dest_sample_rate(model_sample_rate), data_type(data_type) |
| | | { |
| | | speech_buff = NULL; |
| | | speech_data = NULL; |
| | | speech_buff = nullptr; |
| | | speech_data = nullptr; |
| | | align_size = (float)size; |
| | | seg_sample = dest_sample_rate / 1000; |
| | | } |
| | | |
| | | Audio::~Audio() |
| | | { |
| | | if (speech_buff != NULL) { |
| | | if (speech_buff != nullptr) { |
| | | free(speech_buff); |
| | | speech_buff = nullptr; |
| | | } |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_char != NULL) { |
| | | if (speech_char != nullptr) { |
| | | free(speech_char); |
| | | speech_char = nullptr; |
| | | } |
| | | ClearQueue(frame_queue); |
| | | ClearQueue(asr_online_queue); |
| | |
| | | resampler->Resample(waveform, n, true, &samples); |
| | | //reset speech_data |
| | | speech_len = samples.size(); |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | |
| | | #else |
| | | // from file |
| | | AVFormatContext* formatContext = avformat_alloc_context(); |
| | | if (avformat_open_input(&formatContext, filename, NULL, NULL) != 0) { |
| | | if (avformat_open_input(&formatContext, filename, nullptr, nullptr) != 0) { |
| | | LOG(ERROR) << "Error: Could not open input file."; |
| | | avformat_close_input(&formatContext); |
| | | avformat_free_context(formatContext); |
| | | return false; |
| | | } |
| | | |
| | | if (avformat_find_stream_info(formatContext, NULL) < 0) { |
| | | if (avformat_find_stream_info(formatContext, nullptr) < 0) { |
| | | LOG(ERROR) << "Error: Could not open input file."; |
| | | avformat_close_input(&formatContext); |
| | | avformat_free_context(formatContext); |
| | | return false; |
| | | } |
| | | const AVCodec* codec = NULL; |
| | | AVCodecParameters* codecParameters = NULL; |
| | | const AVCodec* codec = nullptr; |
| | | AVCodecParameters* codecParameters = nullptr; |
| | | int audioStreamIndex = av_find_best_stream(formatContext, AVMEDIA_TYPE_AUDIO, -1, -1, &codec, 0); |
| | | if (audioStreamIndex >= 0) { |
| | | codecParameters = formatContext->streams[audioStreamIndex]->codecpar; |
| | |
| | | avcodec_free_context(&codecContext); |
| | | return false; |
| | | } |
| | | if (avcodec_open2(codecContext, codec, NULL) < 0) { |
| | | if (avcodec_open2(codecContext, codec, nullptr) < 0) { |
| | | LOG(ERROR) << "Error: Could not open audio decoder."; |
| | | avformat_close_input(&formatContext); |
| | | avformat_free_context(formatContext); |
| | |
| | | av_packet_free(&packet); |
| | | av_frame_free(&frame); |
| | | |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | free(speech_buff); |
| | | } |
| | | if (speech_char != NULL) { |
| | | if (speech_char != nullptr) { |
| | | free(speech_char); |
| | | speech_char = nullptr; |
| | | } |
| | | offset = 0; |
| | | |
| | |
| | | } |
| | | |
| | | speech_len = (resampled_buffers.size()) / 2; |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | memcpy((void*)speech_buff, (const void*)resampled_buffers.data(), speech_len * sizeof(int16_t)); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | if(speech_data){ |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | float scale = 1; |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | scale = 32768.0f; |
| | | } |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | for (int32_t i = 0; i < speech_len; ++i) { |
| | | int16_t val = (int16_t)((resampled_buffers[2 * i + 1] << 8) | resampled_buffers[2 * i]); |
| | | speech_data[i] = (float)val / scale; |
| | | } |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | | frame_queue.push(frame); |
| | | |
| | | return true; |
| | | } |
| | | else |
| | | }else{ |
| | | return false; |
| | | } |
| | | |
| | | #endif |
| | | } |
| | | |
| | |
| | | } |
| | | AVFormatContext* formatContext = avformat_alloc_context(); |
| | | formatContext->pb = avio_ctx; |
| | | if (avformat_open_input(&formatContext, "", NULL, NULL) != 0) { |
| | | if (avformat_open_input(&formatContext, "", nullptr, nullptr) != 0) { |
| | | LOG(ERROR) << "Error: Could not open input file."; |
| | | avio_context_free(&avio_ctx); |
| | | avformat_close_input(&formatContext); |
| | |
| | | return false; |
| | | } |
| | | |
| | | if (avformat_find_stream_info(formatContext, NULL) < 0) { |
| | | if (avformat_find_stream_info(formatContext, nullptr) < 0) { |
| | | LOG(ERROR) << "Error: Could not find stream information."; |
| | | avio_context_free(&avio_ctx); |
| | | avformat_close_input(&formatContext); |
| | | avformat_free_context(formatContext); |
| | | return false; |
| | | } |
| | | const AVCodec* codec = NULL; |
| | | AVCodecParameters* codecParameters = NULL; |
| | | const AVCodec* codec = nullptr; |
| | | AVCodecParameters* codecParameters = nullptr; |
| | | int audioStreamIndex = av_find_best_stream(formatContext, AVMEDIA_TYPE_AUDIO, -1, -1, &codec, 0); |
| | | if (audioStreamIndex >= 0) { |
| | | codecParameters = formatContext->streams[audioStreamIndex]->codecpar; |
| | |
| | | avcodec_free_context(&codecContext); |
| | | return false; |
| | | } |
| | | if (avcodec_open2(codecContext, codec, NULL) < 0) { |
| | | if (avcodec_open2(codecContext, codec, nullptr) < 0) { |
| | | LOG(ERROR) << "Error: Could not open audio decoder."; |
| | | avio_context_free(&avio_ctx); |
| | | avformat_close_input(&formatContext); |
| | |
| | | av_packet_free(&packet); |
| | | av_frame_free(&frame); |
| | | |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | free(speech_buff); |
| | | } |
| | | offset = 0; |
| | | |
| | | speech_len = (resampled_buffers.size()) / 2; |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | memcpy((void*)speech_buff, (const void*)resampled_buffers.data(), speech_len * sizeof(int16_t)); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | if(speech_data){ |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | float scale = 1; |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | scale = 32768.0f; |
| | | } |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | for (int32_t i = 0; i < speech_len; ++i) { |
| | | int16_t val = (int16_t)((resampled_buffers[2 * i + 1] << 8) | resampled_buffers[2 * i]); |
| | | speech_data[i] = (float)val / scale; |
| | | } |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | | frame_queue.push(frame); |
| | | |
| | | return true; |
| | | } |
| | | else |
| | | }else{ |
| | | return false; |
| | | } |
| | | |
| | | #endif |
| | | } |
| | | |
| | |
| | | bool Audio::LoadWav(const char *filename, int32_t* sampling_rate, bool resample) |
| | | { |
| | | WaveHeader header; |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | if (speech_buff != nullptr) { |
| | | free(speech_buff); |
| | | speech_buff = nullptr; |
| | | } |
| | | |
| | | offset = 0; |
| | |
| | | bool Audio::LoadWav2Char(const char *filename, int32_t* sampling_rate) |
| | | { |
| | | WaveHeader header; |
| | | if (speech_char != NULL) { |
| | | if (speech_char != nullptr) { |
| | | free(speech_char); |
| | | speech_char = nullptr; |
| | | } |
| | | offset = 0; |
| | | std::ifstream is(filename, std::ifstream::binary); |
| | |
| | | bool Audio::LoadWav(const char* buf, int n_file_len, int32_t* sampling_rate) |
| | | { |
| | | WaveHeader header; |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | if (speech_buff != nullptr) { |
| | | free(speech_buff); |
| | | speech_buff = nullptr; |
| | | } |
| | | offset = 0; |
| | | |
| | | std::memcpy(&header, buf, sizeof(header)); |
| | | |
| | |
| | | |
| | | bool Audio::LoadPcmwav(const char* buf, int n_buf_len, int32_t* sampling_rate) |
| | | { |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | free(speech_buff); |
| | | } |
| | | offset = 0; |
| | | |
| | | speech_len = n_buf_len / 2; |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t)); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | if(speech_data){ |
| | | float scale = 1; |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | scale = 32768.0f; |
| | | } |
| | | const uint8_t* byte_buf = reinterpret_cast<const uint8_t*>(buf); |
| | | for (int32_t i = 0; i < speech_len; ++i) { |
| | | int16_t val = (int16_t)((byte_buf[2 * i + 1] << 8) | byte_buf[2 * i]); |
| | | speech_data[i] = (float)val / scale; |
| | | } |
| | | |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | } |
| | | |
| | | //resample |
| | | if(*sampling_rate != dest_sample_rate){ |
| | | WavResample(*sampling_rate, speech_data, speech_len); |
| | |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | | frame_queue.push(frame); |
| | | |
| | | return true; |
| | | |
| | | } |
| | | else |
| | | }else{ |
| | | return false; |
| | | } |
| | | } |
| | | |
| | | bool Audio::LoadPcmwavOnline(const char* buf, int n_buf_len, int32_t* sampling_rate) |
| | | { |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | } |
| | | if (speech_buff != NULL) { |
| | | free(speech_buff); |
| | | } |
| | | if (speech_char != NULL) { |
| | | free(speech_char); |
| | | speech_data = nullptr; |
| | | } |
| | | |
| | | speech_len = n_buf_len / 2; |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t)); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | if(speech_data){ |
| | | float scale = 1; |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | scale = 32768.0f; |
| | | } |
| | | const uint8_t* byte_buf = reinterpret_cast<const uint8_t*>(buf); |
| | | for (int32_t i = 0; i < speech_len; ++i) { |
| | | int16_t val = (int16_t)((byte_buf[2 * i + 1] << 8) | byte_buf[2 * i]); |
| | | speech_data[i] = (float)val / scale; |
| | | } |
| | | |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | } |
| | | |
| | | //resample |
| | | if(*sampling_rate != dest_sample_rate){ |
| | | WavResample(*sampling_rate, speech_data, speech_len); |
| | |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | | frame_queue.push(frame); |
| | | |
| | | return true; |
| | | |
| | | } |
| | | else |
| | | }else{ |
| | | return false; |
| | | } |
| | | } |
| | | |
| | | bool Audio::LoadPcmwav(const char* filename, int32_t* sampling_rate, bool resample) |
| | | { |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | if (speech_buff != nullptr) { |
| | | free(speech_buff); |
| | | speech_buff = nullptr; |
| | | } |
| | | offset = 0; |
| | | |
| | |
| | | |
| | | bool Audio::LoadPcmwav2Char(const char* filename, int32_t* sampling_rate) |
| | | { |
| | | if (speech_char != NULL) { |
| | | if (speech_char != nullptr) { |
| | | free(speech_char); |
| | | speech_char = nullptr; |
| | | } |
| | | offset = 0; |
| | | |
| | |
| | | |
| | | bool Audio::LoadOthers2Char(const char* filename) |
| | | { |
| | | if (speech_char != NULL) { |
| | | if (speech_char != nullptr) { |
| | | free(speech_char); |
| | | speech_char = nullptr; |
| | | } |
| | | |
| | | FILE* fp; |
| | |
| | | new_data[tmp_off + i] = speech_data[ii]; |
| | | } |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | speech_data = new_data; |
| | | speech_len = num_new_samples; |
| | | |
| | |
| | | frame_queue.pop(); |
| | | int sp_len = frame->GetLen(); |
| | | delete frame; |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | |
| | | std::vector<float> pcm_data(speech_data, speech_data+sp_len); |
| | | vector<std::vector<int>> vad_segments = (offline_stream->vad_handle)->Infer(pcm_data); |
| | |
| | | frame->SetStart(start); |
| | | frame->SetEnd(end); |
| | | frame_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | } |
| | | |
| | |
| | | frame_queue.pop(); |
| | | int sp_len = frame->GetLen(); |
| | | delete frame; |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | |
| | | std::vector<float> pcm_data(speech_data, speech_data+sp_len); |
| | | vad_segments = vad_obj->Infer(pcm_data, input_finished); |
| | |
| | | frame_queue.pop(); |
| | | int sp_len = frame->GetLen(); |
| | | delete frame; |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | |
| | | std::vector<float> pcm_data(speech_data, speech_data+sp_len); |
| | | vector<std::vector<int>> vad_segments = vad_obj->Infer(pcm_data, input_finished); |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * step); |
| | | memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float)); |
| | | asr_online_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | speech_start += step/seg_sample; |
| | | } |
| | | } |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * (end-start)); |
| | | memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float)); |
| | | asr_online_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | |
| | | if(asr_mode != ASR_ONLINE){ |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * (end-start)); |
| | | memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float)); |
| | | asr_offline_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | |
| | | speech_start = -1; |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * step); |
| | | memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float)); |
| | | asr_online_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | speech_start += step/seg_sample; |
| | | } |
| | | } |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * (end-offline_start)); |
| | | memcpy(frame->data, all_samples.data()+offline_start-offset, (end-offline_start)*sizeof(float)); |
| | | asr_offline_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | |
| | | if(asr_mode != ASR_OFFLINE){ |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * step); |
| | | memcpy(frame->data, all_samples.data()+start-offset+sample_offset, step*sizeof(float)); |
| | | asr_online_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | }else{ |
| | | frame = new AudioFrame(0); |
| | |
| | | frame->global_start = speech_start; // in this case start >= end |
| | | frame->global_end = speech_end_i; |
| | | asr_online_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | } |
| | | speech_start = -1; |
| | |
| | | std::vector<std::vector<int>> split_id_vec; |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | |
| | | LoadCfgFromYaml(cfg_file.c_str(), opt_); |
| | | while (getline(ifs_hws, line)) { |
| | |
| | | BuildGraph(split_id_vec, custom_weight); |
| | | ifs_hws.close(); |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Build bias lm takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | std::vector<std::vector<int>> split_id_vec; |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | opt_.incre_bias_ = inc_bias; |
| | | for (const pair<string, int>& kv : hws_map) { |
| | | float score = 1.0f; |
| | |
| | | } |
| | | BuildGraph(split_id_vec, custom_weight); |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Build bias lm takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | } |
| | | |
| | | bool EncodeConverter::IsAllChineseCharactor(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | |
| | |
| | | } |
| | | |
| | | bool EncodeConverter::HasAlpha(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | for (size_t i = 0; i < ilen; i++) { |
| | |
| | | |
| | | |
| | | bool EncodeConverter::IsAllAlpha(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | for (size_t i = 0; i < ilen; i++) { |
| | |
| | | } |
| | | |
| | | bool EncodeConverter::IsAllAlphaAndPunct(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | bool flag1 = HasAlpha(pu8, ilen); |
| | |
| | | } |
| | | |
| | | bool EncodeConverter::IsAllAlphaAndDigit(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | bool flag1 = HasAlpha(pu8, ilen); |
| | |
| | | return true; |
| | | } |
| | | bool EncodeConverter::IsAllAlphaAndDigitAndBlank(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | for (size_t i = 0; i < ilen; i++) { |
| | |
| | | bool EncodeConverter::NeedAddTailBlank(std::string str) { |
| | | U8CHAR_T *pu8 = (U8CHAR_T*)str.data(); |
| | | size_t ilen = str.size(); |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | if (IsAllAlpha(pu8, ilen) || IsAllAlphaAndPunct(pu8, ilen) || IsAllAlphaAndDigit(pu8, ilen)) { |
| | |
| | | #ifdef _MSC_VER |
| | | // convert to the local ansi page |
| | | static std::string UTF8ToLocaleAnsi(const std::string& strUTF8) { |
| | | int len = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, NULL, 0); |
| | | int len = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, nullptr, 0); |
| | | unsigned short*wszGBK = new unsigned short[len + 1]; |
| | | memset(wszGBK, 0, len * 2 + 2); |
| | | MultiByteToWideChar(CP_UTF8, 0, (LPCCH)strUTF8.c_str(), -1, (LPWSTR)wszGBK, len); |
| | | |
| | | len = WideCharToMultiByte(CP_ACP, 0, (LPCWCH)wszGBK, -1, NULL, 0, NULL, NULL); |
| | | len = WideCharToMultiByte(CP_ACP, 0, (LPCWCH)wszGBK, -1, nullptr, 0, nullptr, nullptr); |
| | | char *szGBK = new char[len + 1]; |
| | | memset(szGBK, 0, len + 1); |
| | | WideCharToMultiByte(CP_ACP, 0, (LPCWCH)wszGBK, -1, szGBK, len, NULL, NULL); |
| | | WideCharToMultiByte(CP_ACP, 0, (LPCWCH)wszGBK, -1, szGBK, len, nullptr, nullptr); |
| | | std::string strTemp(szGBK); |
| | | delete[]szGBK; |
| | | delete[]wszGBK; |
| | |
| | | |
| | | audio->Split(vad_online_handle, chunk_len, input_finished, mode); |
| | | |
| | | funasr::AudioFrame* frame = NULL; |
| | | funasr::AudioFrame* frame = nullptr; |
| | | while(audio->FetchChunck(frame) > 0){ |
| | | string msg = ((funasr::ParaformerOnline*)asr_online_handle)->Forward(frame->data, frame->len, frame->is_final); |
| | | if(mode == ASR_ONLINE){ |
| | |
| | | }else if(mode == ASR_TWO_PASS){ |
| | | p_result->msg += msg; |
| | | } |
| | | if(frame != NULL){ |
| | | if(frame != nullptr){ |
| | | delete frame; |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | } |
| | | |
| | |
| | | if (!(p_result->stamp).empty()){ |
| | | p_result->stamp_sents = funasr::TimestampSentence(p_result->tpass_msg, p_result->stamp); |
| | | } |
| | | if(frame != NULL){ |
| | | if(frame != nullptr){ |
| | | delete frame; |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | } |
| | | |
| | |
| | | |
| | | SetJiebaRes(jieba_dict_trie_, jieba_model_); |
| | | }else { |
| | | jieba_dict_trie_ = NULL; |
| | | jieba_model_ = NULL; |
| | | jieba_dict_trie_ = nullptr; |
| | | jieba_model_ = nullptr; |
| | | } |
| | | } |
| | | |
| | |
| | | download_model_dir="/workspace/models" |
| | | model_dir="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx" |
| | | vad_dir="damo/speech_fsmn_vad_zh-cn-16k-common-onnx" |
| | | punc_dir="damo/punc_ct-transformer_cn-en-common-vocab471067-large-onnx" |
| | | itn_dir="thuduj12/fst_itn_zh" |
| | | lm_dir="damo/speech_ngram_lm_zh-cn-ai-wesp-fst" |
| | | punc_dir="" |
| | | itn_dir="" |
| | | lm_dir="" |
| | | port=10095 |
| | | certfile="../../../ssl_key/server.crt" |
| | | keyfile="../../../ssl_key/server.key" |
| | | hotword="../../hotwords.txt" |
| | | # set decoder_thread_num |
| | | decoder_thread_num=$(cat /proc/cpuinfo | grep "processor"|wc -l) || { echo "Get cpuinfo failed. Set decoder_thread_num = 32"; decoder_thread_num=32; } |
| | | decoder_thread_num=8 |
| | | multiple_io=16 |
| | | io_thread_num=$(( (decoder_thread_num + multiple_io - 1) / multiple_io )) |
| | | model_thread_num=1 |
| | | model_thread_num=5 |
| | | |
| | | . ../egs/aishell/transformer/utils/parse_options.sh || exit 1; |
| | | . ./tools/utils/parse_options.sh || exit 1; |
| | | |
| | | if [ -z "$certfile" ] || [ "$certfile" = "0" ]; then |
| | | certfile="" |
| | |
| | | io_thread_num=$(( (decoder_thread_num + multiple_io - 1) / multiple_io )) |
| | | model_thread_num=1 |
| | | |
| | | . ../egs/aishell/transformer/utils/parse_options.sh || exit 1; |
| | | . ./tools/utils/parse_options.sh || exit 1; |
| | | |
| | | if [ -z "$certfile" ] || [ "$certfile" = "0" ]; then |
| | | certfile="" |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey); |
| | | # Arnab Ghoshal, Karel Vesely |
| | | |
| | | # Licensed under the Apache License, Version 2.0 (the "License"); |
| | | # you may not use this file except in compliance with the License. |
| | | # You may obtain a copy of the License at |
| | | # |
| | | # http://www.apache.org/licenses/LICENSE-2.0 |
| | | # |
| | | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| | | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED |
| | | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, |
| | | # MERCHANTABLITY OR NON-INFRINGEMENT. |
| | | # See the Apache 2 License for the specific language governing permissions and |
| | | # limitations under the License. |
| | | |
| | | |
| | | # Parse command-line options. |
| | | # To be sourced by another script (as in ". parse_options.sh"). |
| | | # Option format is: --option-name arg |
| | | # and shell variable "option_name" gets set to value "arg." |
| | | # The exception is --help, which takes no arguments, but prints the |
| | | # $help_message variable (if defined). |
| | | |
| | | |
| | | ### |
| | | ### The --config file options have lower priority to command line |
| | | ### options, so we need to import them first... |
| | | ### |
| | | |
| | | # Now import all the configs specified by command-line, in left-to-right order |
| | | for ((argpos=1; argpos<$#; argpos++)); do |
| | | if [ "${!argpos}" == "--config" ]; then |
| | | argpos_plus1=$((argpos+1)) |
| | | config=${!argpos_plus1} |
| | | [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 |
| | | . $config # source the config file. |
| | | fi |
| | | done |
| | | |
| | | |
| | | ### |
| | | ### Now we process the command line options |
| | | ### |
| | | while true; do |
| | | [ -z "${1:-}" ] && break; # break if there are no arguments |
| | | case "$1" in |
| | | # If the enclosing script is called with --help option, print the help |
| | | # message and exit. Scripts should put help messages in $help_message |
| | | --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; |
| | | else printf "$help_message\n" 1>&2 ; fi; |
| | | exit 0 ;; |
| | | --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" |
| | | exit 1 ;; |
| | | # If the first command-line argument begins with "--" (e.g. --foo-bar), |
| | | # then work out the variable name as $name, which will equal "foo_bar". |
| | | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; |
| | | # Next we test whether the variable in question is undefned-- if so it's |
| | | # an invalid option and we die. Note: $0 evaluates to the name of the |
| | | # enclosing script. |
| | | # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar |
| | | # is undefined. We then have to wrap this test inside "eval" because |
| | | # foo_bar is itself inside a variable ($name). |
| | | eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; |
| | | |
| | | oldval="`eval echo \\$$name`"; |
| | | # Work out whether we seem to be expecting a Boolean argument. |
| | | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then |
| | | was_bool=true; |
| | | else |
| | | was_bool=false; |
| | | fi |
| | | |
| | | # Set the variable to the right value-- the escaped quotes make it work if |
| | | # the option had spaces, like --cmd "queue.pl -sync y" |
| | | eval $name=\"$2\"; |
| | | |
| | | # Check that Boolean-valued arguments are really Boolean. |
| | | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then |
| | | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 |
| | | exit 1; |
| | | fi |
| | | shift 2; |
| | | ;; |
| | | *) break; |
| | | esac |
| | | done |
| | | |
| | | |
| | | # Check for an empty argument to the --cmd option, which can easily occur as a |
| | | # result of scripting errors. |
| | | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; |
| | | |
| | | |
| | | true; # so this script returns exit code 0. |
| | |
| | | } |
| | | |
| | | // hotwords: fst/nn |
| | | if(msg_data->hotwords_embedding == NULL){ |
| | | if(msg_data->hotwords_embedding == nullptr){ |
| | | std::unordered_map<std::string, int> merged_hws_map; |
| | | std::string nn_hotwords = ""; |
| | | |
| | |
| | | msg_data->msg["audio_fs"] = jsonresult["audio_fs"]; |
| | | } |
| | | if (jsonresult.contains("chunk_size")) { |
| | | if (msg_data->tpass_online_handle == NULL) { |
| | | if (msg_data->tpass_online_handle == nullptr) { |
| | | std::vector<int> chunk_size_vec = |
| | | jsonresult["chunk_size"].get<std::vector<int>>(); |
| | | // check chunk_size_vec |
| | |
| | | if ((jsonresult["is_speaking"] == false || |
| | | jsonresult["is_finished"] == true) && |
| | | msg_data->msg["is_eof"] != true && |
| | | msg_data->hotwords_embedding != NULL) { |
| | | msg_data->hotwords_embedding != nullptr) { |
| | | LOG(INFO) << "client done"; |
| | | |
| | | // if it is in final message, post the sample_data to decode |
| | |
| | | |
| | | try{ |
| | | // post to decode |
| | | if (msg_data->msg["is_eof"] != true && msg_data->hotwords_embedding != NULL) { |
| | | if (msg_data->msg["is_eof"] != true && msg_data->hotwords_embedding != nullptr) { |
| | | std::vector<std::vector<float>> hotwords_embedding_(*(msg_data->hotwords_embedding)); |
| | | msg_data->strand_->post( |
| | | std::bind(&WebSocketServer::do_decoder, this, |
| | |
| | | nlohmann::json msg; |
| | | std::shared_ptr<std::vector<char>> samples; |
| | | std::shared_ptr<std::vector<std::vector<std::string>>> punc_cache; |
| | | std::shared_ptr<std::vector<std::vector<float>>> hotwords_embedding=NULL; |
| | | std::shared_ptr<std::vector<std::vector<float>>> hotwords_embedding=nullptr; |
| | | std::shared_ptr<websocketpp::lib::mutex> thread_lock; // lock for each connection |
| | | FUNASR_HANDLE tpass_online_handle=NULL; |
| | | FUNASR_HANDLE tpass_online_handle=nullptr; |
| | | std::string online_res = ""; |
| | | std::string tpass_res = ""; |
| | | std::shared_ptr<asio::io_context::strand> strand_; // for data execute in order |
| | | FUNASR_DEC_HANDLE decoder_handle=NULL; |
| | | FUNASR_DEC_HANDLE decoder_handle=nullptr; |
| | | } FUNASR_MESSAGE; |
| | | |
| | | // See https://wiki.mozilla.org/Security/Server_Side_TLS for more details about |
| | |
| | | asio::io_context& io_decoder_; // threads for asr decoder |
| | | // std::ofstream fout; |
| | | // FUNASR_HANDLE asr_handle; // asr engine handle |
| | | FUNASR_HANDLE tpass_handle=NULL; |
| | | FUNASR_HANDLE tpass_handle=nullptr; |
| | | bool isonline = true; // online or offline engine, now only support offline |
| | | bool is_ssl = true; |
| | | server* server_; // websocket server |
| | |
| | | std::string stamp_sents=""; |
| | | try{ |
| | | FUNASR_RESULT Result = FunOfflineInferBuffer( |
| | | asr_handle, buffer.data(), buffer.size(), RASR_NONE, NULL, |
| | | asr_handle, buffer.data(), buffer.size(), RASR_NONE, nullptr, |
| | | hotwords_embedding, audio_fs, wav_format, itn, decoder_handle); |
| | | if (Result != NULL){ |
| | | if (Result != nullptr){ |
| | | asr_result = FunASRGetResult(Result, 0); // get decode result |
| | | stamp_res = FunASRGetStamp(Result); |
| | | stamp_sents = FunASRGetStampSents(Result); |
| | | FunASRFreeResult(Result); |
| | | } else{ |
| | | LOG(ERROR) << "FUNASR_RESULT is NULL."; |
| | | std::this_thread::sleep_for(std::chrono::milliseconds(20)); |
| | | LOG(ERROR) << "FUNASR_RESULT is nullptr."; |
| | | } |
| | | }catch (std::exception const& e) { |
| | | LOG(ERROR) << e.what(); |
| | |
| | | } |
| | | |
| | | // hotwords: fst/nn |
| | | if(msg_data->hotwords_embedding == NULL){ |
| | | if(msg_data->hotwords_embedding == nullptr){ |
| | | std::unordered_map<std::string, int> merged_hws_map; |
| | | std::string nn_hotwords = ""; |
| | | |
| | |
| | | if ((jsonresult["is_speaking"] == false || |
| | | jsonresult["is_finished"] == true) && |
| | | msg_data->msg["is_eof"] != true && |
| | | msg_data->hotwords_embedding != NULL) { |
| | | msg_data->hotwords_embedding != nullptr) { |
| | | LOG(INFO) << "client done"; |
| | | // for offline, send all receive data to decoder engine |
| | | std::vector<std::vector<float>> hotwords_embedding_(*(msg_data->hotwords_embedding)); |
| | |
| | | typedef struct { |
| | | nlohmann::json msg; |
| | | std::shared_ptr<std::vector<char>> samples; |
| | | std::shared_ptr<std::vector<std::vector<float>>> hotwords_embedding=NULL; |
| | | std::shared_ptr<std::vector<std::vector<float>>> hotwords_embedding=nullptr; |
| | | std::shared_ptr<websocketpp::lib::mutex> thread_lock; // lock for each connection |
| | | FUNASR_DEC_HANDLE decoder_handle=NULL; |
| | | FUNASR_DEC_HANDLE decoder_handle=nullptr; |
| | | } FUNASR_MESSAGE; |
| | | |
| | | // See https://wiki.mozilla.org/Security/Server_Side_TLS for more details about |