Merge pull request #1249 from alibaba-damo-academy/main
code sync
| | |
| | | | paraformer-zh-spk <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/summary) [🤗]() ) | speech recognition with speaker diarization, with timestamps, non-streaming | 60000 hours, Mandarin | 220M | |
| | | | <nobr>paraformer-zh-online <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗]() )</nobr> | speech recognition, streaming | 60000 hours, Mandarin | 220M | |
| | | | paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗]() ) | speech recognition, with timestamps, non-streaming | 50000 hours, English | 220M | |
| | | | paraformer-en-spk <br> ([⭐]() [🤗]() ) | speech recognition with speaker diarization, non-streaming | 50000 hours, English | 220M | |
| | | | conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗]() ) | speech recognition, non-streaming | 50000 hours, English | 220M | |
| | | | ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗]() ) | punctuation restoration | 100M, Mandarin and English | 1.1G | |
| | | | fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗]() ) | voice activity detection | 5000 hours, Mandarin and English | 0.4M | |
| | |
| | | punc_model="ct-punc-c", punc_model_revision="v2.0.2", \ |
| | | spk_model="cam++", spk_model_revision="v2.0.2") |
| | | res = model(input=f"{model.model_path}/example/asr_example.wav", |
| | | batch_size=64, |
| | | hotword='魔搭') |
| | | print(res) |
| | | ``` |
| | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="fsmn-vad", model_revision="v2.0.2") |
| | | |
| | | wav_file = f"{model.model_path}/example/asr_example.wav" |
| | | res = model(input=wav_file) |
| | | print(res) |
| | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="ct-punc", model_revision="v2.0.2") |
| | | |
| | | res = model(input="那今天的会就到这里吧 happy new year 明年见") |
| | | print(res) |
| | | ``` |
| | |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="fa-zh", model_revision="v2.0.2") |
| | | |
| | | wav_file = f"{model.model_path}/example/asr_example.wav" |
| | | text_file = f"{model.model_path}/example/text.txt" |
| | | res = model(input=(wav_file, text_file), data_type=("sound", "text")) |
| | | print(res) |
| | | ``` |
| | |
| | | | 模型名字 | 任务详情 | 训练数据 | 参数量 | |
| | | |:------------------------------------------------------------------------------------------------------------------------------------------------------------:|:------------------:|:------------:|:----:| |
| | | | paraformer-zh <br> ([⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) [🤗]() ) | 语音识别,带时间戳输出,非实时 | 60000小时,中文 | 220M | |
| | | | paraformer-zh-spk <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/summary) [🤗]() ) | 分角色语音识别,带时间戳输出,非实时 | 60000小时,中文 | 220M | |
| | | | paraformer-zh-streaming <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗]() ) | 语音识别,实时 | 60000小时,中文 | 220M | |
| | | | paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗]() ) | 语音识别,非实时 | 50000小时,英文 | 220M | |
| | | | paraformer-en-spk <br> ([⭐]() [🤗]() ) | 语音识别,非实时 | 50000小时,英文 | 220M | |
| | | | conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗]() ) | 语音识别,非实时 | 50000小时,英文 | 220M | |
| | | | ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗]() ) | 标点恢复 | 100M,中文与英文 | 1.1G | |
| | | | fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗]() ) | 语音端点检测,实时 | 5000小时,中文与英文 | 0.4M | |
| | | | fa-zh <br> ( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗]() ) | 字级别时间戳预测 | 50000小时,中文 | 38M | |
| | | | paraformer-zh-spk <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/summary) [🤗]() ) | 分角色语音识别,带时间戳输出,非实时 | 60000小时,中文 | 220M | |
| | | | paraformer-zh-streaming <br> ( [⭐](https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [🤗]() ) | 语音识别,实时 | 60000小时,中文 | 220M | |
| | | | paraformer-en <br> ( [⭐](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [🤗]() ) | 语音识别,非实时 | 50000小时,英文 | 220M | |
| | | | conformer-en <br> ( [⭐](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [🤗]() ) | 语音识别,非实时 | 50000小时,英文 | 220M | |
| | | | ct-punc <br> ( [⭐](https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [🤗]() ) | 标点恢复 | 100M,中文与英文 | 1.1G | |
| | | | fsmn-vad <br> ( [⭐](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [🤗]() ) | 语音端点检测,实时 | 5000小时,中文与英文 | 0.4M | |
| | | | fa-zh <br> ( [⭐](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [🤗]() ) | 字级别时间戳预测 | 50000小时,中文 | 38M | |
| | | |
| | | |
| | | <a name="快速开始"></a> |
| | |
| | | ### 非实时语音识别 |
| | | ```python |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="paraformer-zh") |
| | | # for the long duration wav, you could add vad model |
| | | # model = AutoModel(model="paraformer-zh", vad_model="fsmn-vad", punc_model="ct-punc") |
| | | |
| | | res = model(input="asr_example_zh.wav", batch_size=64) |
| | | # paraformer-zh is a multi-functional asr model |
| | | # use vad, punc, spk or not as you need |
| | | model = AutoModel(model="paraformer-zh", model_revision="v2.0.2", \ |
| | | vad_model="fsmn-vad", vad_model_revision="v2.0.2", \ |
| | | punc_model="ct-punc-c", punc_model_revision="v2.0.2", \ |
| | | spk_model="cam++", spk_model_revision="v2.0.2") |
| | | res = model(input=f"{model.model_path}/example/asr_example.wav", |
| | | batch_size=64, |
| | | hotword='魔搭') |
| | | print(res) |
| | | ``` |
| | | 注:`model_hub`:表示模型仓库,`ms`为选择modelscope下载,`hf`为选择huggingface下载。 |
| | |
| | | encoder_chunk_look_back = 4 #number of chunks to lookback for encoder self-attention |
| | | decoder_chunk_look_back = 1 #number of encoder chunks to lookback for decoder cross-attention |
| | | |
| | | model = AutoModel(model="paraformer-zh-streaming", model_revision="v2.0.2") |
| | | |
| | | import soundfile |
| | | import os |
| | |
| | | ```python |
| | | from funasr import AutoModel |
| | | |
| | | model = AutoModel(model="ct-punc", model_revision="v2.0.2") |
| | | |
| | | res = model(input="那今天的会就到这里吧 happy new year 明年见") |
| | | print(res) |
| | |
| | | model = AutoModel(model="fa-zh", model_revision="v2.0.0") |
| | | |
| | | wav_file = f"{model.model_path}/example/asr_example.wav" |
| | | text_file = f"{model.model_path}/example/text.txt" |
| | | res = model(input=(wav_file, text_file), data_type=("sound", "text")) |
| | | print(res) |
| | | ``` |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result = FunTpassInferBuffer(tpass_handle, tpass_online_handle, speech_buff+sample_offset, step, punc_cache, is_final, |
| | | sampling_rate_, "pcm", (ASR_TYPE)asr_mode_, hotwords_embedding, true, decoder_handle); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | n_total_time += taking_micros; |
| | |
| | | GetValue(asr_mode, ASR_MODE, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = onnx_thread.getValue(); |
| | | int asr_mode_ = -1; |
| | | if(model_path[ASR_MODE] == "offline"){ |
| | |
| | | am_sc = am_scale.getValue(); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | GetValue(asr_mode, ASR_MODE, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = onnx_thread.getValue(); |
| | | int asr_mode_ = -1; |
| | | if(model_path[ASR_MODE] == "offline"){ |
| | |
| | | // init wfst decoder |
| | | FUNASR_DEC_HANDLE decoder_handle = FunASRWfstDecoderInit(tpass_handle, ASR_TWO_PASS, glob_beam, lat_beam, am_sc); |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result = FunTpassInferBuffer(tpass_handle, tpass_online_handle, |
| | | speech_buff+sample_offset, step, punc_cache, is_final, sampling_rate_, "pcm", |
| | | (ASR_TYPE)asr_mode_, hotwords_embedding, true, decoder_handle); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | |
| | |
| | | GetValue(txt_path, TXT_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE punc_hanlde=CTTransformerInit(model_path, thread_num); |
| | | |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | |
| | | long taking_micros = 0; |
| | | for(auto& txt_str : txt_list){ |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result=CTTransformerInfer(punc_hanlde, txt_str.c_str(), RASR_NONE, NULL); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result=CTTransformerInfer(punc_hanlde, txt_str.c_str(), RASR_NONE, nullptr); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | string msg = FunASRGetResult(result, 0); |
| | |
| | | // warm up |
| | | for (size_t i = 0; i < 1; i++) |
| | | { |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_handle, wav_list[0].c_str(), RASR_NONE, NULL, hotwords_embedding, audio_fs, true, decoder_handle); |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_handle, wav_list[0].c_str(), RASR_NONE, nullptr, hotwords_embedding, audio_fs, true, decoder_handle); |
| | | if(result){ |
| | | FunASRFreeResult(result); |
| | | } |
| | |
| | | break; |
| | | } |
| | | |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_handle, wav_list[i].c_str(), RASR_NONE, NULL, hotwords_embedding, audio_fs, true, decoder_handle); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_handle, wav_list[i].c_str(), RASR_NONE, nullptr, hotwords_embedding, audio_fs, true, decoder_handle); |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | n_total_time += taking_micros; |
| | |
| | | |
| | | void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path) |
| | | { |
| | | if (value_arg.isSet()){ |
| | | model_path.insert({key, value_arg.getValue()}); |
| | | LOG(INFO)<< key << " : " << value_arg.getValue(); |
| | | } |
| | | model_path.insert({key, value_arg.getValue()}); |
| | | LOG(INFO)<< key << " : " << value_arg.getValue(); |
| | | } |
| | | |
| | | int main(int argc, char *argv[]) |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_HANDLE asr_handle=FunOfflineInit(model_path, 1); |
| | | |
| | | if (!asr_handle) |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE vad_hanlde=FsmnVadInit(model_path, thread_num); |
| | | |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | for (int i = 0; i < wav_list.size(); i++) { |
| | | auto& wav_file = wav_list[i]; |
| | | auto& wav_id = wav_ids[i]; |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result=FsmnVadInfer(vad_hanlde, wav_file.c_str(), NULL, audio_fs.getValue()); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result=FsmnVadInfer(vad_hanlde, wav_file.c_str(), nullptr, audio_fs.getValue()); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | |
| | |
| | | |
| | | void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path) |
| | | { |
| | | if (value_arg.isSet()){ |
| | | model_path.insert({key, value_arg.getValue()}); |
| | | LOG(INFO)<< key << " : " << value_arg.getValue(); |
| | | } |
| | | model_path.insert({key, value_arg.getValue()}); |
| | | LOG(INFO)<< key << " : " << value_arg.getValue(); |
| | | } |
| | | |
| | | int main(int argc, char** argv) |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE asr_hanlde=FunOfflineInit(model_path, thread_num); |
| | | |
| | |
| | | LOG(INFO) << "hotword path: " << hotword_path; |
| | | funasr::ExtractHws(hotword_path, hws_map, nn_hotwords_); |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | for (int i = 0; i < wav_list.size(); i++) { |
| | | auto& wav_file = wav_list[i]; |
| | | auto& wav_id = wav_ids[i]; |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_hanlde, wav_file.c_str(), RASR_NONE, NULL, hotwords_embedding, audio_fs.getValue(), true, decoder_handle); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result=FunOfflineInfer(asr_hanlde, wav_file.c_str(), RASR_NONE, nullptr, hotwords_embedding, audio_fs.getValue(), true, decoder_handle); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE asr_handle=FunASRInit(model_path, thread_num, ASR_ONLINE); |
| | | |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, NULL, is_final, sampling_rate_); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, nullptr, is_final, sampling_rate_); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | |
| | |
| | | GetValue(txt_path, TXT_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE punc_hanlde=CTTransformerInit(model_path, thread_num, PUNC_ONLINE); |
| | | |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | splitString(vad_strs, txt_str, "|"); |
| | | string str_out; |
| | | FUNASR_RESULT result = nullptr; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | for(auto& vad_str:vad_strs){ |
| | | result=CTTransformerInfer(punc_hanlde, vad_str.c_str(), RASR_NONE, NULL, PUNC_ONLINE, result); |
| | | result=CTTransformerInfer(punc_hanlde, vad_str.c_str(), RASR_NONE, nullptr, PUNC_ONLINE, result); |
| | | if(result){ |
| | | string msg = CTTransformerGetResult(result, 0); |
| | | str_out += msg; |
| | | LOG(INFO)<<"Online result: "<<msg; |
| | | } |
| | | } |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO)<<"Results: "<<str_out; |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, NULL, is_final, sampling_rate_); |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, nullptr, is_final, sampling_rate_); |
| | | if (result) |
| | | { |
| | | FunASRFreeResult(result); |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, NULL, is_final, sampling_rate_); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result = FunASRInferBuffer(online_handle, speech_buff+sample_offset, step, RASR_NONE, nullptr, is_final, sampling_rate_); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | n_total_time += taking_micros; |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_HANDLE asr_handle=FunASRInit(model_path, 1, ASR_ONLINE); |
| | | |
| | | if (!asr_handle) |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | GetValue(wav_path, WAV_PATH, model_path); |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | int thread_num = 1; |
| | | FUNASR_HANDLE vad_hanlde=FsmnVadInit(model_path, thread_num); |
| | | |
| | |
| | | exit(-1); |
| | | } |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | } else { |
| | | is_final = false; |
| | | } |
| | | gettimeofday(&start, NULL); |
| | | FUNASR_RESULT result = FsmnVadInferBuffer(online_hanlde, speech_buff+sample_offset, step, NULL, is_final, sampling_rate_); |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | FUNASR_RESULT result = FsmnVadInferBuffer(online_hanlde, speech_buff+sample_offset, step, nullptr, is_final, sampling_rate_); |
| | | gettimeofday(&end, nullptr); |
| | | seconds = (end.tv_sec - start.tv_sec); |
| | | taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | |
| | |
| | | void *p1; // original block |
| | | void **p2; // aligned block |
| | | int offset = alignment - 1 + sizeof(void *); |
| | | if ((p1 = (void *)malloc(required_bytes + offset)) == NULL) { |
| | | return NULL; |
| | | if ((p1 = (void *)malloc(required_bytes + offset)) == nullptr) { |
| | | return nullptr; |
| | | } |
| | | p2 = (void **)(((size_t)(p1) + offset) & ~(alignment - 1)); |
| | | p2[-1] = p1; |
| | |
| | | }; |
| | | ~AudioWindow(){ |
| | | free(window); |
| | | window = nullptr; |
| | | }; |
| | | int put(int val) |
| | | { |
| | |
| | | len = end - start; |
| | | } |
| | | AudioFrame::~AudioFrame(){ |
| | | if(data != NULL){ |
| | | if(data != nullptr){ |
| | | free(data); |
| | | data = nullptr; |
| | | } |
| | | } |
| | | int AudioFrame::SetStart(int val) |
| | |
| | | |
| | | Audio::Audio(int data_type) : dest_sample_rate(MODEL_SAMPLE_RATE), data_type(data_type) |
| | | { |
| | | speech_buff = NULL; |
| | | speech_data = NULL; |
| | | speech_buff = nullptr; |
| | | speech_data = nullptr; |
| | | align_size = 1360; |
| | | seg_sample = dest_sample_rate / 1000; |
| | | } |
| | | |
| | | Audio::Audio(int model_sample_rate, int data_type) : dest_sample_rate(model_sample_rate), data_type(data_type) |
| | | { |
| | | speech_buff = NULL; |
| | | speech_data = NULL; |
| | | speech_buff = nullptr; |
| | | speech_data = nullptr; |
| | | align_size = 1360; |
| | | seg_sample = dest_sample_rate / 1000; |
| | | } |
| | | |
| | | Audio::Audio(int model_sample_rate, int data_type, int size) : dest_sample_rate(model_sample_rate), data_type(data_type) |
| | | { |
| | | speech_buff = NULL; |
| | | speech_data = NULL; |
| | | speech_buff = nullptr; |
| | | speech_data = nullptr; |
| | | align_size = (float)size; |
| | | seg_sample = dest_sample_rate / 1000; |
| | | } |
| | | |
| | | Audio::~Audio() |
| | | { |
| | | if (speech_buff != NULL) { |
| | | if (speech_buff != nullptr) { |
| | | free(speech_buff); |
| | | speech_buff = nullptr; |
| | | } |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_char != NULL) { |
| | | if (speech_char != nullptr) { |
| | | free(speech_char); |
| | | speech_char = nullptr; |
| | | } |
| | | ClearQueue(frame_queue); |
| | | ClearQueue(asr_online_queue); |
| | |
| | | resampler->Resample(waveform, n, true, &samples); |
| | | //reset speech_data |
| | | speech_len = samples.size(); |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | |
| | | #else |
| | | // from file |
| | | AVFormatContext* formatContext = avformat_alloc_context(); |
| | | if (avformat_open_input(&formatContext, filename, NULL, NULL) != 0) { |
| | | if (avformat_open_input(&formatContext, filename, nullptr, nullptr) != 0) { |
| | | LOG(ERROR) << "Error: Could not open input file."; |
| | | avformat_close_input(&formatContext); |
| | | avformat_free_context(formatContext); |
| | | return false; |
| | | } |
| | | |
| | | if (avformat_find_stream_info(formatContext, NULL) < 0) { |
| | | if (avformat_find_stream_info(formatContext, nullptr) < 0) { |
| | | LOG(ERROR) << "Error: Could not open input file."; |
| | | avformat_close_input(&formatContext); |
| | | avformat_free_context(formatContext); |
| | | return false; |
| | | } |
| | | const AVCodec* codec = NULL; |
| | | AVCodecParameters* codecParameters = NULL; |
| | | const AVCodec* codec = nullptr; |
| | | AVCodecParameters* codecParameters = nullptr; |
| | | int audioStreamIndex = av_find_best_stream(formatContext, AVMEDIA_TYPE_AUDIO, -1, -1, &codec, 0); |
| | | if (audioStreamIndex >= 0) { |
| | | codecParameters = formatContext->streams[audioStreamIndex]->codecpar; |
| | |
| | | avcodec_free_context(&codecContext); |
| | | return false; |
| | | } |
| | | if (avcodec_open2(codecContext, codec, NULL) < 0) { |
| | | if (avcodec_open2(codecContext, codec, nullptr) < 0) { |
| | | LOG(ERROR) << "Error: Could not open audio decoder."; |
| | | avformat_close_input(&formatContext); |
| | | avformat_free_context(formatContext); |
| | |
| | | av_packet_free(&packet); |
| | | av_frame_free(&frame); |
| | | |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | free(speech_buff); |
| | | } |
| | | if (speech_char != NULL) { |
| | | if (speech_char != nullptr) { |
| | | free(speech_char); |
| | | speech_char = nullptr; |
| | | } |
| | | offset = 0; |
| | | |
| | |
| | | } |
| | | |
| | | speech_len = (resampled_buffers.size()) / 2; |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | memcpy((void*)speech_buff, (const void*)resampled_buffers.data(), speech_len * sizeof(int16_t)); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | if(speech_data){ |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | float scale = 1; |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | scale = 32768.0f; |
| | | } |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | for (int32_t i = 0; i < speech_len; ++i) { |
| | | int16_t val = (int16_t)((resampled_buffers[2 * i + 1] << 8) | resampled_buffers[2 * i]); |
| | | speech_data[i] = (float)val / scale; |
| | | } |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | | frame_queue.push(frame); |
| | | |
| | | return true; |
| | | } |
| | | else |
| | | }else{ |
| | | return false; |
| | | } |
| | | |
| | | #endif |
| | | } |
| | | |
| | |
| | | } |
| | | AVFormatContext* formatContext = avformat_alloc_context(); |
| | | formatContext->pb = avio_ctx; |
| | | if (avformat_open_input(&formatContext, "", NULL, NULL) != 0) { |
| | | if (avformat_open_input(&formatContext, "", nullptr, nullptr) != 0) { |
| | | LOG(ERROR) << "Error: Could not open input file."; |
| | | avio_context_free(&avio_ctx); |
| | | avformat_close_input(&formatContext); |
| | |
| | | return false; |
| | | } |
| | | |
| | | if (avformat_find_stream_info(formatContext, NULL) < 0) { |
| | | if (avformat_find_stream_info(formatContext, nullptr) < 0) { |
| | | LOG(ERROR) << "Error: Could not find stream information."; |
| | | avio_context_free(&avio_ctx); |
| | | avformat_close_input(&formatContext); |
| | | avformat_free_context(formatContext); |
| | | return false; |
| | | } |
| | | const AVCodec* codec = NULL; |
| | | AVCodecParameters* codecParameters = NULL; |
| | | const AVCodec* codec = nullptr; |
| | | AVCodecParameters* codecParameters = nullptr; |
| | | int audioStreamIndex = av_find_best_stream(formatContext, AVMEDIA_TYPE_AUDIO, -1, -1, &codec, 0); |
| | | if (audioStreamIndex >= 0) { |
| | | codecParameters = formatContext->streams[audioStreamIndex]->codecpar; |
| | |
| | | avcodec_free_context(&codecContext); |
| | | return false; |
| | | } |
| | | if (avcodec_open2(codecContext, codec, NULL) < 0) { |
| | | if (avcodec_open2(codecContext, codec, nullptr) < 0) { |
| | | LOG(ERROR) << "Error: Could not open audio decoder."; |
| | | avio_context_free(&avio_ctx); |
| | | avformat_close_input(&formatContext); |
| | |
| | | av_packet_free(&packet); |
| | | av_frame_free(&frame); |
| | | |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | free(speech_buff); |
| | | } |
| | | offset = 0; |
| | | |
| | | speech_len = (resampled_buffers.size()) / 2; |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | memcpy((void*)speech_buff, (const void*)resampled_buffers.data(), speech_len * sizeof(int16_t)); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | if(speech_data){ |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | float scale = 1; |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | scale = 32768.0f; |
| | | } |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | for (int32_t i = 0; i < speech_len; ++i) { |
| | | int16_t val = (int16_t)((resampled_buffers[2 * i + 1] << 8) | resampled_buffers[2 * i]); |
| | | speech_data[i] = (float)val / scale; |
| | | } |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | | frame_queue.push(frame); |
| | | |
| | | return true; |
| | | } |
| | | else |
| | | }else{ |
| | | return false; |
| | | } |
| | | |
| | | #endif |
| | | } |
| | | |
| | |
| | | bool Audio::LoadWav(const char *filename, int32_t* sampling_rate, bool resample) |
| | | { |
| | | WaveHeader header; |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | if (speech_buff != nullptr) { |
| | | free(speech_buff); |
| | | speech_buff = nullptr; |
| | | } |
| | | |
| | | offset = 0; |
| | |
| | | bool Audio::LoadWav2Char(const char *filename, int32_t* sampling_rate) |
| | | { |
| | | WaveHeader header; |
| | | if (speech_char != NULL) { |
| | | if (speech_char != nullptr) { |
| | | free(speech_char); |
| | | speech_char = nullptr; |
| | | } |
| | | offset = 0; |
| | | std::ifstream is(filename, std::ifstream::binary); |
| | |
| | | bool Audio::LoadWav(const char* buf, int n_file_len, int32_t* sampling_rate) |
| | | { |
| | | WaveHeader header; |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | if (speech_buff != nullptr) { |
| | | free(speech_buff); |
| | | speech_buff = nullptr; |
| | | } |
| | | offset = 0; |
| | | |
| | | std::memcpy(&header, buf, sizeof(header)); |
| | | |
| | |
| | | |
| | | bool Audio::LoadPcmwav(const char* buf, int n_buf_len, int32_t* sampling_rate) |
| | | { |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | free(speech_buff); |
| | | } |
| | | offset = 0; |
| | | |
| | | speech_len = n_buf_len / 2; |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t)); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | if(speech_data){ |
| | | float scale = 1; |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | scale = 32768.0f; |
| | | } |
| | | const uint8_t* byte_buf = reinterpret_cast<const uint8_t*>(buf); |
| | | for (int32_t i = 0; i < speech_len; ++i) { |
| | | int16_t val = (int16_t)((byte_buf[2 * i + 1] << 8) | byte_buf[2 * i]); |
| | | speech_data[i] = (float)val / scale; |
| | | } |
| | | |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | } |
| | | |
| | | //resample |
| | | if(*sampling_rate != dest_sample_rate){ |
| | | WavResample(*sampling_rate, speech_data, speech_len); |
| | |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | | frame_queue.push(frame); |
| | | |
| | | return true; |
| | | |
| | | } |
| | | else |
| | | }else{ |
| | | return false; |
| | | } |
| | | } |
| | | |
| | | bool Audio::LoadPcmwavOnline(const char* buf, int n_buf_len, int32_t* sampling_rate) |
| | | { |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | } |
| | | if (speech_buff != NULL) { |
| | | free(speech_buff); |
| | | } |
| | | if (speech_char != NULL) { |
| | | free(speech_char); |
| | | speech_data = nullptr; |
| | | } |
| | | |
| | | speech_len = n_buf_len / 2; |
| | | speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); |
| | | if (speech_buff) |
| | | { |
| | | memset(speech_buff, 0, sizeof(int16_t) * speech_len); |
| | | memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t)); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | memset(speech_data, 0, sizeof(float) * speech_len); |
| | | |
| | | speech_data = (float*)malloc(sizeof(float) * speech_len); |
| | | if(speech_data){ |
| | | float scale = 1; |
| | | if (data_type == 1) { |
| | | scale = 32768; |
| | | scale = 32768.0f; |
| | | } |
| | | const uint8_t* byte_buf = reinterpret_cast<const uint8_t*>(buf); |
| | | for (int32_t i = 0; i < speech_len; ++i) { |
| | | int16_t val = (int16_t)((byte_buf[2 * i + 1] << 8) | byte_buf[2 * i]); |
| | | speech_data[i] = (float)val / scale; |
| | | } |
| | | |
| | | for (int32_t i = 0; i != speech_len; ++i) { |
| | | speech_data[i] = (float)speech_buff[i] / scale; |
| | | } |
| | | |
| | | //resample |
| | | if(*sampling_rate != dest_sample_rate){ |
| | | WavResample(*sampling_rate, speech_data, speech_len); |
| | |
| | | |
| | | AudioFrame* frame = new AudioFrame(speech_len); |
| | | frame_queue.push(frame); |
| | | |
| | | return true; |
| | | |
| | | } |
| | | else |
| | | }else{ |
| | | return false; |
| | | } |
| | | } |
| | | |
| | | bool Audio::LoadPcmwav(const char* filename, int32_t* sampling_rate, bool resample) |
| | | { |
| | | if (speech_data != NULL) { |
| | | if (speech_data != nullptr) { |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | } |
| | | if (speech_buff != NULL) { |
| | | if (speech_buff != nullptr) { |
| | | free(speech_buff); |
| | | speech_buff = nullptr; |
| | | } |
| | | offset = 0; |
| | | |
| | |
| | | |
| | | bool Audio::LoadPcmwav2Char(const char* filename, int32_t* sampling_rate) |
| | | { |
| | | if (speech_char != NULL) { |
| | | if (speech_char != nullptr) { |
| | | free(speech_char); |
| | | speech_char = nullptr; |
| | | } |
| | | offset = 0; |
| | | |
| | |
| | | |
| | | bool Audio::LoadOthers2Char(const char* filename) |
| | | { |
| | | if (speech_char != NULL) { |
| | | if (speech_char != nullptr) { |
| | | free(speech_char); |
| | | speech_char = nullptr; |
| | | } |
| | | |
| | | FILE* fp; |
| | |
| | | new_data[tmp_off + i] = speech_data[ii]; |
| | | } |
| | | free(speech_data); |
| | | speech_data = nullptr; |
| | | speech_data = new_data; |
| | | speech_len = num_new_samples; |
| | | |
| | |
| | | frame_queue.pop(); |
| | | int sp_len = frame->GetLen(); |
| | | delete frame; |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | |
| | | std::vector<float> pcm_data(speech_data, speech_data+sp_len); |
| | | vector<std::vector<int>> vad_segments = (offline_stream->vad_handle)->Infer(pcm_data); |
| | |
| | | frame->SetStart(start); |
| | | frame->SetEnd(end); |
| | | frame_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | } |
| | | |
| | |
| | | frame_queue.pop(); |
| | | int sp_len = frame->GetLen(); |
| | | delete frame; |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | |
| | | std::vector<float> pcm_data(speech_data, speech_data+sp_len); |
| | | vad_segments = vad_obj->Infer(pcm_data, input_finished); |
| | |
| | | frame_queue.pop(); |
| | | int sp_len = frame->GetLen(); |
| | | delete frame; |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | |
| | | std::vector<float> pcm_data(speech_data, speech_data+sp_len); |
| | | vector<std::vector<int>> vad_segments = vad_obj->Infer(pcm_data, input_finished); |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * step); |
| | | memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float)); |
| | | asr_online_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | speech_start += step/seg_sample; |
| | | } |
| | | } |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * (end-start)); |
| | | memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float)); |
| | | asr_online_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | |
| | | if(asr_mode != ASR_ONLINE){ |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * (end-start)); |
| | | memcpy(frame->data, all_samples.data()+start-offset, (end-start)*sizeof(float)); |
| | | asr_offline_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | |
| | | speech_start = -1; |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * step); |
| | | memcpy(frame->data, all_samples.data()+start-offset, step*sizeof(float)); |
| | | asr_online_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | speech_start += step/seg_sample; |
| | | } |
| | | } |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * (end-offline_start)); |
| | | memcpy(frame->data, all_samples.data()+offline_start-offset, (end-offline_start)*sizeof(float)); |
| | | asr_offline_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | |
| | | if(asr_mode != ASR_OFFLINE){ |
| | |
| | | frame->data = (float*)malloc(sizeof(float) * step); |
| | | memcpy(frame->data, all_samples.data()+start-offset+sample_offset, step*sizeof(float)); |
| | | asr_online_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | }else{ |
| | | frame = new AudioFrame(0); |
| | |
| | | frame->global_start = speech_start; // in this case start >= end |
| | | frame->global_end = speech_end_i; |
| | | asr_online_queue.push(frame); |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | } |
| | | speech_start = -1; |
| | |
| | | std::vector<std::vector<int>> split_id_vec; |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | |
| | | LoadCfgFromYaml(cfg_file.c_str(), opt_); |
| | | while (getline(ifs_hws, line)) { |
| | |
| | | BuildGraph(split_id_vec, custom_weight); |
| | | ifs_hws.close(); |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Build bias lm takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | std::vector<std::vector<int>> split_id_vec; |
| | | |
| | | struct timeval start, end; |
| | | gettimeofday(&start, NULL); |
| | | gettimeofday(&start, nullptr); |
| | | opt_.incre_bias_ = inc_bias; |
| | | for (const pair<string, int>& kv : hws_map) { |
| | | float score = 1.0f; |
| | |
| | | } |
| | | BuildGraph(split_id_vec, custom_weight); |
| | | |
| | | gettimeofday(&end, NULL); |
| | | gettimeofday(&end, nullptr); |
| | | long seconds = (end.tv_sec - start.tv_sec); |
| | | long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); |
| | | LOG(INFO) << "Build bias lm takes " << (double)modle_init_micros / 1000000 << " s"; |
| | |
| | | } |
| | | |
| | | bool EncodeConverter::IsAllChineseCharactor(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | |
| | |
| | | } |
| | | |
| | | bool EncodeConverter::HasAlpha(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | for (size_t i = 0; i < ilen; i++) { |
| | |
| | | |
| | | |
| | | bool EncodeConverter::IsAllAlpha(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | for (size_t i = 0; i < ilen; i++) { |
| | |
| | | } |
| | | |
| | | bool EncodeConverter::IsAllAlphaAndPunct(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | bool flag1 = HasAlpha(pu8, ilen); |
| | |
| | | } |
| | | |
| | | bool EncodeConverter::IsAllAlphaAndDigit(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | bool flag1 = HasAlpha(pu8, ilen); |
| | |
| | | return true; |
| | | } |
| | | bool EncodeConverter::IsAllAlphaAndDigitAndBlank(const U8CHAR_T* pu8, size_t ilen) { |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | for (size_t i = 0; i < ilen; i++) { |
| | |
| | | bool EncodeConverter::NeedAddTailBlank(std::string str) { |
| | | U8CHAR_T *pu8 = (U8CHAR_T*)str.data(); |
| | | size_t ilen = str.size(); |
| | | if (pu8 == NULL || ilen <= 0) { |
| | | if (pu8 == nullptr || ilen <= 0) { |
| | | return false; |
| | | } |
| | | if (IsAllAlpha(pu8, ilen) || IsAllAlphaAndPunct(pu8, ilen) || IsAllAlphaAndDigit(pu8, ilen)) { |
| | |
| | | #ifdef _MSC_VER |
| | | // convert to the local ansi page |
| | | static std::string UTF8ToLocaleAnsi(const std::string& strUTF8) { |
| | | int len = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, NULL, 0); |
| | | int len = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, nullptr, 0); |
| | | unsigned short*wszGBK = new unsigned short[len + 1]; |
| | | memset(wszGBK, 0, len * 2 + 2); |
| | | MultiByteToWideChar(CP_UTF8, 0, (LPCCH)strUTF8.c_str(), -1, (LPWSTR)wszGBK, len); |
| | | |
| | | len = WideCharToMultiByte(CP_ACP, 0, (LPCWCH)wszGBK, -1, NULL, 0, NULL, NULL); |
| | | len = WideCharToMultiByte(CP_ACP, 0, (LPCWCH)wszGBK, -1, nullptr, 0, nullptr, nullptr); |
| | | char *szGBK = new char[len + 1]; |
| | | memset(szGBK, 0, len + 1); |
| | | WideCharToMultiByte(CP_ACP, 0, (LPCWCH)wszGBK, -1, szGBK, len, NULL, NULL); |
| | | WideCharToMultiByte(CP_ACP, 0, (LPCWCH)wszGBK, -1, szGBK, len, nullptr, nullptr); |
| | | std::string strTemp(szGBK); |
| | | delete[]szGBK; |
| | | delete[]wszGBK; |
| | |
| | | |
| | | audio->Split(vad_online_handle, chunk_len, input_finished, mode); |
| | | |
| | | funasr::AudioFrame* frame = NULL; |
| | | funasr::AudioFrame* frame = nullptr; |
| | | while(audio->FetchChunck(frame) > 0){ |
| | | string msg = ((funasr::ParaformerOnline*)asr_online_handle)->Forward(frame->data, frame->len, frame->is_final); |
| | | if(mode == ASR_ONLINE){ |
| | |
| | | }else if(mode == ASR_TWO_PASS){ |
| | | p_result->msg += msg; |
| | | } |
| | | if(frame != NULL){ |
| | | if(frame != nullptr){ |
| | | delete frame; |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | } |
| | | |
| | |
| | | if (!(p_result->stamp).empty()){ |
| | | p_result->stamp_sents = funasr::TimestampSentence(p_result->tpass_msg, p_result->stamp); |
| | | } |
| | | if(frame != NULL){ |
| | | if(frame != nullptr){ |
| | | delete frame; |
| | | frame = NULL; |
| | | frame = nullptr; |
| | | } |
| | | } |
| | | |
| | |
| | | |
| | | SetJiebaRes(jieba_dict_trie_, jieba_model_); |
| | | }else { |
| | | jieba_dict_trie_ = NULL; |
| | | jieba_model_ = NULL; |
| | | jieba_dict_trie_ = nullptr; |
| | | jieba_model_ = nullptr; |
| | | } |
| | | } |
| | | |
| | |
| | | download_model_dir="/workspace/models" |
| | | model_dir="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx" |
| | | vad_dir="damo/speech_fsmn_vad_zh-cn-16k-common-onnx" |
| | | punc_dir="damo/punc_ct-transformer_cn-en-common-vocab471067-large-onnx" |
| | | itn_dir="thuduj12/fst_itn_zh" |
| | | lm_dir="damo/speech_ngram_lm_zh-cn-ai-wesp-fst" |
| | | punc_dir="" |
| | | itn_dir="" |
| | | lm_dir="" |
| | | port=10095 |
| | | certfile="../../../ssl_key/server.crt" |
| | | keyfile="../../../ssl_key/server.key" |
| | | hotword="../../hotwords.txt" |
| | | # set decoder_thread_num |
| | | decoder_thread_num=$(cat /proc/cpuinfo | grep "processor"|wc -l) || { echo "Get cpuinfo failed. Set decoder_thread_num = 32"; decoder_thread_num=32; } |
| | | decoder_thread_num=8 |
| | | multiple_io=16 |
| | | io_thread_num=$(( (decoder_thread_num + multiple_io - 1) / multiple_io )) |
| | | model_thread_num=1 |
| | | model_thread_num=5 |
| | | |
| | | . ../egs/aishell/transformer/utils/parse_options.sh || exit 1; |
| | | . ./tools/utils/parse_options.sh || exit 1; |
| | | |
| | | if [ -z "$certfile" ] || [ "$certfile" = "0" ]; then |
| | | certfile="" |
| | |
| | | io_thread_num=$(( (decoder_thread_num + multiple_io - 1) / multiple_io )) |
| | | model_thread_num=1 |
| | | |
| | | . ../egs/aishell/transformer/utils/parse_options.sh || exit 1; |
| | | . ./tools/utils/parse_options.sh || exit 1; |
| | | |
| | | if [ -z "$certfile" ] || [ "$certfile" = "0" ]; then |
| | | certfile="" |
| New file |
| | |
| | | #!/usr/bin/env bash |
| | | |
| | | # Copyright 2012 Johns Hopkins University (Author: Daniel Povey); |
| | | # Arnab Ghoshal, Karel Vesely |
| | | |
| | | # Licensed under the Apache License, Version 2.0 (the "License"); |
| | | # you may not use this file except in compliance with the License. |
| | | # You may obtain a copy of the License at |
| | | # |
| | | # http://www.apache.org/licenses/LICENSE-2.0 |
| | | # |
| | | # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| | | # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED |
| | | # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, |
| | | # MERCHANTABLITY OR NON-INFRINGEMENT. |
| | | # See the Apache 2 License for the specific language governing permissions and |
| | | # limitations under the License. |
| | | |
| | | |
| | | # Parse command-line options. |
| | | # To be sourced by another script (as in ". parse_options.sh"). |
| | | # Option format is: --option-name arg |
| | | # and shell variable "option_name" gets set to value "arg." |
| | | # The exception is --help, which takes no arguments, but prints the |
| | | # $help_message variable (if defined). |
| | | |
| | | |
| | | ### |
| | | ### The --config file options have lower priority to command line |
| | | ### options, so we need to import them first... |
| | | ### |
| | | |
| | | # Now import all the configs specified by command-line, in left-to-right order |
| | | for ((argpos=1; argpos<$#; argpos++)); do |
| | | if [ "${!argpos}" == "--config" ]; then |
| | | argpos_plus1=$((argpos+1)) |
| | | config=${!argpos_plus1} |
| | | [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1 |
| | | . $config # source the config file. |
| | | fi |
| | | done |
| | | |
| | | |
| | | ### |
| | | ### Now we process the command line options |
| | | ### |
| | | while true; do |
| | | [ -z "${1:-}" ] && break; # break if there are no arguments |
| | | case "$1" in |
| | | # If the enclosing script is called with --help option, print the help |
| | | # message and exit. Scripts should put help messages in $help_message |
| | | --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2; |
| | | else printf "$help_message\n" 1>&2 ; fi; |
| | | exit 0 ;; |
| | | --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" |
| | | exit 1 ;; |
| | | # If the first command-line argument begins with "--" (e.g. --foo-bar), |
| | | # then work out the variable name as $name, which will equal "foo_bar". |
| | | --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`; |
| | | # Next we test whether the variable in question is undefned-- if so it's |
| | | # an invalid option and we die. Note: $0 evaluates to the name of the |
| | | # enclosing script. |
| | | # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar |
| | | # is undefined. We then have to wrap this test inside "eval" because |
| | | # foo_bar is itself inside a variable ($name). |
| | | eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1; |
| | | |
| | | oldval="`eval echo \\$$name`"; |
| | | # Work out whether we seem to be expecting a Boolean argument. |
| | | if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then |
| | | was_bool=true; |
| | | else |
| | | was_bool=false; |
| | | fi |
| | | |
| | | # Set the variable to the right value-- the escaped quotes make it work if |
| | | # the option had spaces, like --cmd "queue.pl -sync y" |
| | | eval $name=\"$2\"; |
| | | |
| | | # Check that Boolean-valued arguments are really Boolean. |
| | | if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then |
| | | echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2 |
| | | exit 1; |
| | | fi |
| | | shift 2; |
| | | ;; |
| | | *) break; |
| | | esac |
| | | done |
| | | |
| | | |
| | | # Check for an empty argument to the --cmd option, which can easily occur as a |
| | | # result of scripting errors. |
| | | [ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1; |
| | | |
| | | |
| | | true; # so this script returns exit code 0. |
| | |
| | | } |
| | | |
| | | // hotwords: fst/nn |
| | | if(msg_data->hotwords_embedding == NULL){ |
| | | if(msg_data->hotwords_embedding == nullptr){ |
| | | std::unordered_map<std::string, int> merged_hws_map; |
| | | std::string nn_hotwords = ""; |
| | | |
| | |
| | | msg_data->msg["audio_fs"] = jsonresult["audio_fs"]; |
| | | } |
| | | if (jsonresult.contains("chunk_size")) { |
| | | if (msg_data->tpass_online_handle == NULL) { |
| | | if (msg_data->tpass_online_handle == nullptr) { |
| | | std::vector<int> chunk_size_vec = |
| | | jsonresult["chunk_size"].get<std::vector<int>>(); |
| | | // check chunk_size_vec |
| | |
| | | if ((jsonresult["is_speaking"] == false || |
| | | jsonresult["is_finished"] == true) && |
| | | msg_data->msg["is_eof"] != true && |
| | | msg_data->hotwords_embedding != NULL) { |
| | | msg_data->hotwords_embedding != nullptr) { |
| | | LOG(INFO) << "client done"; |
| | | |
| | | // if it is in final message, post the sample_data to decode |
| | |
| | | |
| | | try{ |
| | | // post to decode |
| | | if (msg_data->msg["is_eof"] != true && msg_data->hotwords_embedding != NULL) { |
| | | if (msg_data->msg["is_eof"] != true && msg_data->hotwords_embedding != nullptr) { |
| | | std::vector<std::vector<float>> hotwords_embedding_(*(msg_data->hotwords_embedding)); |
| | | msg_data->strand_->post( |
| | | std::bind(&WebSocketServer::do_decoder, this, |
| | |
| | | nlohmann::json msg; |
| | | std::shared_ptr<std::vector<char>> samples; |
| | | std::shared_ptr<std::vector<std::vector<std::string>>> punc_cache; |
| | | std::shared_ptr<std::vector<std::vector<float>>> hotwords_embedding=NULL; |
| | | std::shared_ptr<std::vector<std::vector<float>>> hotwords_embedding=nullptr; |
| | | std::shared_ptr<websocketpp::lib::mutex> thread_lock; // lock for each connection |
| | | FUNASR_HANDLE tpass_online_handle=NULL; |
| | | FUNASR_HANDLE tpass_online_handle=nullptr; |
| | | std::string online_res = ""; |
| | | std::string tpass_res = ""; |
| | | std::shared_ptr<asio::io_context::strand> strand_; // for data execute in order |
| | | FUNASR_DEC_HANDLE decoder_handle=NULL; |
| | | FUNASR_DEC_HANDLE decoder_handle=nullptr; |
| | | } FUNASR_MESSAGE; |
| | | |
| | | // See https://wiki.mozilla.org/Security/Server_Side_TLS for more details about |
| | |
| | | asio::io_context& io_decoder_; // threads for asr decoder |
| | | // std::ofstream fout; |
| | | // FUNASR_HANDLE asr_handle; // asr engine handle |
| | | FUNASR_HANDLE tpass_handle=NULL; |
| | | FUNASR_HANDLE tpass_handle=nullptr; |
| | | bool isonline = true; // online or offline engine, now only support offline |
| | | bool is_ssl = true; |
| | | server* server_; // websocket server |
| | |
| | | std::string stamp_sents=""; |
| | | try{ |
| | | FUNASR_RESULT Result = FunOfflineInferBuffer( |
| | | asr_handle, buffer.data(), buffer.size(), RASR_NONE, NULL, |
| | | asr_handle, buffer.data(), buffer.size(), RASR_NONE, nullptr, |
| | | hotwords_embedding, audio_fs, wav_format, itn, decoder_handle); |
| | | if (Result != NULL){ |
| | | if (Result != nullptr){ |
| | | asr_result = FunASRGetResult(Result, 0); // get decode result |
| | | stamp_res = FunASRGetStamp(Result); |
| | | stamp_sents = FunASRGetStampSents(Result); |
| | | FunASRFreeResult(Result); |
| | | } else{ |
| | | LOG(ERROR) << "FUNASR_RESULT is NULL."; |
| | | std::this_thread::sleep_for(std::chrono::milliseconds(20)); |
| | | LOG(ERROR) << "FUNASR_RESULT is nullptr."; |
| | | } |
| | | }catch (std::exception const& e) { |
| | | LOG(ERROR) << e.what(); |
| | |
| | | } |
| | | |
| | | // hotwords: fst/nn |
| | | if(msg_data->hotwords_embedding == NULL){ |
| | | if(msg_data->hotwords_embedding == nullptr){ |
| | | std::unordered_map<std::string, int> merged_hws_map; |
| | | std::string nn_hotwords = ""; |
| | | |
| | |
| | | if ((jsonresult["is_speaking"] == false || |
| | | jsonresult["is_finished"] == true) && |
| | | msg_data->msg["is_eof"] != true && |
| | | msg_data->hotwords_embedding != NULL) { |
| | | msg_data->hotwords_embedding != nullptr) { |
| | | LOG(INFO) << "client done"; |
| | | // for offline, send all receive data to decoder engine |
| | | std::vector<std::vector<float>> hotwords_embedding_(*(msg_data->hotwords_embedding)); |
| | |
| | | typedef struct { |
| | | nlohmann::json msg; |
| | | std::shared_ptr<std::vector<char>> samples; |
| | | std::shared_ptr<std::vector<std::vector<float>>> hotwords_embedding=NULL; |
| | | std::shared_ptr<std::vector<std::vector<float>>> hotwords_embedding=nullptr; |
| | | std::shared_ptr<websocketpp::lib::mutex> thread_lock; // lock for each connection |
| | | FUNASR_DEC_HANDLE decoder_handle=NULL; |
| | | FUNASR_DEC_HANDLE decoder_handle=nullptr; |
| | | } FUNASR_MESSAGE; |
| | | |
| | | // See https://wiki.mozilla.org/Security/Server_Side_TLS for more details about |