python/FunASR-XL.git

parent: 48ee4dd5 | 补丁 | 提交 | ignore whitespace

Merge pull request #493 from alibaba-damo-academy/main

yhliang

2023-05-11 d788b6d5a61df918b65b6914f87a7482778df5f5

Merge pull request #493 from alibaba-damo-academy/main

update dev_lyh

32个文件已修改

3个文件已删除

3个文件已添加

1 文件已重命名

	egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py	补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh	103 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh	103 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh	103 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/TEMPLATE/README.md	6 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/TEMPLATE/infer.py	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/grpc/paraformer-server.cc	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/include/funasrruntime.h	28 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/include/vad-model.h	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/readme.md	53 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/fsmn-vad.cpp	5 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/fsmn-vad.h	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp	32 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp	31 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp	31 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/funasrruntime.cpp	201 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/util.cpp	9 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/util.h	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/vad-model.cpp	8 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/websocket/websocketsrv.cpp	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/utils/timestamp_tools.py	31 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py


 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md

@@ -1 +1 @@
../TEMPLATE/README.md
../../TEMPLATE/README.md

 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh

File was deleted

 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh

New file
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
#
# Decode a test set with a ModelScope Paraformer model and score the result.
#
# Stages (selected with stage / stop_stage):
#   1  run infer.py in parallel over nj splits of ${data_dir}/wav.scp and merge
#      the per-job outputs into ${output_dir}/1best_recog
#   2  score the merged hypothesis with utils/compute_wer.py
#   3  SpeechIO TIOBE text normalization followed by utils/error_rate_zh
#
# The variables below are defaults; utils/parse_options.sh is sourced right
# after them (presumably to let "--name value" arguments override each one).

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true    # "true": one decoding job per GPU in gpuid_list; anything else: CPU decoding
gpuid_list="0,1"      # comma-separated GPU ids, e.g. gpuid_list="0,1"
njob=64               # number of parallel jobs for CPU decoding (used only when gpu_inference is not "true")
checkpoint_dir=
checkpoint_name="valid.cer_ctc.ave.pb"

. utils/parse_options.sh || exit 1;

# Compare the option as a string. The previous form executed the option value
# as a command, which only worked by accident for the literals true/false.
if [ "${gpu_inference}" == "true" ]; then
    # One job per listed GPU.
    nj=$(echo "${gpuid_list}" | awk -F "," '{print NF}')
else
    nj=${njob}
    batch_size=1
    # Give every CPU job the pseudo GPU id -1.
    gpuid_list=""
    for JOB in $(seq "${nj}"); do
        gpuid_list="${gpuid_list}-1,"
    done
fi

# Split the input list into one wav.<JOB>.scp per job.
mkdir -p "${output_dir}/split"
split_scps=""
for JOB in $(seq "${nj}"); do
    split_scps="${split_scps} ${output_dir}/split/wav.${JOB}.scp"
done
# split_scps is intentionally unquoted: it must expand to one argument per split file.
perl utils/split_scp.pl "${data_dir}/wav.scp" ${split_scps}

# Optionally decode with a local checkpoint instead of the published model.
if [ -n "${checkpoint_dir}" ]; then
  python utils/prepare_checkpoint.py "${model}" "${checkpoint_dir}" "${checkpoint_name}"
  model="${checkpoint_dir}/${model}"
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Decoding ..."
    # Intentionally unquoted: turn the comma-separated list into array elements.
    gpuid_list_array=(${gpuid_list//,/ })
    pids=""
    for JOB in $(seq "${nj}"); do
        {
        id=$((JOB-1))
        gpuid=${gpuid_list_array[$id]}
        mkdir -p "${output_dir}/output.${JOB}"
        python infer.py \
            --model "${model}" \
            --audio_in "${output_dir}/split/wav.${JOB}.scp" \
            --output_dir "${output_dir}/output.${JOB}" \
            --batch_size "${batch_size}" \
            --gpuid "${gpuid}"
        }&
        pids="${pids} $!"
    done

    # A bare `wait` always succeeds; wait on each job so a failed decoder is
    # reported instead of silently producing a partial result.
    failed=0
    for pid in ${pids}; do
        wait "${pid}" || failed=1
    done
    if [ "${failed}" -ne 0 ]; then
        echo "$0: at least one decoding job failed" >&2
        exit 1
    fi

    # Merge the per-job outputs, sorted by utterance key.
    mkdir -p "${output_dir}/1best_recog"
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
          for i in $(seq "${nj}"); do
              cat "${output_dir}/output.${i}/1best_recog/${f}"
          done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Computing WER ..."
    cp "${output_dir}/1best_recog/text" "${output_dir}/1best_recog/text.proc"
    cp "${data_dir}/text" "${output_dir}/1best_recog/text.ref"
    python utils/compute_wer.py "${output_dir}/1best_recog/text.ref" "${output_dir}/1best_recog/text.proc" "${output_dir}/1best_recog/text.cer"
    tail -n 3 "${output_dir}/1best_recog/text.cer"
fi

# Stage 3 reads text.proc, which is written by stage 2.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${data_dir}/text" \
        "${output_dir}/1best_recog/ref.txt"

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${output_dir}/1best_recog/text.proc" \
        "${output_dir}/1best_recog/rec.txt"
    # Drop lines that end in a tab (key with an empty hypothesis).
    grep -v $'\t$' "${output_dir}/1best_recog/rec.txt" > "${output_dir}/1best_recog/rec_non_empty.txt"

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref "${output_dir}/1best_recog/ref.txt" \
        --hyp "${output_dir}/1best_recog/rec_non_empty.txt" \
        "${output_dir}/1best_recog/DETAILS.txt" | tee "${output_dir}/1best_recog/RESULTS.txt"
    rm -rf "${output_dir}/1best_recog/rec.txt" "${output_dir}/1best_recog/rec_non_empty.txt"
fi


 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md

@@ -1 +1 @@
../TEMPLATE/README.md
../../TEMPLATE/README.md

 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py

@@ -1 +1 @@
../TEMPLATE/infer.py
../../TEMPLATE/infer.py

 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh

File was deleted

 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh

New file
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
#
# Decode a test set with a ModelScope Paraformer model and score the result.
#
# Stages (selected with stage / stop_stage):
#   1  run infer.py in parallel over nj splits of ${data_dir}/wav.scp and merge
#      the per-job outputs into ${output_dir}/1best_recog
#   2  score the merged hypothesis with utils/compute_wer.py
#   3  SpeechIO TIOBE text normalization followed by utils/error_rate_zh
#
# The variables below are defaults; utils/parse_options.sh is sourced right
# after them (presumably to let "--name value" arguments override each one).

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true    # "true": one decoding job per GPU in gpuid_list; anything else: CPU decoding
gpuid_list="0,1"      # comma-separated GPU ids, e.g. gpuid_list="0,1"
njob=64               # number of parallel jobs for CPU decoding (used only when gpu_inference is not "true")
checkpoint_dir=
checkpoint_name="valid.cer_ctc.ave.pb"

. utils/parse_options.sh || exit 1;

# Compare the option as a string. The previous form executed the option value
# as a command, which only worked by accident for the literals true/false.
if [ "${gpu_inference}" == "true" ]; then
    # One job per listed GPU.
    nj=$(echo "${gpuid_list}" | awk -F "," '{print NF}')
else
    nj=${njob}
    batch_size=1
    # Give every CPU job the pseudo GPU id -1.
    gpuid_list=""
    for JOB in $(seq "${nj}"); do
        gpuid_list="${gpuid_list}-1,"
    done
fi

# Split the input list into one wav.<JOB>.scp per job.
mkdir -p "${output_dir}/split"
split_scps=""
for JOB in $(seq "${nj}"); do
    split_scps="${split_scps} ${output_dir}/split/wav.${JOB}.scp"
done
# split_scps is intentionally unquoted: it must expand to one argument per split file.
perl utils/split_scp.pl "${data_dir}/wav.scp" ${split_scps}

# Optionally decode with a local checkpoint instead of the published model.
if [ -n "${checkpoint_dir}" ]; then
  python utils/prepare_checkpoint.py "${model}" "${checkpoint_dir}" "${checkpoint_name}"
  model="${checkpoint_dir}/${model}"
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Decoding ..."
    # Intentionally unquoted: turn the comma-separated list into array elements.
    gpuid_list_array=(${gpuid_list//,/ })
    pids=""
    for JOB in $(seq "${nj}"); do
        {
        id=$((JOB-1))
        gpuid=${gpuid_list_array[$id]}
        mkdir -p "${output_dir}/output.${JOB}"
        python infer.py \
            --model "${model}" \
            --audio_in "${output_dir}/split/wav.${JOB}.scp" \
            --output_dir "${output_dir}/output.${JOB}" \
            --batch_size "${batch_size}" \
            --gpuid "${gpuid}"
        }&
        pids="${pids} $!"
    done

    # A bare `wait` always succeeds; wait on each job so a failed decoder is
    # reported instead of silently producing a partial result.
    failed=0
    for pid in ${pids}; do
        wait "${pid}" || failed=1
    done
    if [ "${failed}" -ne 0 ]; then
        echo "$0: at least one decoding job failed" >&2
        exit 1
    fi

    # Merge the per-job outputs, sorted by utterance key.
    mkdir -p "${output_dir}/1best_recog"
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
          for i in $(seq "${nj}"); do
              cat "${output_dir}/output.${i}/1best_recog/${f}"
          done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Computing WER ..."
    cp "${output_dir}/1best_recog/text" "${output_dir}/1best_recog/text.proc"
    cp "${data_dir}/text" "${output_dir}/1best_recog/text.ref"
    python utils/compute_wer.py "${output_dir}/1best_recog/text.ref" "${output_dir}/1best_recog/text.proc" "${output_dir}/1best_recog/text.cer"
    tail -n 3 "${output_dir}/1best_recog/text.cer"
fi

# Stage 3 reads text.proc, which is written by stage 2.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${data_dir}/text" \
        "${output_dir}/1best_recog/ref.txt"

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${output_dir}/1best_recog/text.proc" \
        "${output_dir}/1best_recog/rec.txt"
    # Drop lines that end in a tab (key with an empty hypothesis).
    grep -v $'\t$' "${output_dir}/1best_recog/rec.txt" > "${output_dir}/1best_recog/rec_non_empty.txt"

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref "${output_dir}/1best_recog/ref.txt" \
        --hyp "${output_dir}/1best_recog/rec_non_empty.txt" \
        "${output_dir}/1best_recog/DETAILS.txt" | tee "${output_dir}/1best_recog/RESULTS.txt"
    rm -rf "${output_dir}/1best_recog/rec.txt" "${output_dir}/1best_recog/rec_non_empty.txt"
fi


 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md

@@ -1 +1 @@
../TEMPLATE/README.md
../../TEMPLATE/README.md

 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py

@@ -1 +1 @@
../TEMPLATE/infer.py
../../TEMPLATE/infer.py

 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh

File was deleted

 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh

New file
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
#
# Decode a test set with a ModelScope Paraformer model and score the result.
#
# Stages (selected with stage / stop_stage):
#   1  run infer.py in parallel over nj splits of ${data_dir}/wav.scp and merge
#      the per-job outputs into ${output_dir}/1best_recog
#   2  score the merged hypothesis with utils/compute_wer.py
#   3  SpeechIO TIOBE text normalization followed by utils/error_rate_zh
#
# The variables below are defaults; utils/parse_options.sh is sourced right
# after them (presumably to let "--name value" arguments override each one).

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true    # "true": one decoding job per GPU in gpuid_list; anything else: CPU decoding
gpuid_list="0,1"      # comma-separated GPU ids, e.g. gpuid_list="0,1"
njob=64               # number of parallel jobs for CPU decoding (used only when gpu_inference is not "true")
checkpoint_dir=
checkpoint_name="valid.cer_ctc.ave.pb"

. utils/parse_options.sh || exit 1;

# Compare the option as a string. The previous form executed the option value
# as a command, which only worked by accident for the literals true/false.
if [ "${gpu_inference}" == "true" ]; then
    # One job per listed GPU.
    nj=$(echo "${gpuid_list}" | awk -F "," '{print NF}')
else
    nj=${njob}
    batch_size=1
    # Give every CPU job the pseudo GPU id -1.
    gpuid_list=""
    for JOB in $(seq "${nj}"); do
        gpuid_list="${gpuid_list}-1,"
    done
fi

# Split the input list into one wav.<JOB>.scp per job.
mkdir -p "${output_dir}/split"
split_scps=""
for JOB in $(seq "${nj}"); do
    split_scps="${split_scps} ${output_dir}/split/wav.${JOB}.scp"
done
# split_scps is intentionally unquoted: it must expand to one argument per split file.
perl utils/split_scp.pl "${data_dir}/wav.scp" ${split_scps}

# Optionally decode with a local checkpoint instead of the published model.
if [ -n "${checkpoint_dir}" ]; then
  python utils/prepare_checkpoint.py "${model}" "${checkpoint_dir}" "${checkpoint_name}"
  model="${checkpoint_dir}/${model}"
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Decoding ..."
    # Intentionally unquoted: turn the comma-separated list into array elements.
    gpuid_list_array=(${gpuid_list//,/ })
    pids=""
    for JOB in $(seq "${nj}"); do
        {
        id=$((JOB-1))
        gpuid=${gpuid_list_array[$id]}
        mkdir -p "${output_dir}/output.${JOB}"
        python infer.py \
            --model "${model}" \
            --audio_in "${output_dir}/split/wav.${JOB}.scp" \
            --output_dir "${output_dir}/output.${JOB}" \
            --batch_size "${batch_size}" \
            --gpuid "${gpuid}"
        }&
        pids="${pids} $!"
    done

    # A bare `wait` always succeeds; wait on each job so a failed decoder is
    # reported instead of silently producing a partial result.
    failed=0
    for pid in ${pids}; do
        wait "${pid}" || failed=1
    done
    if [ "${failed}" -ne 0 ]; then
        echo "$0: at least one decoding job failed" >&2
        exit 1
    fi

    # Merge the per-job outputs, sorted by utterance key.
    mkdir -p "${output_dir}/1best_recog"
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
          for i in $(seq "${nj}"); do
              cat "${output_dir}/output.${i}/1best_recog/${f}"
          done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Computing WER ..."
    cp "${output_dir}/1best_recog/text" "${output_dir}/1best_recog/text.proc"
    cp "${data_dir}/text" "${output_dir}/1best_recog/text.ref"
    python utils/compute_wer.py "${output_dir}/1best_recog/text.ref" "${output_dir}/1best_recog/text.proc" "${output_dir}/1best_recog/text.cer"
    tail -n 3 "${output_dir}/1best_recog/text.cer"
fi

# Stage 3 reads text.proc, which is written by stage 2.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${data_dir}/text" \
        "${output_dir}/1best_recog/ref.txt"

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${output_dir}/1best_recog/text.proc" \
        "${output_dir}/1best_recog/rec.txt"
    # Drop lines that end in a tab (key with an empty hypothesis).
    grep -v $'\t$' "${output_dir}/1best_recog/rec.txt" > "${output_dir}/1best_recog/rec_non_empty.txt"

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref "${output_dir}/1best_recog/ref.txt" \
        --hyp "${output_dir}/1best_recog/rec_non_empty.txt" \
        "${output_dir}/1best_recog/DETAILS.txt" | tee "${output_dir}/1best_recog/RESULTS.txt"
    rm -rf "${output_dir}/1best_recog/rec.txt" "${output_dir}/1best_recog/rec_non_empty.txt"
fi


 egs_modelscope/vad/TEMPLATE/README.md

@@ -83,7 +83,7 @@
#### Decode with multi GPUs:
```shell
    bash infer.sh \
    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
    --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
    --data_dir "./data/test" \
    --output_dir "./results" \
    --batch_size 1 \
@@ -93,11 +93,11 @@
#### Decode with multi-thread CPUs:
```shell
    bash infer.sh \
    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
    --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
    --data_dir "./data/test" \
    --output_dir "./results" \
    --gpu_inference false \
    --njob 1
    --njob 64
```

## Finetune with pipeline

 egs_modelscope/vad/TEMPLATE/infer.py

@@ -16,10 +16,10 @@

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
    parser.add_argument('--model', type=str, default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch")
    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
    parser.add_argument('--output_dir', type=str, default="./results/")
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--gpuid', type=str, default="0")
    args = parser.parse_args()
    modelscope_infer(args)

 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md

@@ -1 +1 @@
../../TEMPLATE/README.md
../TEMPLATE/README.md

 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py

@@ -7,7 +7,7 @@
    inference_pipeline = pipeline(
        task=Tasks.voice_activity_detection,
        model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
        model_revision='v1.2.0',
        model_revision=None,
        output_dir=output_dir,
        batch_size=1,
    )

 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py

@@ -1 +1 @@
../../TEMPLATE/infer.py
../TEMPLATE/infer.py

 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh

@@ -1 +1 @@
../../TEMPLATE/infer.sh
../TEMPLATE/infer.sh

 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md

@@ -1 +1 @@
../../TEMPLATE/README.md
../TEMPLATE/README.md

 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py

@@ -7,7 +7,7 @@
    inference_pipeline = pipeline(
        task=Tasks.voice_activity_detection,
        model="damo/speech_fsmn_vad_zh-cn-8k-common",
        model_revision='v1.2.0',
        model_revision=None,
        output_dir=output_dir,
        batch_size=1,
    )

 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py

@@ -11,7 +11,7 @@
    inference_pipeline = pipeline(
        task=Tasks.voice_activity_detection,
        model="damo/speech_fsmn_vad_zh-cn-8k-common",
        model_revision='v1.2.0',
        model_revision=None,
        output_dir=output_dir,
        batch_size=1,
        mode='online',

 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py

@@ -1 +1 @@
../../TEMPLATE/infer.py
../TEMPLATE/infer.py

 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh

@@ -1 +1 @@
../../TEMPLATE/infer.sh
../TEMPLATE/infer.sh

 funasr/runtime/grpc/paraformer-server.cc

@@ -137,7 +137,7 @@
                    stream->Write(res);
                }
                else {
                    FUNASR_RESULT Result= FunOfflineRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, 16000, RASR_NONE, NULL);
                    FUNASR_RESULT Result= FunOfflineInferBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL, 16000);
                    std::string asr_result = ((FUNASR_RECOG_RESULT*)Result)->msg;

                    auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

 funasr/runtime/onnxruntime/include/funasrruntime.h

@@ -46,15 +46,20 @@
    FUNASR_MODEL_PARAFORMER = 3,
}FUNASR_MODEL_TYPE;

typedef enum
{
 FSMN_VAD_OFFLINE=0,
 FSMN_VAD_ONLINE = 1,
}FSMN_VAD_MODE;

typedef void (* QM_CALLBACK)(int cur_step, int n_total); // n_total: total steps; cur_step: Current Step.
    
// ASR
_FUNASRAPI FUNASR_HANDLE      FunASRInit(std::map<std::string, std::string>& model_path, int thread_num);

_FUNASRAPI FUNASR_RESULT    FunASRRecogBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback);
_FUNASRAPI FUNASR_RESULT    FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback);
_FUNASRAPI FUNASR_RESULT    FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* sz_filename, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback);
_FUNASRAPI FUNASR_RESULT    FunASRRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback);
// buffer
_FUNASRAPI FUNASR_RESULT    FunASRInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
// file, support wav & pcm
_FUNASRAPI FUNASR_RESULT    FunASRInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);

_FUNASRAPI const char*    FunASRGetResult(FUNASR_RESULT result,int n_index);
_FUNASRAPI const int    FunASRGetRetNumber(FUNASR_RESULT result);
@@ -63,9 +68,12 @@
_FUNASRAPI const float    FunASRGetRetSnippetTime(FUNASR_RESULT result);

// VAD
_FUNASRAPI FUNASR_HANDLE      FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num);
_FUNASRAPI FUNASR_HANDLE      FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num, FSMN_VAD_MODE mode=FSMN_VAD_OFFLINE);
// buffer
_FUNASRAPI FUNASR_RESULT    FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
// file, support wav & pcm
_FUNASRAPI FUNASR_RESULT    FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);

_FUNASRAPI FUNASR_RESULT    FsmnVadWavFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback);
_FUNASRAPI std::vector<std::vector<int>>*    FsmnVadGetResult(FUNASR_RESULT result,int n_index);
_FUNASRAPI void                 FsmnVadFreeResult(FUNASR_RESULT result);
_FUNASRAPI void                FsmnVadUninit(FUNASR_HANDLE handle);
@@ -78,8 +86,10 @@

//OfflineStream
_FUNASRAPI FUNASR_HANDLE      FunOfflineInit(std::map<std::string, std::string>& model_path, int thread_num);
_FUNASRAPI FUNASR_RESULT     FunOfflineRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback);
_FUNASRAPI FUNASR_RESULT    FunOfflineRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback);
// buffer
_FUNASRAPI FUNASR_RESULT    FunOfflineInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
// file, support wav & pcm
_FUNASRAPI FUNASR_RESULT    FunOfflineInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
_FUNASRAPI void                FunOfflineUninit(FUNASR_HANDLE handle);

#ifdef __cplusplus 

 funasr/runtime/onnxruntime/include/vad-model.h

@@ -16,7 +16,7 @@
    virtual void LoadConfigFromYaml(const char* filename)=0;
    virtual void FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
                    const std::vector<float> &waves)=0;
    virtual std::vector<std::vector<float>> &LfrCmvn(std::vector<std::vector<float>> &vad_feats)=0;
    virtual void LfrCmvn(std::vector<std::vector<float>> &vad_feats)=0;
    virtual void Forward(
            const std::vector<std::vector<float>> &chunk_feats,
            std::vector<std::vector<float>> *out_prob)=0;
@@ -24,6 +24,6 @@
    virtual void InitCache()=0;
};

VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num);
VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num, int mode);
} // namespace funasr
#endif

 funasr/runtime/onnxruntime/readme.md

@@ -43,11 +43,10 @@

### funasr-onnx-offline
```shell
./funasr-onnx-offline     [--wav-scp <string>] [--wav-path <string>]
                          [--punc-quant <string>] [--punc-dir <string>]
                          [--vad-quant <string>] [--vad-dir <string>]
                          [--quantize <string>] --model-dir <string>
                          [--] [--version] [-h]
./funasr-onnx-offline     --model-dir <string> [--quantize <string>]
                          [--vad-dir <string>] [--vad-quant <string>]
                          [--punc-dir <string>] [--punc-quant <string>]
                          --wav-path <string> [--] [--version] [-h]
Where:
   --model-dir <string>
     (required)  the asr model path, which contains model.onnx, config.yaml, am.mvn
@@ -64,12 +63,13 @@
   --punc-quant <string>
     false (Default), load the model of model.onnx in punc_dir. If set true, load the model of model_quant.onnx in punc_dir

   --wav-scp <string>
     wave scp path
   --wav-path <string>
     wave file path
     (required)  the input could be: 
      wav_path, e.g.: asr_example.wav;
      pcm_path, e.g.: asr_example.pcm; 
      wav.scp, kaldi style wav list (wav_id \t wav_path)
  
   Required: --model-dir <string>
   Required: --model-dir <string> --wav-path <string>
   To use VAD, also pass: --vad-dir <string>
   To use punctuation restoration, also pass: --punc-dir <string>

@@ -84,20 +84,20 @@

### funasr-onnx-offline-vad
```shell
./funasr-onnx-offline-vad     [--wav-scp <string>] [--wav-path <string>]
                              [--quantize <string>] --model-dir <string>
                              [--] [--version] [-h]
./funasr-onnx-offline-vad     --model-dir <string> [--quantize <string>]
                              --wav-path <string> [--] [--version] [-h]
Where:
   --model-dir <string>
     (required)  the vad model path, which contains model.onnx, vad.yaml, vad.mvn
   --quantize <string>
     false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir
   --wav-scp <string>
     wave scp path
   --wav-path <string>
     wave file path
     (required)  the input could be: 
      wav_path, e.g.: asr_example.wav;
      pcm_path, e.g.: asr_example.pcm; 
      wav.scp, kaldi style wav list (wav_id \t wav_path)

   Required: --model-dir <string>
   Required: --model-dir <string> --wav-path <string>

For example:
./funasr-onnx-offline-vad \
@@ -107,17 +107,17 @@

### funasr-onnx-offline-punc
```shell
./funasr-onnx-offline-punc    [--txt-path <string>] [--quantize <string>]
                               --model-dir <string> [--] [--version] [-h]
./funasr-onnx-offline-punc    --model-dir <string> [--quantize <string>]
                              --txt-path <string> [--] [--version] [-h]
Where:
   --model-dir <string>
     (required)  the punc model path, which contains model.onnx, punc.yaml
   --quantize <string>
     false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir
   --txt-path <string>
     txt file path, one sentence per line
     (required)  txt file path, one sentence per line

   Required: --model-dir <string>
   Required: --model-dir <string> --txt-path <string>

For example:
./funasr-onnx-offline-punc \
@@ -126,8 +126,8 @@
```
### funasr-onnx-offline-rtf
```shell
./funasr-onnx-offline-rtf     --thread-num <int32_t> --wav-scp <string>
                              [--quantize <string>] --model-dir <string>
./funasr-onnx-offline-rtf     --model-dir <string> [--quantize <string>]
                              --wav-path <string> --thread-num <int32_t>
                              [--] [--version] [-h]
Where:
   --thread-num <int32_t>
@@ -136,14 +136,17 @@
     (required)  the model path, which contains model.onnx, config.yaml, am.mvn
   --quantize <string>
     false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir
   --wav-scp <string>
     (required)  wave scp path
   --wav-path <string>
     (required)  the input could be: 
      wav_path, e.g.: asr_example.wav;
      pcm_path, e.g.: asr_example.pcm; 
      wav.scp, kaldi style wav list (wav_id \t wav_path)

For example:
./funasr-onnx-offline-rtf \
    --model-dir    ./asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch \
    --quantize  true \
    --wav-scp     ./aishell1_test.scp  \
    --wav-path     ./aishell1_test.scp  \
    --thread-num 32
```


 funasr/runtime/onnxruntime/src/fsmn-vad.cpp

@@ -225,7 +225,7 @@
    }
}

std::vector<std::vector<float>> &FsmnVad::LfrCmvn(std::vector<std::vector<float>> &vad_feats) {
void FsmnVad::LfrCmvn(std::vector<std::vector<float>> &vad_feats) {

    std::vector<std::vector<float>> out_feats;
    int T = vad_feats.size();
@@ -264,7 +264,6 @@
        }
    }
    vad_feats = out_feats;
    return vad_feats;
}

std::vector<std::vector<int>>
@@ -272,7 +271,7 @@
    std::vector<std::vector<float>> vad_feats;
    std::vector<std::vector<float>> vad_probs;
    FbankKaldi(vad_sample_rate_, vad_feats, waves);
    vad_feats = LfrCmvn(vad_feats);
    LfrCmvn(vad_feats);
    Forward(vad_feats, &vad_probs);

    E2EVadModel vad_scorer = E2EVadModel();

 funasr/runtime/onnxruntime/src/fsmn-vad.h

@@ -36,7 +36,7 @@
    void FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
                    const std::vector<float> &waves);

    std::vector<std::vector<float>> &LfrCmvn(std::vector<std::vector<float>> &vad_feats);
    void LfrCmvn(std::vector<std::vector<float>> &vad_feats);

    void Forward(
            const std::vector<std::vector<float>> &chunk_feats,

 funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp

@@ -36,7 +36,7 @@
    TCLAP::CmdLine cmd("funasr-onnx-offline-punc", ' ', "1.0");
    TCLAP::ValueArg<std::string>    model_dir("", MODEL_DIR, "the punc model path, which contains model.onnx, punc.yaml", true, "", "string");
    TCLAP::ValueArg<std::string>    quantize("", QUANTIZE, "false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir", false, "false", "string");
    TCLAP::ValueArg<std::string> txt_path("", TXT_PATH, "txt file path, one sentence per line", false, "", "string");
    TCLAP::ValueArg<std::string> txt_path("", TXT_PATH, "txt file path, one sentence per line", true, "", "string");

    cmd.add(model_dir);
    cmd.add(quantize);

 funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp

@@ -39,7 +39,7 @@
    // warm up
    for (size_t i = 0; i < 1; i++)
    {
        FUNASR_RESULT result=FunASRRecogFile(asr_handle, wav_list[0].c_str(), RASR_NONE, NULL);
        FUNASR_RESULT result=FunASRInfer(asr_handle, wav_list[0].c_str(), RASR_NONE, NULL, 16000);
    }

    while (true) {
@@ -50,7 +50,7 @@
        }

        gettimeofday(&start, NULL);
        FUNASR_RESULT result=FunASRRecogFile(asr_handle, wav_list[i].c_str(), RASR_NONE, NULL);
        FUNASR_RESULT result=FunASRInfer(asr_handle, wav_list[i].c_str(), RASR_NONE, NULL, 16000);

        gettimeofday(&end, NULL);
        seconds = (end.tv_sec - start.tv_sec);
@@ -77,6 +77,15 @@
    }
}

// Returns true when the text after the last '.' in `filename` equals `target`
// exactly (case-sensitive). A name without any '.' never matches.
bool is_target_file(const std::string& filename, const std::string target) {
    const std::size_t dot = filename.rfind('.');
    if (dot != std::string::npos) {
        // Compare the suffix in place rather than building a temporary substring.
        return filename.compare(dot + 1, std::string::npos, target) == 0;
    }
    return false;
}

void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path)
{
    if (value_arg.isSet()){
@@ -94,19 +103,19 @@
    TCLAP::ValueArg<std::string>    model_dir("", MODEL_DIR, "the model path, which contains model.onnx, config.yaml, am.mvn", true, "", "string");
    TCLAP::ValueArg<std::string>    quantize("", QUANTIZE, "false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir", false, "false", "string");

    TCLAP::ValueArg<std::string> wav_scp("", WAV_SCP, "wave scp path", true, "", "string");
    TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", true, "", "string");
    TCLAP::ValueArg<std::int32_t> thread_num("", THREAD_NUM, "multi-thread num for rtf", true, 0, "int32_t");

    cmd.add(model_dir);
    cmd.add(quantize);
    cmd.add(wav_scp);
    cmd.add(wav_path);
    cmd.add(thread_num);
    cmd.parse(argc, argv);

    std::map<std::string, std::string> model_path;
    GetValue(model_dir, MODEL_DIR, model_path);
    GetValue(quantize, QUANTIZE, model_path);
    GetValue(wav_scp, WAV_SCP, model_path);
    GetValue(wav_path, WAV_PATH, model_path);

    struct timeval start, end;
    gettimeofday(&start, NULL);
@@ -125,10 +134,14 @@

    // read wav_scp
    vector<string> wav_list;
    if(model_path.find(WAV_SCP)!=model_path.end()){
        ifstream in(model_path.at(WAV_SCP));
    string wav_path_ = model_path.at(WAV_PATH);
    if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){
        wav_list.emplace_back(wav_path_);
    }
    else if(is_target_file(wav_path_, "scp")){
        ifstream in(wav_path_);
        if (!in.is_open()) {
            LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP);
            LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP) ;
            return 0;
        }
        string line;
@@ -140,6 +153,9 @@
            wav_list.emplace_back(column2); 
        }
        in.close();
    }else{
        LOG(ERROR)<<"Please check the wav extension!";
        exit(-1);
    }

    // 多线程测试

 funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp

@@ -21,6 +21,15 @@

using namespace std;

// True when `filename` has a '.' and everything after the last one equals
// `target` exactly (case-sensitive; `target` carries no leading dot).
bool is_target_file(const std::string& filename, const std::string target) {
    const std::string::size_type last_dot = filename.rfind('.');
    return last_dot != std::string::npos
        && filename.substr(last_dot + 1) == target;
}

void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path)
{
    if (value_arg.isSet()){
@@ -58,20 +67,17 @@
    TCLAP::ValueArg<std::string>    model_dir("", MODEL_DIR, "the vad model path, which contains model.onnx, vad.yaml, vad.mvn", true, "", "string");
    TCLAP::ValueArg<std::string>    quantize("", QUANTIZE, "false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir", false, "false", "string");

    TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "wave file path", false, "", "string");
    TCLAP::ValueArg<std::string> wav_scp("", WAV_SCP, "wave scp path", false, "", "string");
    TCLAP::ValueArg<std::string>    wav_path("", WAV_PATH, "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", true, "", "string");

    cmd.add(model_dir);
    cmd.add(quantize);
    cmd.add(wav_path);
    cmd.add(wav_scp);
    cmd.parse(argc, argv);

    std::map<std::string, std::string> model_path;
    GetValue(model_dir, MODEL_DIR, model_path);
    GetValue(quantize, QUANTIZE, model_path);
    GetValue(wav_path, WAV_PATH, model_path);
    GetValue(wav_scp, WAV_SCP, model_path);

    struct timeval start, end;
    gettimeofday(&start, NULL);
@@ -89,14 +95,14 @@
    long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
    LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s";

    // read wav_path and wav_scp
    // read wav_path
    vector<string> wav_list;

    if(model_path.find(WAV_PATH)!=model_path.end()){
        wav_list.emplace_back(model_path.at(WAV_PATH));
    string wav_path_ = model_path.at(WAV_PATH);
    if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){
        wav_list.emplace_back(wav_path_);
    }
    if(model_path.find(WAV_SCP)!=model_path.end()){
        ifstream in(model_path.at(WAV_SCP));
    else if(is_target_file(wav_path_, "scp")){
        ifstream in(wav_path_);
        if (!in.is_open()) {
            LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP) ;
            return 0;
@@ -110,13 +116,16 @@
            wav_list.emplace_back(column2); 
        }
        in.close();
    }else{
        LOG(ERROR)<<"Please check the wav extension!";
        exit(-1);
    }
    
    float snippet_time = 0.0f;
    long taking_micros = 0;
    for(auto& wav_file : wav_list){
        gettimeofday(&start, NULL);
        FUNASR_RESULT result=FsmnVadWavFile(vad_hanlde, wav_file.c_str(), RASR_NONE, NULL);
        FUNASR_RESULT result=FsmnVadInfer(vad_hanlde, wav_file.c_str(), FSMN_VAD_OFFLINE, NULL, 16000);
        gettimeofday(&end, NULL);
        seconds = (end.tv_sec - start.tv_sec);
        taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);

 funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp

@@ -20,6 +20,15 @@

using namespace std;

// Reports whether the text following the last '.' in `filename` is exactly
// `target` (case-sensitive, `target` given without the dot, e.g. "pcm").
// A name that contains no '.' never matches.
bool is_target_file(const std::string& filename, const std::string target) {
    const std::size_t dot = filename.rfind('.');
    if (dot == std::string::npos) {
        return false;
    }
    // Compare the suffix in place instead of materialising a substring.
    return filename.compare(dot + 1, std::string::npos, target) == 0;
}

void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path)
{
    if (value_arg.isSet()){
@@ -41,8 +50,7 @@
    TCLAP::ValueArg<std::string>    punc_dir("", PUNC_DIR, "the punc model path, which contains model.onnx, punc.yaml", false, "", "string");
    TCLAP::ValueArg<std::string>    punc_quant("", PUNC_QUANT, "false (Default), load the model of model.onnx in punc_dir. If set true, load the model of model_quant.onnx in punc_dir", false, "false", "string");

    TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "wave file path", false, "", "string");
    TCLAP::ValueArg<std::string> wav_scp("", WAV_SCP, "wave scp path", false, "", "string");
    TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", true, "", "string");

    cmd.add(model_dir);
    cmd.add(quantize);
@@ -51,7 +59,6 @@
    cmd.add(punc_dir);
    cmd.add(punc_quant);
    cmd.add(wav_path);
    cmd.add(wav_scp);
    cmd.parse(argc, argv);

    std::map<std::string, std::string> model_path;
@@ -62,7 +69,6 @@
    GetValue(punc_dir, PUNC_DIR, model_path);
    GetValue(punc_quant, PUNC_QUANT, model_path);
    GetValue(wav_path, WAV_PATH, model_path);
    GetValue(wav_scp, WAV_SCP, model_path);

    struct timeval start, end;
    gettimeofday(&start, NULL);
@@ -80,14 +86,14 @@
    long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
    LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s";

    // read wav_path and wav_scp
    // read wav_path
    vector<string> wav_list;

    if(model_path.find(WAV_PATH)!=model_path.end()){
        wav_list.emplace_back(model_path.at(WAV_PATH));
    string wav_path_ = model_path.at(WAV_PATH); 
    if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){
        wav_list.emplace_back(wav_path_);
    }
    if(model_path.find(WAV_SCP)!=model_path.end()){
        ifstream in(model_path.at(WAV_SCP));
    else if(is_target_file(wav_path_, "scp")){
        ifstream in(wav_path_);
        if (!in.is_open()) {
            LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP) ;
            return 0;
@@ -101,13 +107,16 @@
            wav_list.emplace_back(column2); 
        }
        in.close();
    }else{
        LOG(ERROR)<<"Please check the wav extension!";
        exit(-1);
    }
    
    float snippet_time = 0.0f;
    long taking_micros = 0;
    for(auto& wav_file : wav_list){
        gettimeofday(&start, NULL);
        FUNASR_RESULT result=FunOfflineRecogFile(asr_hanlde, wav_file.c_str(), RASR_NONE, NULL);
        FUNASR_RESULT result=FunOfflineInfer(asr_hanlde, wav_file.c_str(), RASR_NONE, NULL, 16000);
        gettimeofday(&end, NULL);
        seconds = (end.tv_sec - start.tv_sec);
        taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);

 funasr/runtime/onnxruntime/src/funasrruntime.cpp

@@ -11,9 +11,9 @@
        return mm;
    }

    _FUNASRAPI FUNASR_HANDLE  FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num)
    _FUNASRAPI FUNASR_HANDLE  FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num, FSMN_VAD_MODE mode)
    {
        funasr::VadModel* mm = funasr::CreateVadModel(model_path, thread_num);
        funasr::VadModel* mm = funasr::CreateVadModel(model_path, thread_num, mode);
        return mm;
    }

@@ -30,36 +30,7 @@
    }

    // APIs for ASR Infer
    _FUNASRAPI FUNASR_RESULT FunASRRecogBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback)
    {
        funasr::Model* recog_obj = (funasr::Model*)handle;
        if (!recog_obj)
            return nullptr;

        int32_t sampling_rate = -1;
        funasr::Audio audio(1);
        if (!audio.LoadWav(sz_buf, n_len, &sampling_rate))
            return nullptr;

        float* buff;
        int len;
        int flag=0;
        funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
        p_result->snippet_time = audio.GetTimeLen();
        int n_step = 0;
        int n_total = audio.GetQueueSize();
        while (audio.Fetch(buff, len, flag) > 0) {
            string msg = recog_obj->Forward(buff, len, flag);
            p_result->msg += msg;
            n_step++;
            if (fn_callback)
                fn_callback(n_step, n_total);
        }

        return p_result;
    }

    _FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback)
    _FUNASRAPI FUNASR_RESULT FunASRInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
    {
        funasr::Model* recog_obj = (funasr::Model*)handle;
        if (!recog_obj)
@@ -87,23 +58,32 @@
        return p_result;
    }

    _FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* sz_filename, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback)
    _FUNASRAPI FUNASR_RESULT FunASRInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
    {
        funasr::Model* recog_obj = (funasr::Model*)handle;
        if (!recog_obj)
            return nullptr;

        funasr::Audio audio(1);
        if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
            return nullptr;
        if(funasr::is_target_file(sz_filename, "wav")){
            int32_t sampling_rate_ = -1;
            if(!audio.LoadWav(sz_filename, &sampling_rate_))
                return nullptr;
        }else if(funasr::is_target_file(sz_filename, "pcm")){
            if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
                return nullptr;
        }else{
            LOG(ERROR)<<"Wrong wav extension";
            exit(-1);
        }

        float* buff;
        int len;
        int flag = 0;
        funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
        p_result->snippet_time = audio.GetTimeLen();
        int n_step = 0;
        int n_total = audio.GetQueueSize();
        funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
        p_result->snippet_time = audio.GetTimeLen();
        while (audio.Fetch(buff, len, flag) > 0) {
            string msg = recog_obj->Forward(buff, len, flag);
            p_result->msg += msg;
@@ -115,46 +95,45 @@
        return p_result;
    }

    _FUNASRAPI FUNASR_RESULT FunASRRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback)
    {
        funasr::Model* recog_obj = (funasr::Model*)handle;
        if (!recog_obj)
            return nullptr;
		
        int32_t sampling_rate = -1;
        funasr::Audio audio(1);
        if(!audio.LoadWav(sz_wavfile, &sampling_rate))
            return nullptr;

        float* buff;
        int len;
        int flag = 0;
        int n_step = 0;
        int n_total = audio.GetQueueSize();
        funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
        p_result->snippet_time = audio.GetTimeLen();
        while (audio.Fetch(buff, len, flag) > 0) {
            string msg = recog_obj->Forward(buff, len, flag);
            p_result->msg+= msg;
            n_step++;
            if (fn_callback)
                fn_callback(n_step, n_total);
        }
	
        return p_result;
    }

    // APIs for VAD Infer
    _FUNASRAPI FUNASR_RESULT FsmnVadWavFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback)
    _FUNASRAPI FUNASR_RESULT FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
    {
        funasr::VadModel* vad_obj = (funasr::VadModel*)handle;
        if (!vad_obj)
            return nullptr;
		
        int32_t sampling_rate = -1;

        funasr::Audio audio(1);
        if(!audio.LoadWav(sz_wavfile, &sampling_rate))
        if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
            return nullptr;

        funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT;
        p_result->snippet_time = audio.GetTimeLen();
		
        vector<std::vector<int>> vad_segments;
        audio.Split(vad_obj, vad_segments);
        p_result->segments = new vector<std::vector<int>>(vad_segments);

        return p_result;
    }

    _FUNASRAPI FUNASR_RESULT FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
    {
        funasr::VadModel* vad_obj = (funasr::VadModel*)handle;
        if (!vad_obj)
            return nullptr;

        funasr::Audio audio(1);
        if(funasr::is_target_file(sz_filename, "wav")){
            int32_t sampling_rate_ = -1;
            if(!audio.LoadWav(sz_filename, &sampling_rate_))
                return nullptr;
        }else if(funasr::is_target_file(sz_filename, "pcm")){
            if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
                return nullptr;
        }else{
            LOG(ERROR)<<"Wrong wav extension";
            exit(-1);
        }

        funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT;
        p_result->snippet_time = audio.GetTimeLen();
@@ -178,43 +157,7 @@
    }

    // APIs for Offline-stream Infer
    _FUNASRAPI FUNASR_RESULT FunOfflineRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback)
    {
        funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
        if (!offline_stream)
            return nullptr;
		
        int32_t sampling_rate = -1;
        funasr::Audio audio(1);
        if(!audio.LoadWav(sz_wavfile, &sampling_rate))
            return nullptr;
        if(offline_stream->UseVad()){
            audio.Split(offline_stream);
        }

        float* buff;
        int len;
        int flag = 0;
        int n_step = 0;
        int n_total = audio.GetQueueSize();
        funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
        p_result->snippet_time = audio.GetTimeLen();
        while (audio.Fetch(buff, len, flag) > 0) {
            string msg = (offline_stream->asr_handle)->Forward(buff, len, flag);
            p_result->msg+= msg;
            n_step++;
            if (fn_callback)
                fn_callback(n_step, n_total);
        }
        if(offline_stream->UsePunc()){
            string punc_res = (offline_stream->punc_handle)->AddPunc((p_result->msg).c_str());
            p_result->msg = punc_res;
        }
	
        return p_result;
    }

    _FUNASRAPI FUNASR_RESULT FunOfflineRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback)
    _FUNASRAPI FUNASR_RESULT FunOfflineInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
    {
        funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
        if (!offline_stream)
@@ -249,6 +192,50 @@
        return p_result;
    }

    _FUNASRAPI FUNASR_RESULT FunOfflineInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
    {
        funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
        if (!offline_stream)
            return nullptr;
		
        funasr::Audio audio(1);
        if(funasr::is_target_file(sz_filename, "wav")){
            int32_t sampling_rate_ = -1;
            if(!audio.LoadWav(sz_filename, &sampling_rate_))
                return nullptr;
        }else if(funasr::is_target_file(sz_filename, "pcm")){
            if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
                return nullptr;
        }else{
            LOG(ERROR)<<"Wrong wav extension";
            exit(-1);
        }
        if(offline_stream->UseVad()){
            audio.Split(offline_stream);
        }

        float* buff;
        int len;
        int flag = 0;
        int n_step = 0;
        int n_total = audio.GetQueueSize();
        funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
        p_result->snippet_time = audio.GetTimeLen();
        while (audio.Fetch(buff, len, flag) > 0) {
            string msg = (offline_stream->asr_handle)->Forward(buff, len, flag);
            p_result->msg+= msg;
            n_step++;
            if (fn_callback)
                fn_callback(n_step, n_total);
        }
        if(offline_stream->UsePunc()){
            string punc_res = (offline_stream->punc_handle)->AddPunc((p_result->msg).c_str());
            p_result->msg = punc_res;
        }
	
        return p_result;
    }

    _FUNASRAPI const int FunASRGetRetNumber(FUNASR_RESULT result)
    {
        if (!result)

 funasr/runtime/onnxruntime/src/util.cpp

@@ -180,4 +180,13 @@
    }
}

// True when `filename` has a '.' and everything after the last one equals
// `target` exactly (case-sensitive; `target` carries no leading dot).
bool is_target_file(const std::string& filename, const std::string target) {
    const std::string::size_type last_dot = filename.rfind('.');
    return last_dot != std::string::npos
        && filename.substr(last_dot + 1) == target;
}

} // namespace funasr

 funasr/runtime/onnxruntime/src/util.h

@@ -25,6 +25,7 @@
extern void Glu(Tensor<float> *din, Tensor<float> *dout);

string PathAppend(const string &p1, const string &p2);
bool is_target_file(const std::string& filename, const std::string target);

} // namespace funasr
#endif

 funasr/runtime/onnxruntime/src/vad-model.cpp

@@ -1,10 +1,14 @@
#include "precomp.h"

namespace funasr {
VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num)
VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num, int mode)
{
    VadModel *mm;
    mm = new FsmnVad();
    if(mode == FSMN_VAD_OFFLINE){
        mm = new FsmnVad();
    }else{
        LOG(ERROR)<<"Online fsmn vad not imp!";
    }

    string vad_model_path;
    string vad_cmvn_path;

 funasr/runtime/websocket/websocketsrv.cpp

@@ -25,8 +25,8 @@
    if (!buffer.empty()) {

      // fout.write(buffer.data(), buffer.size());

      // feed data to asr engine

      FUNASR_RESULT Result = FunOfflineRecogPCMBuffer(

          asr_hanlde, buffer.data(), buffer.size(), 16000, RASR_NONE, NULL);

      FUNASR_RESULT Result = FunOfflineInferBuffer(

          asr_hanlde, buffer.data(), buffer.size(), RASR_NONE, NULL, 16000);



      std::string asr_result =

          ((FUNASR_RECOG_RESULT*)Result)->msg;  // get decode result


 funasr/utils/timestamp_tools.py

@@ -80,6 +80,7 @@


def time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocessed):
    punc_list = ['，', '。', '？', '、']
    res = []
    if text_postprocessed is None:
        return res
@@ -124,34 +125,8 @@
        punc_id = int(punc_id) if punc_id is not None else 1
        sentence_end = time_stamp[1] if time_stamp is not None else sentence_end

        if punc_id == 2:
            sentence_text += ','
            res.append({
                'text': sentence_text,
                "start": sentence_start,
                "end": sentence_end,
                "text_seg": sentence_text_seg,
                "ts_list": ts_list
            })
            sentence_text = ''
            sentence_text_seg = ''
            ts_list = []
            sentence_start = sentence_end
        elif punc_id == 3:
            sentence_text += '.'
            res.append({
                'text': sentence_text,
                "start": sentence_start,
                "end": sentence_end,
                "text_seg": sentence_text_seg,
                "ts_list": ts_list
            })
            sentence_text = ''
            sentence_text_seg = ''
            ts_list = []
            sentence_start = sentence_end
        elif punc_id == 4:
            sentence_text += '?'
        if punc_id > 1:
            sentence_text += punc_list[punc_id - 2]
            res.append({
                'text': sentence_text,
                "start": sentence_start,

New file
			@@ -0,0 +1,103 @@
			#!/usr/bin/env bash
			#
			# Decoding recipe: splits ${data_dir}/wav.scp into ${nj} parts, runs
			# infer.py on each part in parallel (stage 1), scores the merged
			# output with utils/compute_wer.py (stage 2) and, optionally, with the
			# SpeechIO TIOBE text normalisation tools (stage 3).
			# Every variable below can be overridden on the command line through
			# utils/parse_options.sh, e.g. --gpu_inference false --njob 8.

			set -e
			set -u
			set -o pipefail

			stage=1
			stop_stage=2
			model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
			data_dir="./data/test"
			output_dir="./results"
			batch_size=64
			gpu_inference=true # whether to perform gpu decoding
			gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
			njob=64 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
			checkpoint_dir=
			checkpoint_name="valid.cer_ctc.ave.pb"

			. utils/parse_options.sh || exit 1;

			# One job per listed GPU, or ${njob} CPU jobs that each receive gpuid -1.
			# The flag is compared as a string; it is never executed as a command.
			if [ "${gpu_inference}" == "true" ]; then
			    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
			else
			    nj=$njob
			    batch_size=1
			    gpuid_list=""
			    for JOB in $(seq ${nj}); do
			        gpuid_list=$gpuid_list"-1,"
			    done
			fi

			# Split the input list into one wav.<JOB>.scp per job.
			mkdir -p $output_dir/split
			split_scps=""
			for JOB in $(seq ${nj}); do
			    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
			done
			# ${split_scps} is intentionally unquoted: each path is a separate argument.
			perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}

			# Optionally decode with a checkpoint prepared under ${checkpoint_dir}.
			if [ -n "${checkpoint_dir}" ]; then
			    python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
			    model=${checkpoint_dir}/${model}
			fi

			# Stage 1: run the jobs in the background, then merge their outputs.
			if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
			    echo "Decoding ..."
			    gpuid_list_array=(${gpuid_list//,/ })
			    for JOB in $(seq ${nj}); do
			        {
			        id=$((JOB-1))
			        gpuid=${gpuid_list_array[$id]}
			        mkdir -p ${output_dir}/output.$JOB
			        python infer.py \
			            --model ${model} \
			            --audio_in ${output_dir}/split/wav.$JOB.scp \
			            --output_dir ${output_dir}/output.$JOB \
			            --batch_size ${batch_size} \
			            --gpuid ${gpuid}
			        }&
			    done
			    wait

			    mkdir -p ${output_dir}/1best_recog
			    for f in token score text; do
			        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
			            for i in $(seq "${nj}"); do
			                cat "${output_dir}/output.${i}/1best_recog/${f}"
			            done | sort -k1 >"${output_dir}/1best_recog/${f}"
			        fi
			    done
			fi

			# Stage 2: score the merged hypotheses against ${data_dir}/text.
			if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
			    echo "Computing WER ..."
			    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
			    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
			    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
			    tail -n 3 ${output_dir}/1best_recog/text.cer
			fi

			# Stage 3 (not run with the default stop_stage=2): normalised scoring.
			if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
			    echo "SpeechIO TIOBE textnorm"
			    echo "$0 --> Normalizing REF text ..."
			    ./utils/textnorm_zh.py \
			        --has_key --to_upper \
			        ${data_dir}/text \
			        ${output_dir}/1best_recog/ref.txt

			    echo "$0 --> Normalizing HYP text ..."
			    ./utils/textnorm_zh.py \
			        --has_key --to_upper \
			        ${output_dir}/1best_recog/text.proc \
			        ${output_dir}/1best_recog/rec.txt
			    # Drop lines that end in a tab, i.e. keys with an empty hypothesis.
			    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt

			    echo "$0 --> computing WER/CER and alignment ..."
			    ./utils/error_rate_zh \
			        --tokenizer char \
			        --ref ${output_dir}/1best_recog/ref.txt \
			        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
			        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
			    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
			fi

			@@ -83,7 +83,7 @@
			#### Decode with multi GPUs:
			```shell
			bash infer.sh \
			--model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
			--model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
			--data_dir "./data/test" \
			--output_dir "./results" \
			--batch_size 1 \
			@@ -93,11 +93,11 @@
			#### Decode with multi-thread CPUs:
			```shell
			bash infer.sh \
			--model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
			--model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
			--data_dir "./data/test" \
			--output_dir "./results" \
			--gpu_inference false \
			--njob 1
			--njob 64
			```

			## Finetune with pipeline

			@@ -16,10 +16,10 @@

			if __name__ == "__main__":
			parser = argparse.ArgumentParser()
			parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
			parser.add_argument('--model', type=str, default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch")
			parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
			parser.add_argument('--output_dir', type=str, default="./results/")
			parser.add_argument('--batch_size', type=int, default=64)
			parser.add_argument('--batch_size', type=int, default=1)
			parser.add_argument('--gpuid', type=str, default="0")
			args = parser.parse_args()
			modelscope_infer(args)

			@@ -7,7 +7,7 @@
			inference_pipeline = pipeline(
			task=Tasks.voice_activity_detection,
			model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
			model_revision='v1.2.0',
			model_revision=None,
			output_dir=output_dir,
			batch_size=1,
			)

			@@ -11,7 +11,7 @@
			inference_pipeline = pipeline(
			task=Tasks.voice_activity_detection,
			model="damo/speech_fsmn_vad_zh-cn-8k-common",
			model_revision='v1.2.0',
			model_revision=None,
			output_dir=output_dir,
			batch_size=1,
			mode='online',

			@@ -137,7 +137,7 @@
			stream->Write(res);
			}
			else {
			FUNASR_RESULT Result= FunOfflineRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, 16000, RASR_NONE, NULL);
			FUNASR_RESULT Result= FunOfflineInferBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL, 16000);
			std::string asr_result = ((FUNASR_RECOG_RESULT*)Result)->msg;

			auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

			@@ -46,15 +46,20 @@
			FUNASR_MODEL_PARAFORMER = 3,
			}FUNASR_MODEL_TYPE;

			typedef enum
			{
			FSMN_VAD_OFFLINE=0,
			FSMN_VAD_ONLINE = 1,
			}FSMN_VAD_MODE;

			typedef void (* QM_CALLBACK)(int cur_step, int n_total); // n_total: total steps; cur_step: Current Step.

			// ASR
			_FUNASRAPI FUNASR_HANDLE FunASRInit(std::map<std::string, std::string>& model_path, int thread_num);

			_FUNASRAPI FUNASR_RESULT FunASRRecogBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback);
			_FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback);
			_FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* sz_filename, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback);
			_FUNASRAPI FUNASR_RESULT FunASRRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback);
			// buffer
			_FUNASRAPI FUNASR_RESULT FunASRInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
			// file, support wav & pcm
			_FUNASRAPI FUNASR_RESULT FunASRInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);

			_FUNASRAPI const char* FunASRGetResult(FUNASR_RESULT result,int n_index);
			_FUNASRAPI const int FunASRGetRetNumber(FUNASR_RESULT result);
			@@ -63,9 +68,12 @@
			_FUNASRAPI const float FunASRGetRetSnippetTime(FUNASR_RESULT result);

			// VAD
			_FUNASRAPI FUNASR_HANDLE FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num);
			_FUNASRAPI FUNASR_HANDLE FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num, FSMN_VAD_MODE mode=FSMN_VAD_OFFLINE);
			// buffer
			_FUNASRAPI FUNASR_RESULT FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
			// file, support wav & pcm
			_FUNASRAPI FUNASR_RESULT FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);

			_FUNASRAPI FUNASR_RESULT FsmnVadWavFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback);
			_FUNASRAPI std::vector<std::vector<int>>* FsmnVadGetResult(FUNASR_RESULT result,int n_index);
			_FUNASRAPI void FsmnVadFreeResult(FUNASR_RESULT result);
			_FUNASRAPI void FsmnVadUninit(FUNASR_HANDLE handle);
			@@ -78,8 +86,10 @@

			//OfflineStream
			_FUNASRAPI FUNASR_HANDLE FunOfflineInit(std::map<std::string, std::string>& model_path, int thread_num);
			_FUNASRAPI FUNASR_RESULT FunOfflineRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback);
			_FUNASRAPI FUNASR_RESULT FunOfflineRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback);
			// buffer
			_FUNASRAPI FUNASR_RESULT FunOfflineInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
			// file, support wav & pcm
			_FUNASRAPI FUNASR_RESULT FunOfflineInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
			_FUNASRAPI void FunOfflineUninit(FUNASR_HANDLE handle);

			#ifdef __cplusplus

			@@ -16,7 +16,7 @@
			virtual void LoadConfigFromYaml(const char* filename)=0;
			virtual void FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
			const std::vector<float> &waves)=0;
			virtual std::vector<std::vector<float>> &LfrCmvn(std::vector<std::vector<float>> &vad_feats)=0;
			virtual void LfrCmvn(std::vector<std::vector<float>> &vad_feats)=0;
			virtual void Forward(
			const std::vector<std::vector<float>> &chunk_feats,
			std::vector<std::vector<float>> *out_prob)=0;
			@@ -24,6 +24,6 @@
			virtual void InitCache()=0;
			};

			VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num);
			VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num, int mode);
			} // namespace funasr
			#endif

			@@ -43,11 +43,10 @@

			### funasr-onnx-offline
			```shell
			./funasr-onnx-offline [--wav-scp <string>] [--wav-path <string>]
			[--punc-quant <string>] [--punc-dir <string>]
			[--vad-quant <string>] [--vad-dir <string>]
			[--quantize <string>] --model-dir <string>
			[--] [--version] [-h]
			./funasr-onnx-offline --model-dir <string> [--quantize <string>]
			[--vad-dir <string>] [--vad-quant <string>]
			[--punc-dir <string>] [--punc-quant <string>]
			--wav-path <string> [--] [--version] [-h]
			Where:
			--model-dir <string>
			(required) the asr model path, which contains model.onnx, config.yaml, am.mvn
			@@ -64,12 +63,13 @@
			--punc-quant <string>
			false (Default), load the model of model.onnx in punc_dir. If set true, load the model of model_quant.onnx in punc_dir

			--wav-scp <string>
			wave scp path
			--wav-path <string>
			wave file path
			(required) the input could be:
			wav_path, e.g.: asr_example.wav;
			pcm_path, e.g.: asr_example.pcm;
			wav.scp, kaldi style wav list (wav_id \t wav_path)

			Required: --model-dir <string>
			Required: --model-dir <string> --wav-path <string>
			If use vad, please add: --vad-dir <string>
			If use punc, please add: --punc-dir <string>

			@@ -84,20 +84,20 @@

			### funasr-onnx-offline-vad
			```shell
			./funasr-onnx-offline-vad [--wav-scp <string>] [--wav-path <string>]
			[--quantize <string>] --model-dir <string>
			[--] [--version] [-h]
			./funasr-onnx-offline-vad --model-dir <string> [--quantize <string>]
			--wav-path <string> [--] [--version] [-h]
			Where:
			--model-dir <string>
			(required) the vad model path, which contains model.onnx, vad.yaml, vad.mvn
			--quantize <string>
			false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir
			--wav-scp <string>
			wave scp path
			--wav-path <string>
			wave file path
			(required) the input could be:
			wav_path, e.g.: asr_example.wav;
			pcm_path, e.g.: asr_example.pcm;
			wav.scp, kaldi style wav list (wav_id \t wav_path)

			Required: --model-dir <string>
			Required: --model-dir <string> --wav-path <string>

			For example:
			./funasr-onnx-offline-vad \
			@@ -107,17 +107,17 @@

			### funasr-onnx-offline-punc
			```shell
			./funasr-onnx-offline-punc [--txt-path <string>] [--quantize <string>]
			--model-dir <string> [--] [--version] [-h]
			./funasr-onnx-offline-punc --model-dir <string> [--quantize <string>]
			--txt-path <string> [--] [--version] [-h]
			Where:
			--model-dir <string>
			(required) the punc model path, which contains model.onnx, punc.yaml
			--quantize <string>
			false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir
			--txt-path <string>
			txt file path, one sentence per line
			(required) txt file path, one sentence per line

			Required: --model-dir <string>
			Required: --model-dir <string> --txt-path <string>

			For example:
			./funasr-onnx-offline-punc \
			@@ -126,8 +126,8 @@
			```
			### funasr-onnx-offline-rtf
			```shell
			./funasr-onnx-offline-rtf --thread-num <int32_t> --wav-scp <string>
			[--quantize <string>] --model-dir <string>
			./funasr-onnx-offline-rtf --model-dir <string> [--quantize <string>]
			--wav-path <string> --thread-num <int32_t>
			[--] [--version] [-h]
			Where:
			--thread-num <int32_t>
			@@ -136,14 +136,17 @@
			(required) the model path, which contains model.onnx, config.yaml, am.mvn
			--quantize <string>
			false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir
			--wav-scp <string>
			(required) wave scp path
			--wav-path <string>
			(required) the input could be:
			wav_path, e.g.: asr_example.wav;
			pcm_path, e.g.: asr_example.pcm;
			wav.scp, kaldi style wav list (wav_id \t wav_path)

			For example:
			./funasr-onnx-offline-rtf \
			--model-dir ./asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch \
			--quantize true \
			--wav-scp ./aishell1_test.scp \
			--wav-path ./aishell1_test.scp \
			--thread-num 32
			```

			@@ -225,7 +225,7 @@
			}
			}

			std::vector<std::vector<float>> &FsmnVad::LfrCmvn(std::vector<std::vector<float>> &vad_feats) {
			void FsmnVad::LfrCmvn(std::vector<std::vector<float>> &vad_feats) {

			std::vector<std::vector<float>> out_feats;
			int T = vad_feats.size();
			@@ -264,7 +264,6 @@
			}
			}
			vad_feats = out_feats;
			return vad_feats;
			}

			std::vector<std::vector<int>>
			@@ -272,7 +271,7 @@
			std::vector<std::vector<float>> vad_feats;
			std::vector<std::vector<float>> vad_probs;
			FbankKaldi(vad_sample_rate_, vad_feats, waves);
			vad_feats = LfrCmvn(vad_feats);
			LfrCmvn(vad_feats);
			Forward(vad_feats, &vad_probs);

			E2EVadModel vad_scorer = E2EVadModel();

			@@ -36,7 +36,7 @@
			void FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
			const std::vector<float> &waves);

			std::vector<std::vector<float>> &LfrCmvn(std::vector<std::vector<float>> &vad_feats);
			void LfrCmvn(std::vector<std::vector<float>> &vad_feats);

			void Forward(
			const std::vector<std::vector<float>> &chunk_feats,

			@@ -36,7 +36,7 @@
			TCLAP::CmdLine cmd("funasr-onnx-offline-punc", ' ', "1.0");
			TCLAP::ValueArg<std::string> model_dir("", MODEL_DIR, "the punc model path, which contains model.onnx, punc.yaml", true, "", "string");
			TCLAP::ValueArg<std::string> quantize("", QUANTIZE, "false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir", false, "false", "string");
			TCLAP::ValueArg<std::string> txt_path("", TXT_PATH, "txt file path, one sentence per line", false, "", "string");
			TCLAP::ValueArg<std::string> txt_path("", TXT_PATH, "txt file path, one sentence per line", true, "", "string");

			cmd.add(model_dir);
			cmd.add(quantize);

			@@ -39,7 +39,7 @@
			// warm up
			for (size_t i = 0; i < 1; i++)
			{
			FUNASR_RESULT result=FunASRRecogFile(asr_handle, wav_list[0].c_str(), RASR_NONE, NULL);
			FUNASR_RESULT result=FunASRInfer(asr_handle, wav_list[0].c_str(), RASR_NONE, NULL, 16000);
			}

			while (true) {
			@@ -50,7 +50,7 @@
			}

			gettimeofday(&start, NULL);
			FUNASR_RESULT result=FunASRRecogFile(asr_handle, wav_list[i].c_str(), RASR_NONE, NULL);
			FUNASR_RESULT result=FunASRInfer(asr_handle, wav_list[i].c_str(), RASR_NONE, NULL, 16000);

			gettimeofday(&end, NULL);
			seconds = (end.tv_sec - start.tv_sec);
			@@ -77,6 +77,15 @@
			}
			}

			// Returns true when the text after the last '.' in `filename` equals `target` exactly.
			// The comparison is case-sensitive; a name without any '.' never matches.
			bool is_target_file(const std::string& filename, const std::string target) {
				const std::size_t dot = filename.rfind('.');
				// Compare the suffix in place rather than materialising a substring.
				return dot != std::string::npos
					&& filename.compare(dot + 1, std::string::npos, target) == 0;
			}

			void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path)
			{
			if (value_arg.isSet()){
			@@ -94,19 +103,19 @@
			TCLAP::ValueArg<std::string> model_dir("", MODEL_DIR, "the model path, which contains model.onnx, config.yaml, am.mvn", true, "", "string");
			TCLAP::ValueArg<std::string> quantize("", QUANTIZE, "false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir", false, "false", "string");

			TCLAP::ValueArg<std::string> wav_scp("", WAV_SCP, "wave scp path", true, "", "string");
			TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", true, "", "string");
			TCLAP::ValueArg<std::int32_t> thread_num("", THREAD_NUM, "multi-thread num for rtf", true, 0, "int32_t");

			cmd.add(model_dir);
			cmd.add(quantize);
			cmd.add(wav_scp);
			cmd.add(wav_path);
			cmd.add(thread_num);
			cmd.parse(argc, argv);

			std::map<std::string, std::string> model_path;
			GetValue(model_dir, MODEL_DIR, model_path);
			GetValue(quantize, QUANTIZE, model_path);
			GetValue(wav_scp, WAV_SCP, model_path);
			GetValue(wav_path, WAV_PATH, model_path);

			struct timeval start, end;
			gettimeofday(&start, NULL);
			@@ -125,10 +134,14 @@

			// read wav_scp
			vector<string> wav_list;
			if(model_path.find(WAV_SCP)!=model_path.end()){
			ifstream in(model_path.at(WAV_SCP));
			string wav_path_ = model_path.at(WAV_PATH);
			if(is_target_file(wav_path_, "wav") \|\| is_target_file(wav_path_, "pcm")){
			wav_list.emplace_back(wav_path_);
			}
			else if(is_target_file(wav_path_, "scp")){
			ifstream in(wav_path_);
			if (!in.is_open()) {
			LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP);
			LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP) ;
			return 0;
			}
			string line;
			@@ -140,6 +153,9 @@
			wav_list.emplace_back(column2);
			}
			in.close();
			}else{
			LOG(ERROR)<<"Please check the wav extension!";
			exit(-1);
			}

			// multi-threaded test

			@@ -21,6 +21,15 @@

			using namespace std;

			// Returns true when the text after the last '.' in `filename` equals `target` exactly.
			// The comparison is case-sensitive; a name without any '.' never matches.
			bool is_target_file(const std::string& filename, const std::string target) {
				const std::size_t dot = filename.rfind('.');
				// Compare the suffix in place rather than materialising a substring.
				return dot != std::string::npos
					&& filename.compare(dot + 1, std::string::npos, target) == 0;
			}

			void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path)
			{
			if (value_arg.isSet()){
			@@ -58,20 +67,17 @@
			TCLAP::ValueArg<std::string> model_dir("", MODEL_DIR, "the vad model path, which contains model.onnx, vad.yaml, vad.mvn", true, "", "string");
			TCLAP::ValueArg<std::string> quantize("", QUANTIZE, "false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir", false, "false", "string");

			TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "wave file path", false, "", "string");
			TCLAP::ValueArg<std::string> wav_scp("", WAV_SCP, "wave scp path", false, "", "string");
			TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", true, "", "string");

			cmd.add(model_dir);
			cmd.add(quantize);
			cmd.add(wav_path);
			cmd.add(wav_scp);
			cmd.parse(argc, argv);

			std::map<std::string, std::string> model_path;
			GetValue(model_dir, MODEL_DIR, model_path);
			GetValue(quantize, QUANTIZE, model_path);
			GetValue(wav_path, WAV_PATH, model_path);
			GetValue(wav_scp, WAV_SCP, model_path);

			struct timeval start, end;
			gettimeofday(&start, NULL);
			@@ -89,14 +95,14 @@
			long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
			LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s";

			// read wav_path and wav_scp
			// read wav_path
			vector<string> wav_list;

			if(model_path.find(WAV_PATH)!=model_path.end()){
			wav_list.emplace_back(model_path.at(WAV_PATH));
			string wav_path_ = model_path.at(WAV_PATH);
			if(is_target_file(wav_path_, "wav") \|\| is_target_file(wav_path_, "pcm")){
			wav_list.emplace_back(wav_path_);
			}
			if(model_path.find(WAV_SCP)!=model_path.end()){
			ifstream in(model_path.at(WAV_SCP));
			else if(is_target_file(wav_path_, "scp")){
			ifstream in(wav_path_);
			if (!in.is_open()) {
			LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP) ;
			return 0;
			@@ -110,13 +116,16 @@
			wav_list.emplace_back(column2);
			}
			in.close();
			}else{
			LOG(ERROR)<<"Please check the wav extension!";
			exit(-1);
			}

			float snippet_time = 0.0f;
			long taking_micros = 0;
			for(auto& wav_file : wav_list){
			gettimeofday(&start, NULL);
			FUNASR_RESULT result=FsmnVadWavFile(vad_hanlde, wav_file.c_str(), RASR_NONE, NULL);
			FUNASR_RESULT result=FsmnVadInfer(vad_hanlde, wav_file.c_str(), FSMN_VAD_OFFLINE, NULL, 16000);
			gettimeofday(&end, NULL);
			seconds = (end.tv_sec - start.tv_sec);
			taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);

			@@ -20,6 +20,15 @@

			using namespace std;

			// Returns true when the text after the last '.' in `filename` equals `target` exactly.
			// The comparison is case-sensitive; a name without any '.' never matches.
			bool is_target_file(const std::string& filename, const std::string target) {
				const std::size_t dot = filename.rfind('.');
				// Compare the suffix in place rather than materialising a substring.
				return dot != std::string::npos
					&& filename.compare(dot + 1, std::string::npos, target) == 0;
			}

			void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path)
			{
			if (value_arg.isSet()){
			@@ -41,8 +50,7 @@
			TCLAP::ValueArg<std::string> punc_dir("", PUNC_DIR, "the punc model path, which contains model.onnx, punc.yaml", false, "", "string");
			TCLAP::ValueArg<std::string> punc_quant("", PUNC_QUANT, "false (Default), load the model of model.onnx in punc_dir. If set true, load the model of model_quant.onnx in punc_dir", false, "false", "string");

			TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "wave file path", false, "", "string");
			TCLAP::ValueArg<std::string> wav_scp("", WAV_SCP, "wave scp path", false, "", "string");
			TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", true, "", "string");

			cmd.add(model_dir);
			cmd.add(quantize);
			@@ -51,7 +59,6 @@
			cmd.add(punc_dir);
			cmd.add(punc_quant);
			cmd.add(wav_path);
			cmd.add(wav_scp);
			cmd.parse(argc, argv);

			std::map<std::string, std::string> model_path;
			@@ -62,7 +69,6 @@
			GetValue(punc_dir, PUNC_DIR, model_path);
			GetValue(punc_quant, PUNC_QUANT, model_path);
			GetValue(wav_path, WAV_PATH, model_path);
			GetValue(wav_scp, WAV_SCP, model_path);

			struct timeval start, end;
			gettimeofday(&start, NULL);
			@@ -80,14 +86,14 @@
			long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
			LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s";

			// read wav_path and wav_scp
			// read wav_path
			vector<string> wav_list;

			if(model_path.find(WAV_PATH)!=model_path.end()){
			wav_list.emplace_back(model_path.at(WAV_PATH));
			string wav_path_ = model_path.at(WAV_PATH);
			if(is_target_file(wav_path_, "wav") \|\| is_target_file(wav_path_, "pcm")){
			wav_list.emplace_back(wav_path_);
			}
			if(model_path.find(WAV_SCP)!=model_path.end()){
			ifstream in(model_path.at(WAV_SCP));
			else if(is_target_file(wav_path_, "scp")){
			ifstream in(wav_path_);
			if (!in.is_open()) {
			LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP) ;
			return 0;
			@@ -101,13 +107,16 @@
			wav_list.emplace_back(column2);
			}
			in.close();
			}else{
			LOG(ERROR)<<"Please check the wav extension!";
			exit(-1);
			}

			float snippet_time = 0.0f;
			long taking_micros = 0;
			for(auto& wav_file : wav_list){
			gettimeofday(&start, NULL);
			FUNASR_RESULT result=FunOfflineRecogFile(asr_hanlde, wav_file.c_str(), RASR_NONE, NULL);
			FUNASR_RESULT result=FunOfflineInfer(asr_hanlde, wav_file.c_str(), RASR_NONE, NULL, 16000);
			gettimeofday(&end, NULL);
			seconds = (end.tv_sec - start.tv_sec);
			taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);

			@@ -11,9 +11,9 @@
			return mm;
			}

			_FUNASRAPI FUNASR_HANDLE FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num)
			_FUNASRAPI FUNASR_HANDLE FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num, FSMN_VAD_MODE mode)
			{
			funasr::VadModel* mm = funasr::CreateVadModel(model_path, thread_num);
			funasr::VadModel* mm = funasr::CreateVadModel(model_path, thread_num, mode);
			return mm;
			}

			@@ -30,36 +30,7 @@
			}

			// APIs for ASR Infer
			_FUNASRAPI FUNASR_RESULT FunASRRecogBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback)
			{
			funasr::Model* recog_obj = (funasr::Model*)handle;
			if (!recog_obj)
			return nullptr;

			int32_t sampling_rate = -1;
			funasr::Audio audio(1);
			if (!audio.LoadWav(sz_buf, n_len, &sampling_rate))
			return nullptr;

			float* buff;
			int len;
			int flag=0;
			funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
			p_result->snippet_time = audio.GetTimeLen();
			int n_step = 0;
			int n_total = audio.GetQueueSize();
			while (audio.Fetch(buff, len, flag) > 0) {
			string msg = recog_obj->Forward(buff, len, flag);
			p_result->msg += msg;
			n_step++;
			if (fn_callback)
			fn_callback(n_step, n_total);
			}

			return p_result;
			}

			_FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback)
			_FUNASRAPI FUNASR_RESULT FunASRInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
			{
			funasr::Model* recog_obj = (funasr::Model*)handle;
			if (!recog_obj)
			@@ -87,23 +58,32 @@
			return p_result;
			}

			_FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* sz_filename, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback)
			_FUNASRAPI FUNASR_RESULT FunASRInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
			{
			funasr::Model* recog_obj = (funasr::Model*)handle;
			if (!recog_obj)
			return nullptr;

			funasr::Audio audio(1);
			if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
			return nullptr;
			if(funasr::is_target_file(sz_filename, "wav")){
			int32_t sampling_rate_ = -1;
			if(!audio.LoadWav(sz_filename, &sampling_rate_))
			return nullptr;
			}else if(funasr::is_target_file(sz_filename, "pcm")){
			if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
			return nullptr;
			}else{
			LOG(ERROR)<<"Wrong wav extension";
			exit(-1);
			}

			float* buff;
			int len;
			int flag = 0;
			funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
			p_result->snippet_time = audio.GetTimeLen();
			int n_step = 0;
			int n_total = audio.GetQueueSize();
			funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
			p_result->snippet_time = audio.GetTimeLen();
			while (audio.Fetch(buff, len, flag) > 0) {
			string msg = recog_obj->Forward(buff, len, flag);
			p_result->msg += msg;
			@@ -115,46 +95,45 @@
			return p_result;
			}

			_FUNASRAPI FUNASR_RESULT FunASRRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback)
			{
			funasr::Model* recog_obj = (funasr::Model*)handle;
			if (!recog_obj)
			return nullptr;

			int32_t sampling_rate = -1;
			funasr::Audio audio(1);
			if(!audio.LoadWav(sz_wavfile, &sampling_rate))
			return nullptr;

			float* buff;
			int len;
			int flag = 0;
			int n_step = 0;
			int n_total = audio.GetQueueSize();
			funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
			p_result->snippet_time = audio.GetTimeLen();
			while (audio.Fetch(buff, len, flag) > 0) {
			string msg = recog_obj->Forward(buff, len, flag);
			p_result->msg+= msg;
			n_step++;
			if (fn_callback)
			fn_callback(n_step, n_total);
			}

			return p_result;
			}

			// APIs for VAD Infer
			_FUNASRAPI FUNASR_RESULT FsmnVadWavFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback)
			_FUNASRAPI FUNASR_RESULT FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
			{
			funasr::VadModel* vad_obj = (funasr::VadModel*)handle;
			if (!vad_obj)
			return nullptr;

			int32_t sampling_rate = -1;

			funasr::Audio audio(1);
			if(!audio.LoadWav(sz_wavfile, &sampling_rate))
			if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
			return nullptr;

			funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT;
			p_result->snippet_time = audio.GetTimeLen();

			vector<std::vector<int>> vad_segments;
			audio.Split(vad_obj, vad_segments);
			p_result->segments = new vector<std::vector<int>>(vad_segments);

			return p_result;
			}

			_FUNASRAPI FUNASR_RESULT FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
			{
			funasr::VadModel* vad_obj = (funasr::VadModel*)handle;
			if (!vad_obj)
			return nullptr;

			funasr::Audio audio(1);
			if(funasr::is_target_file(sz_filename, "wav")){
			int32_t sampling_rate_ = -1;
			if(!audio.LoadWav(sz_filename, &sampling_rate_))
			return nullptr;
			}else if(funasr::is_target_file(sz_filename, "pcm")){
			if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
			return nullptr;
			}else{
			LOG(ERROR)<<"Wrong wav extension";
			exit(-1);
			}

			funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT;
			p_result->snippet_time = audio.GetTimeLen();
			@@ -178,43 +157,7 @@
			}

			// APIs for Offline-stream Infer
			_FUNASRAPI FUNASR_RESULT FunOfflineRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback)
			{
			funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
			if (!offline_stream)
			return nullptr;

			int32_t sampling_rate = -1;
			funasr::Audio audio(1);
			if(!audio.LoadWav(sz_wavfile, &sampling_rate))
			return nullptr;
			if(offline_stream->UseVad()){
			audio.Split(offline_stream);
			}

			float* buff;
			int len;
			int flag = 0;
			int n_step = 0;
			int n_total = audio.GetQueueSize();
			funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
			p_result->snippet_time = audio.GetTimeLen();
			while (audio.Fetch(buff, len, flag) > 0) {
			string msg = (offline_stream->asr_handle)->Forward(buff, len, flag);
			p_result->msg+= msg;
			n_step++;
			if (fn_callback)
			fn_callback(n_step, n_total);
			}
			if(offline_stream->UsePunc()){
			string punc_res = (offline_stream->punc_handle)->AddPunc((p_result->msg).c_str());
			p_result->msg = punc_res;
			}

			return p_result;
			}

			_FUNASRAPI FUNASR_RESULT FunOfflineRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback)
			_FUNASRAPI FUNASR_RESULT FunOfflineInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
			{
			funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
			if (!offline_stream)
			@@ -249,6 +192,50 @@
			return p_result;
			}

			// Runs offline recognition on one audio file and returns the accumulated result.
			//
			// handle        : cast to funasr::OfflineStream*; a null handle yields nullptr.
			// sz_filename   : path whose extension picks the loader ("wav" -> LoadWav, "pcm" -> LoadPcmwav);
			//                 a null path yields nullptr.
			// mode          : not read by this function.
			// fn_callback   : when non-null, called as fn_callback(n_step, n_total) after each fetched segment.
			// sampling_rate : passed to LoadPcmwav only; the wav branch passes its own local initialised to -1.
			//
			// Returns a funasr::FUNASR_RECOG_RESULT allocated with new, or nullptr when the handle or path is
			// null or the loader reports failure.
			// NOTE(review): any extension other than "wav"/"pcm" logs an error and ends the process via exit(-1).
			// NOTE(review): ownership of the returned object is presumably handed to the caller - confirm which API frees it.
			_FUNASRAPI FUNASR_RESULT FunOfflineInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
			{
				funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
				// is_target_file() takes a std::string; building one from a null pointer is undefined
				// behaviour, so reject a null path up front just like a null handle.
				if (!offline_stream || !sz_filename)
					return nullptr;

				funasr::Audio audio(1);
				if(funasr::is_target_file(sz_filename, "wav")){
					// The caller-supplied sampling_rate is not used for wav input.
					int32_t sampling_rate_ = -1;
					if(!audio.LoadWav(sz_filename, &sampling_rate_))
						return nullptr;
				}else if(funasr::is_target_file(sz_filename, "pcm")){
					if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
						return nullptr;
				}else{
					LOG(ERROR)<<"Wrong wav extension";
					exit(-1);
				}
				// Optional VAD pass before decoding.
				if(offline_stream->UseVad()){
					audio.Split(offline_stream);
				}

				float* buff;
				int len;
				int flag = 0;
				int n_step = 0;
				int n_total = audio.GetQueueSize();
				funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
				p_result->snippet_time = audio.GetTimeLen();
				// Decode each fetched segment and append its text to the result.
				while (audio.Fetch(buff, len, flag) > 0) {
					string msg = (offline_stream->asr_handle)->Forward(buff, len, flag);
					p_result->msg+= msg;
					n_step++;
					if (fn_callback)
						fn_callback(n_step, n_total);
				}
				// Optional punctuation pass over the full recognised text.
				if(offline_stream->UsePunc()){
					string punc_res = (offline_stream->punc_handle)->AddPunc((p_result->msg).c_str());
					p_result->msg = punc_res;
				}

				return p_result;
			}

			_FUNASRAPI const int FunASRGetRetNumber(FUNASR_RESULT result)
			{
			if (!result)

			@@ -180,4 +180,13 @@
			}
			}

			// Returns true when the text after the last '.' in `filename` equals `target` exactly.
			// The comparison is case-sensitive; a name without any '.' never matches.
			bool is_target_file(const std::string& filename, const std::string target) {
				const std::size_t dot = filename.rfind('.');
				// Compare the suffix in place rather than materialising a substring.
				return dot != std::string::npos
					&& filename.compare(dot + 1, std::string::npos, target) == 0;
			}

			} // namespace funasr

			@@ -25,6 +25,7 @@
			extern void Glu(Tensor<float> din, Tensor<float> dout);

			string PathAppend(const string &p1, const string &p2);
			bool is_target_file(const std::string& filename, const std::string target);

			} // namespace funasr
			#endif

			@@ -1,10 +1,14 @@
			#include "precomp.h"

			namespace funasr {
			VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num)
			VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num, int mode)
			{
			VadModel *mm;
			mm = new FsmnVad();
			if(mode == FSMN_VAD_OFFLINE){
			mm = new FsmnVad();
			}else{
			LOG(ERROR)<<"Online fsmn vad not imp!";
			}

			string vad_model_path;
			string vad_cmvn_path;

			@@ -25,8 +25,8 @@
			if (!buffer.empty()) {
			// fout.write(buffer.data(), buffer.size());
			// feed data to asr engine
			FUNASR_RESULT Result = FunOfflineRecogPCMBuffer(
			asr_hanlde, buffer.data(), buffer.size(), 16000, RASR_NONE, NULL);
			FUNASR_RESULT Result = FunOfflineInferBuffer(
			asr_hanlde, buffer.data(), buffer.size(), RASR_NONE, NULL, 16000);

			std::string asr_result =
			((FUNASR_RECOG_RESULT*)Result)->msg; // get decode result

			@@ -80,6 +80,7 @@


			def time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocessed):
			punc_list = ['，', '。', '？', '、']
			res = []
			if text_postprocessed is None:
			return res
			@@ -124,34 +125,8 @@
			punc_id = int(punc_id) if punc_id is not None else 1
			sentence_end = time_stamp[1] if time_stamp is not None else sentence_end

			if punc_id == 2:
			sentence_text += ','
			res.append({
			'text': sentence_text,
			"start": sentence_start,
			"end": sentence_end,
			"text_seg": sentence_text_seg,
			"ts_list": ts_list
			})
			sentence_text = ''
			sentence_text_seg = ''
			ts_list = []
			sentence_start = sentence_end
			elif punc_id == 3:
			sentence_text += '.'
			res.append({
			'text': sentence_text,
			"start": sentence_start,
			"end": sentence_end,
			"text_seg": sentence_text_seg,
			"ts_list": ts_list
			})
			sentence_text = ''
			sentence_text_seg = ''
			ts_list = []
			sentence_start = sentence_end
			elif punc_id == 4:
			sentence_text += '?'
			if punc_id > 1:
			sentence_text += punc_list[punc_id - 2]
			res.append({
			'text': sentence_text,
			"start": sentence_start,