From 584cfbdc433cfb3d7852868db83060b6d9aa0edf Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: 星期一, 15 七月 2024 18:43:19 +0800
Subject: [PATCH] Add triton server for SenseVoice (#1901)

---
 runtime/triton_gpu/model_repo_sense_voice_small/scoring/config.pbtxt                         |   59 +++
 runtime/triton_gpu/model_repo_sense_voice_small/sensevoice/1/.gitkeep                        |    0 
 runtime/triton_gpu/docker-compose.yml                                                        |   18 
 runtime/triton_gpu/model_repo_paraformer_large_offline/encoder/config.pbtxt                  |    1 
 runtime/triton_gpu/model_repo_paraformer_large_online/encoder/config.pbtxt                   |    2 
 runtime/triton_gpu/model_repo_sense_voice_small/scoring/1/model.py                           |  136 +++++++
 runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml                |   97 +++++
 runtime/triton_gpu/model_repo_sense_voice_small/sensevoice/config.pbtxt                      |  117 ++++++
 runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/am.mvn                     |    8 
 runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/1/model.py                 |  325 +++++++++++++++++
 runtime/triton_gpu/README.md                                                                 |   98 ++--
 runtime/triton_gpu/README_paraformer_online.md                                               |    0 
 runtime/triton_gpu/Dockerfile/Dockerfile.sensevoice                                          |   22 +
 runtime/triton_gpu/model_repo_sense_voice_small/encoder/config.pbtxt                         |   71 +++
 runtime/triton_gpu/model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model |    1 
 runtime/triton_gpu/model_repo_sense_voice_small/encoder/1/.gitkeep                           |    0 
 runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.pbtxt               |   81 ++++
 runtime/triton_gpu/README_paraformer_offline.md                                              |   85 ++++
 runtime/triton_gpu/model_repo_sense_voice_small/encoder/1/model.onnx                         |    1 
 19 files changed, 1,071 insertions(+), 51 deletions(-)

diff --git a/runtime/triton_gpu/Dockerfile/Dockerfile.sensevoice b/runtime/triton_gpu/Dockerfile/Dockerfile.sensevoice
new file mode 100644
index 0000000..d20d61e
--- /dev/null
+++ b/runtime/triton_gpu/Dockerfile/Dockerfile.sensevoice
@@ -0,0 +1,22 @@
+FROM nvcr.io/nvidia/tritonserver:24.05-py3
+# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
+# Please choose previous tritonserver:xx.xx if you encounter cuda driver mismatch issue
+
+LABEL maintainer="NVIDIA"
+LABEL repository="tritonserver"
+
+RUN pip install --no-cache-dir torch
+RUN apt-get update && apt-get -y install cmake && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+RUN pip install --no-cache-dir -U "huggingface_hub[cli]" "tritonclient[all]" soundfile pyyaml torchaudio sentencepiece
+
+ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0"
+RUN git clone https://github.com/csukuangfj/kaldifeat && \
+    cd kaldifeat && \
+    sed -i 's/in running_cuda_version//g' get_version.py && \
+    python3 setup.py install && \
+    cd -
+# Download and prune in ONE layer (a later `rm` cannot shrink the image); -f tolerates .huggingface or .cache being absent.
+RUN huggingface-cli download --local-dir ./model_repo_sense_voice_small yuekai/model_repo_sense_voice_small && \
+    rm -rf ./model_repo_sense_voice_small/.huggingface ./model_repo_sense_voice_small/.cache
\ No newline at end of file
diff --git a/runtime/triton_gpu/README.md b/runtime/triton_gpu/README.md
index 48e889c..36bb8f6 100644
--- a/runtime/triton_gpu/README.md
+++ b/runtime/triton_gpu/README.md
@@ -1,85 +1,81 @@
-## Inference with Triton 
+## Triton Inference Serving Best Practice for SenseVoice
 
-### Steps:
-1. Prepare model repo files
+### Quick Start
+Directly launch the service using docker compose.
 ```sh
-git-lfs install
-git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git
-
-pretrained_model_dir=$(pwd)/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
-
-cp $pretrained_model_dir/am.mvn ./model_repo_paraformer_large_offline/feature_extractor/
-cp $pretrained_model_dir/config.yaml ./model_repo_paraformer_large_offline/feature_extractor/
-
-# Refer here to get model.onnx (https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/export/README.md)
-cp <exported_onnx_dir>/model.onnx ./model_repo_paraformer_large_offline/encoder/1/
+docker compose up --build
 ```
+
+### Build Image
+Build the docker image from scratch. 
+```sh
+# build from scratch; run this from runtime/triton_gpu (the parent of the Dockerfile/ directory)
+docker build . -f Dockerfile/Dockerfile.sensevoice -t soar97/triton-sensevoice:24.05
+```
+
+### Create Docker Container
+```sh
+your_mount_dir=/mnt:/mnt
+docker run -it --name "sensevoice-server" --gpus all --net host -v $your_mount_dir --shm-size=2g soar97/triton-sensevoice:24.05
+```
+
+### Export SenseVoice Model to Onnx
+Please follow the official FunASR guide to export the SenseVoice ONNX file. You also need to download the tokenizer file (`chn_jpn_yue_eng_ko_spectok.bpe.model`) yourself.
+### Launch Server
 Log of directory tree:
 ```sh
-model_repo_paraformer_large_offline/
+model_repo_sense_voice_small
 |-- encoder
 |   |-- 1
-|   |   `-- model.onnx
+|   |   `-- model.onnx -> /your/path/model.onnx
 |   `-- config.pbtxt
 |-- feature_extractor
 |   |-- 1
 |   |   `-- model.py
-|   |-- config.pbtxt
 |   |-- am.mvn
+|   |-- config.pbtxt
 |   `-- config.yaml
-|-- infer_pipeline
+|-- scoring
 |   |-- 1
+|   |   `-- model.py
+|   |-- chn_jpn_yue_eng_ko_spectok.bpe.model -> /your/path/chn_jpn_yue_eng_ko_spectok.bpe.model
 |   `-- config.pbtxt
-`-- scoring
+`-- sensevoice
     |-- 1
-    |   `-- model.py
     `-- config.pbtxt
 
-8 directories, 9 files
-```
+8 directories, 10 files
 
-2. Follow below instructions to launch triton server
-```sh
-# using docker image Dockerfile/Dockerfile.server
-docker build . -f Dockerfile/Dockerfile.server -t triton-paraformer:23.01 
-docker run -it --rm --name "paraformer_triton_server" --gpus all -v <path_host/model_repo_paraformer_large_offline>:/workspace/ --shm-size 1g --net host triton-paraformer:23.01 
 
 # launch the service 
-tritonserver --model-repository /workspace/model_repo_paraformer_large_offline \
+tritonserver --model-repository /workspace/model_repo_sense_voice_small \
              --pinned-memory-pool-byte-size=512000000 \
              --cuda-memory-pool-byte-size=0:1024000000
-
 ```
 
-### Performance benchmark
 
-Benchmark [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on Aishell1 test set with a single V100, the total audio duration is 36108.919 seconds.
-
+### Benchmark using Dataset
 ```sh
-# For client container:
-docker run -it --rm --name "client_test" --net host --gpus all -v <path_host/triton_gpu/client>:/workpace/ soar97/triton-k2:22.12.1 # noqa
-# For aishell manifests:
-apt-get install git-lfs
-git-lfs install
-git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
-sudo mkdir -p /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell
-tar xf ./aishell-test-dev-manifests/data_aishell.tar.gz -C /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell/ # noqa
-
-serveraddr=localhost
-manifest_path=/workspace/aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz
-num_task=60
-python3 client/decode_manifest_triton.py \
-    --server-addr $serveraddr \
+git clone https://github.com/yuekaizhang/Triton-ASR-Client.git
+cd Triton-ASR-Client
+num_task=32
+python3 client.py \
+    --server-addr localhost \
+    --server-port 10086 \
+    --model-name sensevoice \
     --compute-cer \
-    --model-name infer_pipeline \
     --num-tasks $num_task \
-    --manifest-filename $manifest_path
+    --batch-size 16 \
+    --manifest-dir ./datasets/aishell1_test
 ```
 
-(Note: The service has been fully warm up.)
-|concurrent-tasks | processing time(s) | RTF |
-|----------|--------------------|------------|
-| 60 (onnx fp32)                | 116.0 | 0.0032|
+The benchmark results below are based on the Aishell1 test set with a single V100; the total audio duration is 36108.919 seconds.
+|concurrent-tasks | batch-size-per-task | processing time(s) | RTF |
+|----------|--------------------|------------|---------------------|
+| 32 (onnx fp32)                | 16 | 67.09 | 0.0019|
+| 32 (onnx fp32)                | 1 | 82.04  | 0.0023|
+
+(Note: for batch-size-per-task=1 cases, tritonserver could use dynamic batching to improve throughput.)
 
 ## Acknowledge
 This part originates from NVIDIA CISI project. We also have TTS and NLP solutions deployed on triton inference server. If you are interested, please contact us.
diff --git a/runtime/triton_gpu/README_paraformer_offline.md b/runtime/triton_gpu/README_paraformer_offline.md
new file mode 100644
index 0000000..48e889c
--- /dev/null
+++ b/runtime/triton_gpu/README_paraformer_offline.md
@@ -0,0 +1,85 @@
+## Inference with Triton 
+
+### Steps:
+1. Prepare model repo files
+```sh
+git-lfs install
+git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git
+
+pretrained_model_dir=$(pwd)/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+
+cp $pretrained_model_dir/am.mvn ./model_repo_paraformer_large_offline/feature_extractor/
+cp $pretrained_model_dir/config.yaml ./model_repo_paraformer_large_offline/feature_extractor/
+
+# Refer here to get model.onnx (https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/export/README.md)
+cp <exported_onnx_dir>/model.onnx ./model_repo_paraformer_large_offline/encoder/1/
+```
+Log of directory tree:
+```sh
+model_repo_paraformer_large_offline/
+|-- encoder
+|   |-- 1
+|   |   `-- model.onnx
+|   `-- config.pbtxt
+|-- feature_extractor
+|   |-- 1
+|   |   `-- model.py
+|   |-- config.pbtxt
+|   |-- am.mvn
+|   `-- config.yaml
+|-- infer_pipeline
+|   |-- 1
+|   `-- config.pbtxt
+`-- scoring
+    |-- 1
+    |   `-- model.py
+    `-- config.pbtxt
+
+8 directories, 9 files
+```
+
+2. Follow below instructions to launch triton server
+```sh
+# using docker image Dockerfile/Dockerfile.server
+docker build . -f Dockerfile/Dockerfile.server -t triton-paraformer:23.01 
+docker run -it --rm --name "paraformer_triton_server" --gpus all -v <path_host/model_repo_paraformer_large_offline>:/workspace/ --shm-size 1g --net host triton-paraformer:23.01 
+
+# launch the service 
+tritonserver --model-repository /workspace/model_repo_paraformer_large_offline \
+             --pinned-memory-pool-byte-size=512000000 \
+             --cuda-memory-pool-byte-size=0:1024000000
+
+```
+
+### Performance benchmark
+
+Benchmark [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on Aishell1 test set with a single V100, the total audio duration is 36108.919 seconds.
+
+```sh
+# For client container:
+docker run -it --rm --name "client_test" --net host --gpus all -v <path_host/triton_gpu/client>:/workspace/ soar97/triton-k2:22.12.1 # noqa
+# For aishell manifests:
+apt-get install git-lfs
+git-lfs install
+git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
+sudo mkdir -p /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell
+tar xf ./aishell-test-dev-manifests/data_aishell.tar.gz -C /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell/ # noqa
+
+serveraddr=localhost
+manifest_path=/workspace/aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz
+num_task=60
+python3 client/decode_manifest_triton.py \
+    --server-addr $serveraddr \
+    --compute-cer \
+    --model-name infer_pipeline \
+    --num-tasks $num_task \
+    --manifest-filename $manifest_path
+```
+
+(Note: The service has been fully warmed up.)
+|concurrent-tasks | processing time(s) | RTF |
+|----------|--------------------|------------|
+| 60 (onnx fp32)                | 116.0 | 0.0032|
+
+## Acknowledge
+This part originates from NVIDIA CISI project. We also have TTS and NLP solutions deployed on triton inference server. If you are interested, please contact us.
diff --git a/runtime/triton_gpu/README_ONLINE.md b/runtime/triton_gpu/README_paraformer_online.md
old mode 100755
new mode 100644
similarity index 100%
rename from runtime/triton_gpu/README_ONLINE.md
rename to runtime/triton_gpu/README_paraformer_online.md
diff --git a/runtime/triton_gpu/docker-compose.yml b/runtime/triton_gpu/docker-compose.yml
new file mode 100644
index 0000000..4d35b88
--- /dev/null
+++ b/runtime/triton_gpu/docker-compose.yml
@@ -0,0 +1,18 @@
+services:
+  asr:
+    image: soar97/triton-sensevoice:24.05
+    ports:
+      - "10085:8000"  # host 10085 -> container 8000 (Triton's default HTTP port; assumes run.sh keeps the defaults)
+      - "10086:8001"  # host 10086 -> container 8001 (Triton's default gRPC port; assumes run.sh keeps the defaults)
+      - "10087:8002"  # host 10087 -> container 8002 (Triton's default metrics port; assumes run.sh keeps the defaults)
+    environment:
+      - PYTHONIOENCODING=utf-8
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ['0']  # reserve only GPU 0 for this service
+              capabilities: [gpu]
+    command: >
+      /bin/bash -c "cd ./model_repo_sense_voice_small && bash run.sh"
\ No newline at end of file
diff --git a/runtime/triton_gpu/model_repo_paraformer_large_offline/encoder/config.pbtxt b/runtime/triton_gpu/model_repo_paraformer_large_offline/encoder/config.pbtxt
index 3cb657e..9fa7e62 100644
--- a/runtime/triton_gpu/model_repo_paraformer_large_offline/encoder/config.pbtxt
+++ b/runtime/triton_gpu/model_repo_paraformer_large_offline/encoder/config.pbtxt
@@ -51,6 +51,7 @@
     max_queue_delay_microseconds: 500
   }
 
+parameters { key: "cudnn_conv_algo_search" value: { string_value: "2" } }
 
 instance_group [
     {
diff --git a/runtime/triton_gpu/model_repo_paraformer_large_online/encoder/config.pbtxt b/runtime/triton_gpu/model_repo_paraformer_large_online/encoder/config.pbtxt
index 3e54df1..2d13c10 100755
--- a/runtime/triton_gpu/model_repo_paraformer_large_online/encoder/config.pbtxt
+++ b/runtime/triton_gpu/model_repo_paraformer_large_online/encoder/config.pbtxt
@@ -69,6 +69,8 @@
   }
 ]
 
+parameters { key: "cudnn_conv_algo_search" value: { string_value: "2" } }
+
 instance_group [
     {
       count: 1
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/encoder/1/.gitkeep b/runtime/triton_gpu/model_repo_sense_voice_small/encoder/1/.gitkeep
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/encoder/1/.gitkeep
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/encoder/1/model.onnx b/runtime/triton_gpu/model_repo_sense_voice_small/encoder/1/model.onnx
new file mode 120000
index 0000000..a96cd8e
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/encoder/1/model.onnx
@@ -0,0 +1 @@
+/mnt/samsung-t7/yuekai/asr/funaudiollm/SenseVoice/model.onnx
\ No newline at end of file
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/encoder/config.pbtxt b/runtime/triton_gpu/model_repo_sense_voice_small/encoder/config.pbtxt
new file mode 100644
index 0000000..af95732
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/encoder/config.pbtxt
@@ -0,0 +1,71 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "encoder"
+backend: "onnxruntime"
+default_model_filename: "model.onnx"
+
+max_batch_size: 16
+
+input [
+  {
+    name: "speech"
+    data_type: TYPE_FP32
+    dims: [-1, 560]
+  },
+  {
+    name: "speech_lengths"
+    data_type: TYPE_INT32
+    dims: [1]
+    reshape: { shape: [ ] }
+  },
+  {
+    name: "language"
+    data_type: TYPE_INT32
+    dims: [1]
+    reshape: { shape: [ ] }
+  },
+  {
+    name: "textnorm"
+    data_type: TYPE_INT32
+    dims: [1]
+    reshape: { shape: [ ] }
+  }
+]
+
+output [
+  {
+    name: "ctc_logits"
+    data_type: TYPE_FP32
+    dims: [-1, 25055] 
+  },
+  {
+    name: "encoder_out_lens"
+    data_type: TYPE_INT32
+    dims: [1]
+    reshape: { shape: [ ] }
+  }
+]
+
+dynamic_batching {
+  }
+parameters { key: "cudnn_conv_algo_search" value: { string_value: "2" } }
+
+instance_group [
+    {
+      count: 1
+      kind: KIND_GPU
+    }
+]
+
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/1/model.py b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/1/model.py
new file mode 100644
index 0000000..21c2d72
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/1/model.py
@@ -0,0 +1,325 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import triton_python_backend_utils as pb_utils
+from torch.utils.dlpack import to_dlpack
+import torch
+import numpy as np
+import kaldifeat
+import _kaldifeat
+from typing import List
+import json
+import yaml
+from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
+
+
+class LFR(torch.nn.Module):
+    """Batch LFR (frame stacking + skipping): https://github.com/Mddct/devil-asr/blob/main/patch/lfr.py"""
+
+    def __init__(self, m: int = 7, n: int = 6) -> None:
+        """
+        Implements frame stacking (window of m frames) and frame skipping (step of n frames).
+        If m = 1 and n = 1, the original features are returned.
+        If m = 1 and n > 1, it works like skipping.
+        If m > 1 and n = 1, it works like stacking but only supports right frames.
+        If m > 1 and n > 1, it works like LFR.
+        """
+        super().__init__()
+
+        self.m = m
+        self.n = n
+
+        self.left_padding_nums = math.ceil((self.m - 1) // 2)  # `//` already yields an int, so ceil() is a no-op
+
+    def forward(
+        self, input_tensor: torch.Tensor, input_lens: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        B, _, D = input_tensor.size()  # rank-3 input; dim 1 is the frame axis that gets padded and unfolded
+        n_lfr = torch.ceil(input_lens / self.n)  # per-sequence ceil(len / n)
+
+        prepad_nums = input_lens + self.left_padding_nums
+
+        right_padding_nums = torch.where(
+            self.m >= (prepad_nums - self.n * (n_lfr - 1)),
+            self.m - (prepad_nums - self.n * (n_lfr - 1)),
+            0,
+        )
+
+        T_all = self.left_padding_nums + input_lens + right_padding_nums
+
+        new_len = T_all // self.n  # per-sequence length returned alongside the stacked features
+
+        T_all_max = T_all.max().int()
+
+        tail_frames_index = (input_lens - 1).view(B, 1, 1).repeat(1, 1, D)  # [B,1,D]
+
+        tail_frames = torch.gather(input_tensor, 1, tail_frames_index)  # frame at index len - 1 of each sequence
+        tail_frames = tail_frames.repeat(1, right_padding_nums.max().int(), 1)
+        head_frames = input_tensor[:, 0:1, :].repeat(1, self.left_padding_nums, 1)  # first frame repeated as left padding
+
+        # concatenate left padding, the original frames and right padding along the frame axis
+        input_tensor = torch.cat([head_frames, input_tensor, tail_frames], dim=1)
+
+        index = (
+            torch.arange(T_all_max, device=input_tensor.device, dtype=input_lens.dtype)
+            .unsqueeze(0)
+            .repeat(B, 1)
+        )  # [B, T_all_max]
+        index_mask = index < (self.left_padding_nums + input_lens).unsqueeze(1)  # [B, T_all_max]
+
+        tail_index_mask = torch.logical_not(index >= (T_all.unsqueeze(1))) & index_mask
+        tail = torch.ones(T_all_max, dtype=input_lens.dtype, device=input_tensor.device).unsqueeze(
+            0
+        ).repeat(B, 1) * (
+            T_all_max - 1
+        )  # [B, T_all_max]
+        indices = torch.where(torch.logical_or(index_mask, tail_index_mask), index, tail)
+        input_tensor = torch.gather(input_tensor, 1, indices.unsqueeze(2).repeat(1, 1, D))
+
+        input_tensor = input_tensor.unfold(1, self.m, step=self.n).transpose(2, 3)  # windows of m frames, taken every n frames
+
+        return input_tensor.reshape(B, -1, D * self.m), new_len
+
+
+class WavFrontend:
+    """Conventional frontend structure for ASR."""
+
+    def __init__(
+        self,
+        cmvn_file: str = None,
+        fs: int = 16000,
+        window: str = "hamming",
+        n_mels: int = 80,
+        frame_length: int = 25,
+        frame_shift: int = 10,
+        filter_length_min: int = -1,
+        filter_length_max: float = -1,
+        lfr_m: int = 7,
+        lfr_n: int = 6,
+        dither: float = 1.0,
+    ) -> None:
+        """Store the frontend settings, build the LFR module and load the CMVN stats when a file is given."""
+        self.fs = fs
+        self.window = window
+        self.n_mels = n_mels
+        self.frame_length = frame_length
+        self.frame_shift = frame_shift
+        self.filter_length_min = filter_length_min
+        self.filter_length_max = filter_length_max
+        self.lfr_m = lfr_m
+        self.lfr_n = lfr_n
+        self.lfr = LFR(lfr_m, lfr_n)
+        self.cmvn_file = cmvn_file
+        self.dither = dither
+
+        if self.cmvn_file:
+            self.cmvn = self.load_cmvn()
+
+    def apply_cmvn_batch(self, inputs: torch.Tensor) -> torch.Tensor:
+        """
+        Apply CMVN with mvn data: (inputs + shift) * scale on a (batch, frame, dim) torch tensor.
+        """
+        batch, frame, dim = inputs.shape
+        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
+        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
+
+        means = torch.from_numpy(means).to(inputs.device)
+        vars = torch.from_numpy(vars).to(inputs.device)
+
+        inputs = (inputs + means) * vars
+        return inputs
+
+    def load_cmvn(
+        self,
+    ) -> np.ndarray:
+        """
+        Parse the <AddShift> / <Rescale> vectors of self.cmvn_file; returns np.array([shift, scale]) as float64.
+        """
+        with open(self.cmvn_file, "r", encoding="utf-8") as f:
+            # Tokenise every line up front; blank lines become [] and are skipped below instead of raising.
+            rows = [line.split() for line in f]
+
+        means_list = []
+        vars_list = []
+        for i, row in enumerate(rows[:-1]):  # a tag on the very last line has no payload line to read
+            nxt = rows[i + 1]
+            if not row or not nxt or nxt[0] != "<LearnRateCoef>":
+                continue
+            # The payload line is "<LearnRateCoef> 0 [ v1 v2 ...": drop the 3 leading tokens and the last token.
+            values = nxt[3:-1]
+            if row[0] == "<AddShift>":
+                means_list = values
+            elif row[0] == "<Rescale>":
+                vars_list = values
+
+        means = np.array(means_list).astype(np.float64)
+        vars = np.array(vars_list).astype(np.float64)
+        cmvn = np.array([means, vars])
+        return cmvn
+
+
+class Fbank(torch.nn.Module):
+    def __init__(self, opts):
+        super().__init__()  # zero-argument form of super(Fbank, self)
+        self.fbank = kaldifeat.Fbank(opts)  # the wrapped kaldifeat extractor does the actual work
+
+    def forward(self, waves: List[torch.Tensor]):
+        return self.fbank(waves)  # hand the list of waveforms to kaldifeat unchanged
+
+
+class TritonPythonModel:
+    """Your Python model must use the same class name. Every Python model
+    that is created must have "TritonPythonModel" as the class name.
+    """
+
+    def initialize(self, args):
+        """`initialize` is called only once when the model is being loaded.
+        Implementing `initialize` function is optional. This function allows
+        the model to initialize any state associated with this model.
+
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing model instance kind
+          * model_instance_device_id: A string containing model instance device ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+        self.model_config = model_config = json.loads(args["model_config"])
+        self.max_batch_size = max(model_config["max_batch_size"], 1)
+        self.device = "cuda"
+
+        # Get OUTPUT0 configuration
+        output0_config = pb_utils.get_output_config_by_name(model_config, "speech")
+        # Convert Triton types to numpy types
+        output0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])
+
+        if output0_dtype == np.float32:
+            self.output0_dtype = torch.float32
+        else:
+            self.output0_dtype = torch.float16
+
+        # Get OUTPUT1 configuration
+        output1_config = pb_utils.get_output_config_by_name(model_config, "speech_lengths")
+        # Convert Triton types to numpy types
+        self.output1_dtype = pb_utils.triton_string_to_numpy(output1_config["data_type"])
+
+        params = self.model_config["parameters"]
+        config, cmvn_path = None, None
+        for key, value in params.items():
+            value = str(value["string_value"])
+            if key == "config_path":
+                with open(value, "rb") as f:
+                    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only load trusted configs.
+                    config = yaml.load(f, Loader=yaml.Loader)
+            elif key == "cmvn_path":
+                cmvn_path = value
+        if config is None or cmvn_path is None:  # fail with a clear message instead of an UnboundLocalError
+            raise pb_utils.TritonModelException("feature_extractor needs both 'config_path' and 'cmvn_path' parameters")
+        config["frontend_conf"]["cmvn_file"] = cmvn_path
+
+        opts = kaldifeat.FbankOptions()
+        opts.frame_opts.dither = 1.0  # TODO: 0.0 or 1.0
+        opts.frame_opts.window_type = config["frontend_conf"]["window"]
+        opts.mel_opts.num_bins = int(config["frontend_conf"]["n_mels"])
+        opts.frame_opts.frame_shift_ms = float(config["frontend_conf"]["frame_shift"])
+        opts.frame_opts.frame_length_ms = float(config["frontend_conf"]["frame_length"])
+        opts.frame_opts.samp_freq = int(config["frontend_conf"]["fs"])
+        opts.device = torch.device(self.device)
+        self.opts = opts
+        self.feature_extractor = Fbank(self.opts)
+        self.feature_size = opts.mel_opts.num_bins
+
+        self.frontend = WavFrontend(**config["frontend_conf"])
+
+    def extract_feat(self, waveform_list: List[np.ndarray]) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Fbank -> zero-padded batch -> LFR -> CMVN; returns (features, int32 lengths) as torch tensors on self.device."""
+        wavs = []
+        for waveform in waveform_list:
+            wav = torch.from_numpy(waveform).float().squeeze().to(self.device)
+            wavs.append(wav)
+
+        features = self.feature_extractor(wavs)
+        features_len = [feature.shape[0] for feature in features]
+        speech = torch.zeros(
+            (len(features), max(features_len), self.opts.mel_opts.num_bins),
+            dtype=self.output0_dtype,
+            device=self.device,
+        )
+        for i, feature in enumerate(features):
+            speech[i, : int(features_len[i])] = feature
+        speech_lens = torch.tensor(features_len, dtype=torch.int64).to(self.device)
+
+        feats, feats_len = self.frontend.lfr(speech, speech_lens)
+        feats_len = feats_len.type(torch.int32)
+
+        feats = self.frontend.apply_cmvn_batch(feats)
+        feats = feats.type(self.output0_dtype)
+
+        return feats, feats_len
+
+    def execute(self, requests):
+        """`execute` must be implemented in every Python model. `execute`
+        function receives a list of pb_utils.InferenceRequest as the only
+        argument. This function is called when an inference is requested
+        for this model.
+
+        Parameters
+        ----------
+        requests : list
+          A list of pb_utils.InferenceRequest
+
+        Returns
+        -------
+        list
+          A list of pb_utils.InferenceResponse. The length of this list must
+          be the same as `requests`
+        """
+        batch_count = []
+        total_waves = []
+        responses = []
+        for request in requests:
+            input0 = pb_utils.get_input_tensor_by_name(request, "wav")
+            input1 = pb_utils.get_input_tensor_by_name(request, "wav_lens")  # only needed by the disabled trimming below
+
+            cur_b_wav = input0.as_numpy() * (1 << 15)  # b x -1
+            # remove paddings, however, encoder may can't batch requests since different lengths.
+            # cur_b_wav = cur_b_wav[:, : int(input1.as_numpy()[0])]
+            batch_count.append(cur_b_wav.shape[0])
+
+            # convert the bx-1 numpy array into a 1x-1 list of arrays
+            cur_b_wav_list = [np.expand_dims(cur_b_wav[i], 0) for i in range(cur_b_wav.shape[0])]
+            total_waves.extend(cur_b_wav_list)
+
+        features, feats_len = self.extract_feat(total_waves)
+
+        i = 0
+        for batch in batch_count:
+            speech = features[i : i + batch]
+            speech_lengths = feats_len[i : i + batch].unsqueeze(1)
+
+            speech, speech_lengths = speech.cpu(), speech_lengths.cpu()
+
+            out0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech))
+            out1 = pb_utils.Tensor.from_dlpack("speech_lengths", to_dlpack(speech_lengths))
+            inference_response = pb_utils.InferenceResponse(output_tensors=[out0, out1])
+            responses.append(inference_response)
+            i += batch
+
+        return responses
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/am.mvn b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/am.mvn
new file mode 100644
index 0000000..681910c
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/am.mvn
@@ -0,0 +1,8 @@
+<Nnet> 
+<Splice> 560 560
+[ 0 ]
+<AddShift> 560 560 
+<LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 
-8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
+<Rescale> 560 560
+<LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 
0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
+</Nnet> 
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.pbtxt b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.pbtxt
new file mode 100644
index 0000000..7f2d789
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.pbtxt
@@ -0,0 +1,81 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "feature_extractor"
+backend: "python"
+max_batch_size: 16
+
+parameters [
+  {
+    key: "num_mel_bins",
+    value: { string_value: "80"}
+  },
+  {
+    key: "frame_shift_in_ms"
+    value: { string_value: "10"}
+  },
+  {
+    key: "frame_length_in_ms"
+    value: { string_value: "25"}
+  },
+  {
+    key: "sample_rate"
+    value: { string_value: "16000"}
+  },
+  {
+    key: "cmvn_path"
+    value: { string_value: "./model_repo_sense_voice_small/feature_extractor/am.mvn"}
+  },
+  {
+    key: "config_path"
+    value: { string_value: "./model_repo_sense_voice_small/feature_extractor/config.yaml"}
+  }
+
+]
+
+input [
+  {
+    name: "wav"
+    data_type: TYPE_FP32
+    dims: [-1]
+  },
+  {
+    name: "wav_lens"
+    data_type: TYPE_INT32
+    dims: [1]
+  }
+]
+
+output [
+  {
+    name: "speech"
+    data_type: TYPE_FP32
+    dims: [-1, 560]  # 560 = 80 mel bins x 7 stacked LFR frames (lfr_m)
+  },
+  {
+    name: "speech_lengths"
+    data_type: TYPE_INT32
+    dims: [1]
+  }
+]
+
+dynamic_batching {
+  }
+
+instance_group [
+    {
+      count: 2
+      kind: KIND_GPU
+    }
+]
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml
new file mode 100644
index 0000000..26bb9d3
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/feature_extractor/config.yaml
@@ -0,0 +1,97 @@
+encoder: SenseVoiceEncoderSmall
+encoder_conf:
+    output_size: 512
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 50
+    tp_blocks: 20
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: pe
+    pos_enc_class: SinusoidalPositionEncoder
+    normalize_before: true
+    kernel_size: 11
+    sanm_shfit: 0
+    selfattention_layer_type: sanm
+
+
+model: SenseVoiceSmall
+model_conf:
+    length_normalized_loss: true
+    sos: 1
+    eos: 2
+    ignore_id: -1
+
+tokenizer: SentencepiecesTokenizer
+tokenizer_conf:
+  bpemodel: null
+  unk_symbol: <unk>
+  split_with_space: true
+
+frontend: WavFrontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 7
+    lfr_n: 6
+    cmvn_file: null
+
+
+dataset: SenseVoiceCTCDataset
+dataset_conf:
+  index_ds: IndexDSJsonl
+  batch_sampler: EspnetStyleBatchSampler
+  data_split_num: 32
+  batch_type: token
+  batch_size: 14000
+  max_token_length: 2000
+  min_token_length: 60
+  max_source_length: 2000
+  min_source_length: 60
+  max_target_length: 200
+  min_target_length: 0
+  shuffle: true
+  num_workers: 4
+  sos: ${model_conf.sos}
+  eos: ${model_conf.eos}
+  IndexDSJsonl: IndexDSJsonl
+  retry: 20
+
+train_conf:
+  accum_grad: 1
+  grad_clip: 5
+  max_epoch: 20
+  keep_nbest_models: 10
+  avg_nbest_model: 10
+  log_interval: 100
+  resume: true
+  validate_interval: 10000
+  save_checkpoint_interval: 10000
+
+optim: adamw
+optim_conf:
+  lr: 0.00002
+scheduler: warmuplr
+scheduler_conf:
+  warmup_steps: 25000
+
+specaug: SpecAugLFR
+specaug_conf:
+    apply_time_warp: false
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    lfr_rate: 6
+    num_freq_mask: 1
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 12
+    num_time_mask: 1
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/scoring/1/model.py b/runtime/triton_gpu/model_repo_sense_voice_small/scoring/1/model.py
new file mode 100644
index 0000000..c60c8f6
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/scoring/1/model.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import triton_python_backend_utils as pb_utils
+import numpy as np
+import torch
+from torch.utils.dlpack import from_dlpack
+
+import json
+import os
+import yaml
+
+import sentencepiece as spm
+
+
+class TritonPythonModel:
+    """Triton Python-backend model that turns CTC logits into transcripts.
+
+    Triton requires every Python model to expose a class with exactly this
+    name. Decoding is greedy: per-frame argmax, collapse of repeated ids,
+    removal of the blank and EOS ids, then SentencePiece detokenization.
+    """
+
+    def initialize(self, args):
+        """Parse the model configuration and load the tokenizer.
+
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing model instance kind
+          * model_instance_device_id: A string containing model instance device ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+        self.model_config = model_config = json.loads(args["model_config"])
+        self.max_batch_size = max(model_config["max_batch_size"], 1)
+
+        # Resolve the numpy dtype that the "OUTPUT0" tensor is emitted with.
+        output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0")
+        self.out0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])
+
+        self.init_tokenizer(self.model_config["parameters"])
+
+    def init_tokenizer(self, parameters):
+        """Create `self.tokenizer` from the "tokenizer_path" model parameter.
+
+        Parameters
+        ----------
+        parameters : dict
+          The "parameters" section of the model configuration; each value is
+          a dict holding the actual setting under "string_value".
+
+        Raises
+        ------
+        ValueError
+          If "tokenizer_path" is missing, so the problem surfaces at load
+          time rather than as an AttributeError on the first request.
+        """
+        self.tokenizer = None
+        for key, value in parameters.items():
+            if key == "tokenizer_path":
+                self.tokenizer = spm.SentencePieceProcessor()
+                self.tokenizer.Load(value["string_value"])
+        if self.tokenizer is None:
+            raise ValueError('model parameter "tokenizer_path" is required')
+
+    @staticmethod
+    def _greedy_ctc_token_ids(logits_batch, skip_ids=(0, 2)):
+        """Greedy CTC decoding of a batch of logits into lists of token ids.
+
+        Parameters
+        ----------
+        logits_batch : torch.Tensor
+          Logits whose first axis indexes the sequences of the batch and
+          whose last axis is the vocabulary.
+        skip_ids : tuple of int
+          Ids dropped after repeats are collapsed (blank_id and EOS).
+
+        Returns
+        -------
+        list
+          One list of token ids per sequence.
+        """
+        best_paths = logits_batch.argmax(dim=-1)
+        # Collapse repeats one sequence at a time. Passing the whole 2-D batch
+        # with `dim=-1` compares entire columns, so a repeated id would only be
+        # merged when every sequence in the batch repeats in the same frame.
+        return [
+            [tok for tok in torch.unique_consecutive(path).tolist() if tok not in skip_ids]
+            for path in best_paths
+        ]
+
+    def execute(self, requests):
+        """Decode the "ctc_logits" input of every request into text.
+
+        Parameters
+        ----------
+        requests : list
+          A list of pb_utils.InferenceRequest
+
+        Returns
+        -------
+        list
+          A list of pb_utils.InferenceResponse. The length of this list must
+          be the same as `requests`
+        """
+        logits_list, batch_count = [], []
+        for request in requests:
+            in_0 = pb_utils.get_input_tensor_by_name(request, "ctc_logits")
+            logits = from_dlpack(in_0.to_dlpack())
+            logits_list.append(logits)
+            batch_count.append(logits.shape[0])
+
+        # Decode every request in one pass over the concatenated batch.
+        # NOTE(review): "encoder_out_lens" is not read here, so padded frames
+        # (if any) are decoded too - confirm whether truncation is needed.
+        token_int_batch = self._greedy_ctc_token_ids(torch.cat(logits_list, dim=0))
+        hyps = [self.tokenizer.DecodeIds(token_int) for token_int in token_int_batch]
+
+        # Hand each request back exactly the rows it contributed.
+        responses = []
+        i = 0
+        for batch in batch_count:
+            sents = np.array(hyps[i : i + batch])
+            out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype))
+            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))
+            i += batch
+
+        return responses
+
+    def finalize(self):
+        """Called once when the model is unloaded; there is nothing to release."""
+        print("Cleaning up...")
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model b/runtime/triton_gpu/model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model
new file mode 120000
index 0000000..05c78e6
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model
@@ -0,0 +1 @@
+/mnt/samsung-t7/yuekai/asr/funaudiollm/SenseVoiceSmall/chn_jpn_yue_eng_ko_spectok.bpe.model
\ No newline at end of file
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/scoring/config.pbtxt b/runtime/triton_gpu/model_repo_sense_voice_small/scoring/config.pbtxt
new file mode 100644
index 0000000..99e0a5e
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/scoring/config.pbtxt
@@ -0,0 +1,59 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "scoring"
+backend: "python"
+max_batch_size: 16
+
+parameters [
+  {
+    key: "tokenizer_path",
+    value: { string_value: "./model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model"}
+  },
+  { key: "FORCE_CPU_ONLY_INPUT_TENSORS" 
+    value: {string_value:"no"}
+  }
+]
+
+
+input [
+  {
+    name: "ctc_logits"
+    data_type: TYPE_FP32
+    dims: [-1, 25055]
+  },
+  {
+    name: "encoder_out_lens"
+    data_type: TYPE_INT32
+    dims: [1]
+    reshape: { shape: [ ] }
+  }
+]
+
+output [
+  {
+    name: "OUTPUT0"
+    data_type: TYPE_STRING
+    dims: [1]
+  }
+]
+
+dynamic_batching {
+  }
+instance_group [
+    {
+      count: 2
+      kind: KIND_CPU
+    }
+  ]
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/sensevoice/1/.gitkeep b/runtime/triton_gpu/model_repo_sense_voice_small/sensevoice/1/.gitkeep
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/sensevoice/1/.gitkeep
diff --git a/runtime/triton_gpu/model_repo_sense_voice_small/sensevoice/config.pbtxt b/runtime/triton_gpu/model_repo_sense_voice_small/sensevoice/config.pbtxt
new file mode 100644
index 0000000..5be03cf
--- /dev/null
+++ b/runtime/triton_gpu/model_repo_sense_voice_small/sensevoice/config.pbtxt
@@ -0,0 +1,117 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "sensevoice"
+platform: "ensemble"
+max_batch_size: 16
+
+input [
+  {
+    name: "WAV"
+    data_type: TYPE_FP32
+    dims: [-1]
+  },
+  {
+    name: "WAV_LENS"
+    data_type: TYPE_INT32
+    dims: [1]
+  },
+  {
+    name: "LANGUAGE"
+    data_type: TYPE_INT32
+    dims: [1]
+  },
+  {
+    name: "TEXT_NORM"
+    data_type: TYPE_INT32
+    dims: [1]
+  }
+]
+
+output [
+  {
+    name: "TRANSCRIPTS"
+    data_type: TYPE_STRING
+    dims: [1]
+  }
+]
+
+ensemble_scheduling {
+ step [
+   {
+    model_name: "feature_extractor"
+    model_version: -1
+    input_map {
+      key: "wav"
+      value: "WAV"
+    }
+    input_map {
+      key: "wav_lens"
+      value: "WAV_LENS"
+    }
+    output_map {
+      key: "speech"
+      value: "SPEECH"
+    }
+    output_map {
+      key: "speech_lengths"
+      value: "SPEECH_LENGTHS"
+    }
+   },
+   {
+    model_name: "encoder"
+    model_version: -1
+    input_map {
+      key: "speech"
+      value: "SPEECH"
+    }
+    input_map {
+      key: "speech_lengths"
+      value: "SPEECH_LENGTHS"
+    }
+    input_map {
+      key: "language"
+      value: "LANGUAGE"
+    }
+    input_map {
+      key: "textnorm"
+      value: "TEXT_NORM"
+    }
+    output_map {
+      key: "ctc_logits"
+      value: "ctc_logits"
+    }
+    output_map {
+      key: "encoder_out_lens"
+      value: "encoder_out_lens"
+    }
+  },
+  {
+      model_name: "scoring"
+      model_version: -1
+      input_map {
+          key: "ctc_logits"
+          value: "ctc_logits"
+      }
+      input_map {
+          key: "encoder_out_lens"
+          value: "encoder_out_lens"
+      }
+      output_map {
+          key: "OUTPUT0"
+          value: "TRANSCRIPTS"
+      }
+  }
+ ]
+}

--
Gitblit v1.9.1