From 7edad6fba36a7527c1857a38b77a0277e8fde582 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期二, 29 十月 2024 15:11:54 +0800
Subject: [PATCH] Merge branch 'main' of github.com:alibaba-damo-academy/FunASR merge

---
 runtime/onnxruntime/include/funasrruntime.h                    |    3 
 funasr/models/fsmn_vad_streaming/model.py                      |   46 ++--
 runtime/onnxruntime/include/tpass-stream.h                     |    2 
 runtime/onnxruntime/src/paraformer.h                           |    3 
 runtime/readme_cn.md                                           |    1 
 runtime/docs/SDK_advanced_guide_online.md                      |    5 
 runtime/onnxruntime/src/sensevoice-small.h                     |   12 
 runtime/websocket/bin/websocket-server-2pass.h                 |    4 
 runtime/websocket/bin/websocket-server-2pass.cpp               |   26 ++
 README_zh.md                                                   |    1 
 runtime/onnxruntime/include/model.h                            |    4 
 runtime/onnxruntime/src/paraformer-online.h                    |   23 +
 runtime/onnxruntime/src/tpass-online-stream.cpp                |    2 
 runtime/readme.md                                              |    3 
 README.md                                                      |    1 
 runtime/onnxruntime/src/paraformer.cpp                         |   27 --
 runtime/websocket/bin/funasr-wss-client-2pass.cpp              |   33 ++-
 runtime/onnxruntime/src/sensevoice-small.cpp                   |  166 ++++++++++++++++
 runtime/onnxruntime/src/paraformer-online.cpp                  |  106 +++++++--
 runtime/docs/SDK_advanced_guide_online_zh.md                   |    9 
 runtime/onnxruntime/src/tpass-stream.cpp                       |   12 +
 runtime/onnxruntime/src/funasrruntime.cpp                      |   49 +++-
 fun_text_processing/inverse_text_normalization/run_evaluate.py |   33 +-
 runtime/websocket/bin/funasr-wss-server-2pass.cpp              |    6 
 24 files changed, 426 insertions(+), 151 deletions(-)

diff --git a/README.md b/README.md
index 5eb5d77..ee23086 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@
 
 <a name="whats-new"></a>
 ## What's new:
+- 2024/10/29: Real-time Transcription Service 1.12 released，The 2pass-offline mode supports the SensevoiceSmall model；([docs](runtime/readme.md));
 - 2024/10/10锛欰dded support for the Whisper-large-v3-turbo model, a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. It can be downloaded from the [modelscope](examples/industrial_data_pretraining/whisper/demo.py), and [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py).
 - 2024/09/26: Offline File Transcription Service 4.6, Offline File Transcription Service of English 1.7锛孯eal-time Transcription Service 1.11 released锛宖ix memory leak & Support the SensevoiceSmall onnx model锛汧ile Transcription Service 2.0 GPU released, Fix GPU memory leak; ([docs](runtime/readme.md));
 - 2024/09/25锛歬eyword spotting models are new supported. Supports fine-tuning and inference for four models: [fsmn_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [fsmn_kws_mt](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [sanm_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-offline), [sanm_kws_streaming](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online).
diff --git a/README_zh.md b/README_zh.md
index 5ae1169..b90f0e4 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -33,6 +33,7 @@
 
 <a name="鏈�鏂板姩鎬�"></a>
 ## 鏈�鏂板姩鎬�
+- 2024/10/29: 涓枃瀹炴椂璇煶鍚啓鏈嶅姟 1.12 鍙戝竷锛�2pass-offline妯″紡鏀寔SensevoiceSmall妯″瀷锛涜缁嗕俊鎭弬闃�([閮ㄧ讲鏂囨。](runtime/readme_cn.md))
 - 2024/10/10锛氭柊澧炲姞Whisper-large-v3-turbo妯″瀷鏀寔锛屽璇█璇煶璇嗗埆/缈昏瘧/璇璇嗗埆锛屾敮鎸佷粠 [modelscope](examples/industrial_data_pretraining/whisper/demo.py)浠撳簱涓嬭浇锛屼篃鏀寔浠� [openai](examples/industrial_data_pretraining/whisper/demo_from_openai.py)浠撳簱涓嬭浇妯″瀷銆�
 - 2024/09/26: 涓枃绂荤嚎鏂囦欢杞啓鏈嶅姟 4.6銆佽嫳鏂囩绾挎枃浠惰浆鍐欐湇鍔� 1.7銆佷腑鏂囧疄鏃惰闊冲惉鍐欐湇鍔� 1.11 鍙戝竷锛屼慨澶峅NNX鍐呭瓨娉勬紡銆佹敮鎸丼ensevoiceSmall onnx妯″瀷锛涗腑鏂囩绾挎枃浠惰浆鍐欐湇鍔PU 2.0 鍙戝竷锛屼慨澶嶆樉瀛樻硠婕�; 璇︾粏淇℃伅鍙傞槄([閮ㄧ讲鏂囨。](runtime/readme_cn.md))
 - 2024/09/25锛氭柊澧炶闊冲敜閱掓ā鍨嬶紝鏀寔[fsmn_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [fsmn_kws_mt](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online), [sanm_kws](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-offline), [sanm_kws_streaming](https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online) 4涓ā鍨嬬殑寰皟鍜屾帹鐞嗐��
diff --git a/fun_text_processing/inverse_text_normalization/run_evaluate.py b/fun_text_processing/inverse_text_normalization/run_evaluate.py
index 76e6e3c..bea92fa 100644
--- a/fun_text_processing/inverse_text_normalization/run_evaluate.py
+++ b/fun_text_processing/inverse_text_normalization/run_evaluate.py
@@ -9,16 +9,14 @@
     training_data_to_tokens,
 )
 
-
 """
 Runs Evaluation on data in the format of : <semiotic class>\t<unnormalized text>\t<`self` if trivial class or normalized text>
 like the Google text normalization data https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish
 """
 
-
 def parse_args():
     parser = ArgumentParser()
-    parser.add_argument("--input", help="input file path", type=str)
+    parser.add_argument("--input", help="input file path", type=str, required=True)
     parser.add_argument(
         "--lang",
         help="language",
@@ -39,15 +37,13 @@
     )
     return parser.parse_args()
 
-
 if __name__ == "__main__":
     # Example usage:
     # python run_evaluate.py --input=<INPUT> --cat=<CATEGORY> --filter
     args = parse_args()
     if args.lang == "en":
-        from fun_text_processing.inverse_text_normalization.en.clean_eval_data import (
-            filter_loaded_data,
-        )
+        from fun_text_processing.inverse_text_normalization.en.clean_eval_data import filter_loaded_data
+
     file_path = args.input
     inverse_normalizer = InverseNormalizer()
 
@@ -57,6 +53,7 @@
     if args.filter:
         training_data = filter_loaded_data(training_data)
 
+    # Evaluate at sentence level if no specific category is provided
     if args.category is None:
         print("Sentence level evaluation...")
         sentences_un_normalized, sentences_normalized, _ = training_data_to_sentences(training_data)
@@ -68,12 +65,12 @@
         )
         print("- Accuracy: " + str(sentences_accuracy))
 
+    # Evaluate at token level
     print("Token level evaluation...")
     tokens_per_type = training_data_to_tokens(training_data, category=args.category)
     token_accuracy = {}
-    for token_type in tokens_per_type:
+    for token_type, (tokens_un_normalized, tokens_normalized) in tokens_per_type.items():
         print("- Token type: " + token_type)
-        tokens_un_normalized, tokens_normalized = tokens_per_type[token_type]
         print("  - Data: " + str(len(tokens_normalized)) + " tokens")
         tokens_prediction = inverse_normalizer.inverse_normalize_list(tokens_normalized)
         print("  - Denormalized. Evaluating...")
@@ -81,9 +78,9 @@
             tokens_prediction, tokens_un_normalized, input=tokens_normalized
         )
         print("  - Accuracy: " + str(token_accuracy[token_type]))
-    token_count_per_type = {
-        token_type: len(tokens_per_type[token_type][0]) for token_type in tokens_per_type
-    }
+
+    # Calculate weighted token accuracy
+    token_count_per_type = {token_type: len(tokens) for token_type, (tokens, _) in tokens_per_type.items()}
     token_weighted_accuracy = [
         token_count_per_type[token_type] * accuracy
         for token_type, accuracy in token_accuracy.items()
@@ -96,19 +93,17 @@
         if token_type not in known_types:
             raise ValueError("Unexpected token type: " + token_type)
 
+    # Output table summarizing evaluation results if no specific category is provided
     if args.category is None:
         c1 = ["Class", "sent level"] + known_types
         c2 = ["Num Tokens", len(sentences_normalized)] + [
-            token_count_per_type[known_type] if known_type in tokens_per_type else "0"
-            for known_type in known_types
+            str(token_count_per_type.get(known_type, 0)) for known_type in known_types
         ]
-        c3 = ["Denormalization", sentences_accuracy] + [
-            token_accuracy[known_type] if known_type in token_accuracy else "0"
-            for known_type in known_types
+        c3 = ["Denormalization", str(sentences_accuracy)] + [
+            str(token_accuracy.get(known_type, "0")) for known_type in known_types
         ]
-
         for i in range(len(c1)):
-            print(f"{str(c1[i]):10s} | {str(c2[i]):10s} | {str(c3[i]):5s}")
+            print(f"{c1[i]:10s} | {c2[i]:10s} | {c3[i]:5s}")
     else:
         print(f"numbers\t{token_count_per_type[args.category]}")
         print(f"Denormalization\t{token_accuracy[args.category]}")
diff --git a/funasr/models/fsmn_vad_streaming/model.py b/funasr/models/fsmn_vad_streaming/model.py
index 04689be..bfffca8 100644
--- a/funasr/models/fsmn_vad_streaming/model.py
+++ b/funasr/models/fsmn_vad_streaming/model.py
@@ -8,6 +8,7 @@
 import time
 import math
 import torch
+import numpy as np
 from torch import nn
 from enum import Enum
 from dataclasses import dataclass
@@ -334,18 +335,17 @@
             cache["stats"].data_buf_all = torch.cat(
                 (cache["stats"].data_buf_all, cache["stats"].waveform[0])
             )
-        for offset in range(
-            0, cache["stats"].waveform.shape[1] - frame_sample_length + 1, frame_shift_length
-        ):
-            cache["stats"].decibel.append(
-                10
-                * math.log10(
-                    (cache["stats"].waveform[0][offset : offset + frame_sample_length])
-                    .square()
-                    .sum()
-                    + 0.000001
-                )
-            )
+            
+        waveform_numpy = cache["stats"].waveform.numpy()
+
+        offsets = np.arange(0, waveform_numpy.shape[1] - frame_sample_length + 1, frame_shift_length)
+        frames = waveform_numpy[0, offsets[:, np.newaxis] + np.arange(frame_sample_length)]
+
+        decibel_numpy = 10 * np.log10(np.sum(np.square(frames), axis=1) + 0.000001)
+        decibel_numpy = decibel_numpy.tolist()
+
+        cache["stats"].decibel.extend(decibel_numpy)
+
 
     def ComputeScores(self, feats: torch.Tensor, cache: dict = {}) -> None:
         scores = self.encoder(feats, cache=cache["encoder"]).to("cpu")  # return B * T * D
@@ -406,7 +406,6 @@
         cur_seg = cache["stats"].output_data_buf[-1]
         if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
             print("warning\n")
-        out_pos = len(cur_seg.buffer)  # cur_seg.buff鐜板湪娌″仛浠讳綍鎿嶄綔
         data_to_pop = 0
         if end_point_is_sent_end:
             data_to_pop = expected_sample_number
@@ -420,12 +419,6 @@
             expected_sample_number = len(cache["stats"].data_buf)
 
         cur_seg.doa = 0
-        for sample_cpy_out in range(0, data_to_pop):
-            # cur_seg.buffer[out_pos ++] = data_buf_.back();
-            out_pos += 1
-        for sample_cpy_out in range(data_to_pop, expected_sample_number):
-            # cur_seg.buffer[out_pos++] = data_buf_.back()
-            out_pos += 1
         if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
             print("Something wrong with the VAD algorithm\n")
         cache["stats"].data_buf_start_frame += frm_cnt
@@ -512,10 +505,17 @@
         assert len(cache["stats"].sil_pdf_ids) == self.vad_opts.silence_pdf_num
         if len(cache["stats"].sil_pdf_ids) > 0:
             assert len(cache["stats"].scores) == 1  # 鍙敮鎸乥atch_size = 1鐨勬祴璇�
-            sil_pdf_scores = [
-                cache["stats"].scores[0][t][sil_pdf_id] for sil_pdf_id in cache["stats"].sil_pdf_ids
-            ]
-            sum_score = sum(sil_pdf_scores)
+            """
+            - Change type of `sum_score` to float. The reason is that `sum_score` is a tensor with single element.
+              and `torch.Tensor` is slower `float` when tensor has only one element.
+            - Put the iteration of `sil_pdf_ids` inside `sum()` to reduce the overhead of creating a new list.
+            - The default `sil_pdf_ids` is [0], the `if` statement is used to reduce the overhead of expression
+              generation, which result in a mere (~2%) performance gain.
+            """
+            if len(cache["stats"].sil_pdf_ids) > 1:
+                sum_score = sum(cache["stats"].scores[0][t][sil_pdf_id].item() for sil_pdf_id in cache["stats"].sil_pdf_ids)
+            else:
+                sum_score = cache["stats"].scores[0][t][cache["stats"].sil_pdf_ids[0]].item()
             noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
             total_score = 1.0
             sum_score = total_score - sum_score
diff --git a/runtime/docs/SDK_advanced_guide_online.md b/runtime/docs/SDK_advanced_guide_online.md
index e9a9592..4cf87ab 100644
--- a/runtime/docs/SDK_advanced_guide_online.md
+++ b/runtime/docs/SDK_advanced_guide_online.md
@@ -8,6 +8,7 @@
 
 | TIME       | INFO                                                                                | IMAGE VERSION                       | IMAGE ID     |
 |------------|-------------------------------------------------------------------------------------|-------------------------------------|--------------|
+| 2024.10.29 | The 2pass-offline mode supports the SensevoiceSmall model | funasr-runtime-sdk-online-cpu-0.1.12 | f5febc5cf13a |
 | 2024.09.26 | Fix memory leak | funasr-runtime-sdk-online-cpu-0.1.11 | e51a36c42771 |
 | 2024.05.15 | Adapting to FunASR 1.0 model structure | funasr-runtime-sdk-online-cpu-0.1.10 | 1c2adfcff84d |
 | 2024.03.05 | docker image supports ARM64 platform, update modelscope | funasr-runtime-sdk-online-cpu-0.1.9 | 4a875e08c7a2 |
@@ -31,9 +32,9 @@
 ### Pull Docker Image
 Use the following command to pull and start the FunASR software package docker image:
 ```shell
-sudo docker pull registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.10
+sudo docker pull registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.12
 mkdir -p ./funasr-runtime-resources/models
-sudo docker run -p 10096:10095 -it --privileged=true -v $PWD/funasr-runtime-resources/models:/workspace/models registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.10
+sudo docker run -p 10096:10095 -it --privileged=true -v $PWD/funasr-runtime-resources/models:/workspace/models registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.12
 ```
 
 ### Launching the Server
diff --git a/runtime/docs/SDK_advanced_guide_online_zh.md b/runtime/docs/SDK_advanced_guide_online_zh.md
index 8c17caa..247667e 100644
--- a/runtime/docs/SDK_advanced_guide_online_zh.md
+++ b/runtime/docs/SDK_advanced_guide_online_zh.md
@@ -12,6 +12,7 @@
 
 | 鏃堕棿         | 璇︽儏                                | 闀滃儚鐗堟湰                                 | 闀滃儚ID         |
 |:-----------|:----------------------------------|--------------------------------------|--------------|
+| 2024.10.29 | 2pass-offline模式支持SensevoiceSmall模型 | funasr-runtime-sdk-online-cpu-0.1.12 | f5febc5cf13a |
 | 2024.09.26 | 淇鍐呭瓨娉勬紡 | funasr-runtime-sdk-online-cpu-0.1.11 | e51a36c42771 |
 | 2024.05.15 | 閫傞厤FunASR 1.0妯″瀷缁撴瀯 | funasr-runtime-sdk-online-cpu-0.1.10 | 1c2adfcff84d |
 | 2024.03.05 | docker闀滃儚鏀寔arm64骞冲彴锛屽崌绾odelscope鐗堟湰 | funasr-runtime-sdk-online-cpu-0.1.9 | 4a875e08c7a2 |
@@ -40,11 +41,11 @@
 
 ```shell
 sudo docker pull \
-  registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.11
+  registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.12
 mkdir -p ./funasr-runtime-resources/models
 sudo docker run -p 10096:10095 -it --privileged=true \
   -v $PWD/funasr-runtime-resources/models:/workspace/models \
-  registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.11
+  registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-online-cpu-0.1.12
 ```
 
 ### 鏈嶅姟绔惎鍔�
@@ -63,11 +64,13 @@
   --hotword /workspace/models/hotwords.txt > log.txt 2>&1 &
 
 # 濡傛灉鎮ㄦ兂鍏抽棴ssl锛屽鍔犲弬鏁帮細--certfile 0
-# 濡傛灉鎮ㄦ兂浣跨敤鏃堕棿鎴虫垨鑰卬n鐑瘝妯″瀷杩涜閮ㄧ讲锛岃璁剧疆--model-dir涓哄搴旀ā鍨嬶細
+# 如果您想使用SenseVoiceSmall模型、时间戳、nn热词模型进行部署，请设置--model-dir为对应模型：
+#   iic/SenseVoiceSmall-onnx
 #   damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-onnx锛堟椂闂存埑锛�
 #   damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404-onnx锛坣n鐑瘝锛�
 # 濡傛灉鎮ㄦ兂鍦ㄦ湇鍔$鍔犺浇鐑瘝锛岃鍦ㄥ涓绘満鏂囦欢./funasr-runtime-resources/models/hotwords.txt閰嶇疆鐑瘝锛坉ocker鏄犲皠鍦板潃涓�/workspace/models/hotwords.txt锛�:
 #   姣忚涓�涓儹璇嶏紝鏍煎紡(鐑瘝 鏉冮噸)锛氶樋閲屽反宸� 20锛堟敞锛氱儹璇嶇悊璁轰笂鏃犻檺鍒讹紝浣嗕负浜嗗吋椤炬�ц兘鍜屾晥鏋滐紝寤鸿鐑瘝闀垮害涓嶈秴杩�10锛屼釜鏁颁笉瓒呰繃1k锛屾潈閲�1~100锛�
+# SenseVoiceSmall-onnx识别结果中“<|zh|><|NEUTRAL|><|Speech|> ”分别为对应的语种、情感、事件信息
 ```
 鏈嶅姟绔缁嗗弬鏁颁粙缁嶅彲鍙傝�僛鏈嶅姟绔敤娉曡瑙(#鏈嶅姟绔敤娉曡瑙�)
 ### 瀹㈡埛绔祴璇曚笌浣跨敤
diff --git a/runtime/onnxruntime/include/funasrruntime.h b/runtime/onnxruntime/include/funasrruntime.h
index 5dedaf7..685c024 100644
--- a/runtime/onnxruntime/include/funasrruntime.h
+++ b/runtime/onnxruntime/include/funasrruntime.h
@@ -120,7 +120,8 @@
 _FUNASRAPI FUNASR_RESULT	FunTpassInferBuffer(FUNASR_HANDLE handle, FUNASR_HANDLE online_handle, const char* sz_buf, 
 												int n_len, std::vector<std::vector<std::string>> &punc_cache, bool input_finished=true, 
 												int sampling_rate=16000, std::string wav_format="pcm", ASR_TYPE mode=ASR_TWO_PASS, 
-												const std::vector<std::vector<float>> &hw_emb={{0.0}}, bool itn=true, FUNASR_DEC_HANDLE dec_handle=nullptr);
+												const std::vector<std::vector<float>> &hw_emb={{0.0}}, bool itn=true, FUNASR_DEC_HANDLE dec_handle=nullptr,
+												std::string svs_lang="auto", bool svs_itn=true);
 _FUNASRAPI void				FunTpassUninit(FUNASR_HANDLE handle);
 _FUNASRAPI void				FunTpassOnlineUninit(FUNASR_HANDLE handle);
 
diff --git a/runtime/onnxruntime/include/model.h b/runtime/onnxruntime/include/model.h
index a49baeb..5ce1148 100644
--- a/runtime/onnxruntime/include/model.h
+++ b/runtime/onnxruntime/include/model.h
@@ -16,9 +16,11 @@
     virtual void StartUtterance() = 0;
     virtual void EndUtterance() = 0;
     virtual void Reset() = 0;
+    virtual string GreedySearch(float* in, int n_len, int64_t token_nums, bool is_stamp=false, std::vector<float> us_alphas={0}, std::vector<float> us_cif_peak={0}){return "";};
     virtual void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){};
     virtual void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){};
-    virtual void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){};
+    virtual void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, 
+      const std::string &am_config, const std::string &token_file, const std::string &online_token_file, int thread_num){};
     virtual void InitLm(const std::string &lm_file, const std::string &lm_config, const std::string &lex_file){};
     virtual void InitFstDecoder(){};
     virtual std::string Forward(float *din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb={{0.0}}, void* wfst_decoder=nullptr){return "";};
diff --git a/runtime/onnxruntime/include/tpass-stream.h b/runtime/onnxruntime/include/tpass-stream.h
index 0276631..a4640a2 100644
--- a/runtime/onnxruntime/include/tpass-stream.h
+++ b/runtime/onnxruntime/include/tpass-stream.h
@@ -26,11 +26,13 @@
     bool UseVad(){return use_vad;};
     bool UsePunc(){return use_punc;}; 
     bool UseITN(){return use_itn;};
+    std::string GetModelType(){return model_type;};
     
   private:
     bool use_vad=false;
     bool use_punc=false;
     bool use_itn=false;
+    std::string model_type = MODEL_PARA;
 };
 
 TpassStream *CreateTpassStream(std::map<std::string, std::string>& model_path, int thread_num=1);
diff --git a/runtime/onnxruntime/src/funasrruntime.cpp b/runtime/onnxruntime/src/funasrruntime.cpp
index 88a3970..6286412 100644
--- a/runtime/onnxruntime/src/funasrruntime.cpp
+++ b/runtime/onnxruntime/src/funasrruntime.cpp
@@ -482,7 +482,8 @@
 	_FUNASRAPI FUNASR_RESULT FunTpassInferBuffer(FUNASR_HANDLE handle, FUNASR_HANDLE online_handle, const char* sz_buf, 
 												 int n_len, std::vector<std::vector<std::string>> &punc_cache, bool input_finished, 
 												 int sampling_rate, std::string wav_format, ASR_TYPE mode, 
-												 const std::vector<std::vector<float>> &hw_emb, bool itn, FUNASR_DEC_HANDLE dec_handle)
+												 const std::vector<std::vector<float>> &hw_emb, bool itn, FUNASR_DEC_HANDLE dec_handle,
+												 std::string svs_lang, bool svs_itn)
 	{
 		funasr::TpassStream* tpass_stream = (funasr::TpassStream*)handle;
 		funasr::TpassOnlineStream* tpass_online_stream = (funasr::TpassOnlineStream*)online_handle;
@@ -525,7 +526,7 @@
 
 		funasr::AudioFrame* frame = nullptr;
 		while(audio->FetchChunck(frame) > 0){
-			string msg = ((funasr::ParaformerOnline*)asr_online_handle)->Forward(frame->data, frame->len, frame->is_final);
+			string msg = (asr_online_handle)->Forward(frame->data, frame->len, frame->is_final);
 			if(mode == ASR_ONLINE){
 				((funasr::ParaformerOnline*)asr_online_handle)->online_res += msg;
 				if(frame->is_final){
@@ -567,7 +568,12 @@
         	len = new int[1];
 			buff[0] = frame->data;
 			len[0] = frame->len;
-			vector<string> msgs = ((funasr::Paraformer*)asr_handle)->Forward(buff, len, frame->is_final, hw_emb, dec_handle);
+			vector<string> msgs;
+			if(tpass_stream->GetModelType() == MODEL_SVS){
+				msgs = (tpass_stream->asr_handle)->Forward(buff, len, true, svs_lang, svs_itn, 1);
+			}else{
+				msgs = (tpass_stream->asr_handle)->Forward(buff, len, true, hw_emb, dec_handle, 1);
+			}
 			string msg = msgs.size()>0?msgs[0]:"";
 			std::vector<std::string> msg_vec = funasr::SplitStr(msg, " | ");  // split with timestamp
 			if(msg_vec.size()==0){
@@ -589,24 +595,29 @@
 				p_result->stamp += cur_stamp + "]";
 			}
 
-			string msg_punc = punc_online_handle->AddPunc(msg.c_str(), punc_cache[1]);
-			if(input_finished){
-				msg_punc += "銆�";
-			}
-			p_result->tpass_msg = msg_punc;
-#if !defined(__APPLE__)
-			if(tpass_stream->UseITN() && itn){
-				string msg_itn = tpass_stream->itn_handle->Normalize(msg_punc);
-				// TimestampSmooth
-				if(!(p_result->stamp).empty()){
-					std::string new_stamp = funasr::TimestampSmooth(p_result->tpass_msg, msg_itn, p_result->stamp);
-					if(!new_stamp.empty()){
-						p_result->stamp = new_stamp;
-					}
+			if (tpass_stream->GetModelType() == MODEL_PARA){
+				string msg_punc = punc_online_handle->AddPunc(msg.c_str(), punc_cache[1]);
+				if(input_finished){
+					msg_punc += "。";
 				}
-				p_result->tpass_msg = msg_itn;
-			}
+				p_result->tpass_msg = msg_punc;
+
+#if !defined(__APPLE__)
+				if(tpass_stream->UseITN() && itn){
+					string msg_itn = tpass_stream->itn_handle->Normalize(msg_punc);
+					// TimestampSmooth
+					if(!(p_result->stamp).empty()){
+						std::string new_stamp = funasr::TimestampSmooth(p_result->tpass_msg, msg_itn, p_result->stamp);
+						if(!new_stamp.empty()){
+							p_result->stamp = new_stamp;
+						}
+					}
+					p_result->tpass_msg = msg_itn;
+				}
 #endif
+			}else{
+				p_result->tpass_msg = msg;
+			}
 			if (!(p_result->stamp).empty()){
 				p_result->stamp_sents = funasr::TimestampSentence(p_result->tpass_msg, p_result->stamp);
 			}
diff --git a/runtime/onnxruntime/src/paraformer-online.cpp b/runtime/onnxruntime/src/paraformer-online.cpp
index 55a4fd1..88951aa 100644
--- a/runtime/onnxruntime/src/paraformer-online.cpp
+++ b/runtime/onnxruntime/src/paraformer-online.cpp
@@ -9,18 +9,55 @@
 
 namespace funasr {
 
-ParaformerOnline::ParaformerOnline(Paraformer* para_handle, std::vector<int> chunk_size)
-:para_handle_(std::move(para_handle)),chunk_size(chunk_size),session_options_{}{
-    InitOnline(
-        para_handle_->fbank_opts_,
-        para_handle_->encoder_session_,
-        para_handle_->decoder_session_,
-        para_handle_->en_szInputNames_,
-        para_handle_->en_szOutputNames_,
-        para_handle_->de_szInputNames_,
-        para_handle_->de_szOutputNames_,
-        para_handle_->means_list_,
-        para_handle_->vars_list_);
+ParaformerOnline::ParaformerOnline(Model* offline_handle, std::vector<int> chunk_size, std::string model_type)
+:offline_handle_(std::move(offline_handle)),chunk_size(chunk_size),session_options_{}{
+    if(model_type == MODEL_PARA){
+        Paraformer* para_handle = dynamic_cast<Paraformer*>(offline_handle_);
+        InitOnline(
+        para_handle->fbank_opts_,
+        para_handle->encoder_session_,
+        para_handle->decoder_session_,
+        para_handle->en_szInputNames_,
+        para_handle->en_szOutputNames_,
+        para_handle->de_szInputNames_,
+        para_handle->de_szOutputNames_,
+        para_handle->means_list_,
+        para_handle->vars_list_,
+        para_handle->frame_length,
+        para_handle->frame_shift,
+        para_handle->n_mels,
+        para_handle->lfr_m,
+        para_handle->lfr_n,
+        para_handle->encoder_size,
+        para_handle->fsmn_layers,
+        para_handle->fsmn_lorder,
+        para_handle->fsmn_dims,
+        para_handle->cif_threshold,
+        para_handle->tail_alphas);
+    }else if(model_type == MODEL_SVS){
+        SenseVoiceSmall* svs_handle = dynamic_cast<SenseVoiceSmall*>(offline_handle_);
+        InitOnline(
+        svs_handle->fbank_opts_,
+        svs_handle->encoder_session_,
+        svs_handle->decoder_session_,
+        svs_handle->en_szInputNames_,
+        svs_handle->en_szOutputNames_,
+        svs_handle->de_szInputNames_,
+        svs_handle->de_szOutputNames_,
+        svs_handle->means_list_,
+        svs_handle->vars_list_,
+        svs_handle->frame_length,
+        svs_handle->frame_shift,
+        svs_handle->n_mels,
+        svs_handle->lfr_m,
+        svs_handle->lfr_n,
+        svs_handle->encoder_size,
+        svs_handle->fsmn_layers,
+        svs_handle->fsmn_lorder,
+        svs_handle->fsmn_dims,
+        svs_handle->cif_threshold,
+        svs_handle->tail_alphas);
+    }
     InitCache();
 }
 
@@ -33,7 +70,18 @@
         vector<const char*> &de_szInputNames,
         vector<const char*> &de_szOutputNames,
         vector<float> &means_list,
-        vector<float> &vars_list){
+        vector<float> &vars_list,
+        int frame_length_,
+        int frame_shift_,
+        int n_mels_,
+        int lfr_m_,
+        int lfr_n_,
+        int encoder_size_,
+        int fsmn_layers_,
+        int fsmn_lorder_,
+        int fsmn_dims_,
+        float cif_threshold_,
+        float tail_alphas_){
     fbank_opts_ = fbank_opts;
     encoder_session_ = encoder_session;
     decoder_session_ = decoder_session;
@@ -44,27 +92,27 @@
     means_list_ = means_list;
     vars_list_ = vars_list;
 
-    frame_length = para_handle_->frame_length;
-    frame_shift = para_handle_->frame_shift;
-    n_mels = para_handle_->n_mels;
-    lfr_m = para_handle_->lfr_m;
-    lfr_n = para_handle_->lfr_n;
-    encoder_size = para_handle_->encoder_size;
-    fsmn_layers = para_handle_->fsmn_layers;
-    fsmn_lorder = para_handle_->fsmn_lorder;
-    fsmn_dims = para_handle_->fsmn_dims;
-    cif_threshold = para_handle_->cif_threshold;
-    tail_alphas = para_handle_->tail_alphas;
+    frame_length = frame_length_;
+    frame_shift = frame_shift_;
+    n_mels = n_mels_;
+    lfr_m = lfr_m_;
+    lfr_n = lfr_n_;
+    encoder_size = encoder_size_;
+    fsmn_layers = fsmn_layers_;
+    fsmn_lorder = fsmn_lorder_;
+    fsmn_dims = fsmn_dims_;
+    cif_threshold = cif_threshold_;
+    tail_alphas = tail_alphas_;
 
     // other vars
     sqrt_factor = std::sqrt(encoder_size);
     for(int i=0; i<fsmn_lorder*fsmn_dims; i++){
         fsmn_init_cache_.emplace_back(0);
     }
-    chunk_len = chunk_size[1]*frame_shift*lfr_n*para_handle_->asr_sample_rate/1000;
+    chunk_len = chunk_size[1]*frame_shift*lfr_n*offline_handle_->GetAsrSampleRate()/1000;
 
-    frame_sample_length_ = para_handle_->asr_sample_rate / 1000 * frame_length;
-    frame_shift_sample_length_ = para_handle_->asr_sample_rate / 1000 * frame_shift;
+    frame_sample_length_ = offline_handle_->GetAsrSampleRate() / 1000 * frame_length;
+    frame_shift_sample_length_ = offline_handle_->GetAsrSampleRate() / 1000 * frame_shift;
 
 }
 
@@ -464,7 +512,7 @@
 
             std::vector<int64_t> decoder_shape = decoder_tensor[0].GetTensorTypeAndShapeInfo().GetShape();
             float* float_data = decoder_tensor[0].GetTensorMutableData<float>();
-            result = para_handle_->GreedySearch(float_data, list_frame.size(), decoder_shape[2]);
+            result = offline_handle_->GreedySearch(float_data, list_frame.size(), decoder_shape[2]);
         }
     }catch (std::exception const &e)
     {
@@ -493,7 +541,7 @@
         if(is_first_chunk){
             is_first_chunk = false;
         }
-        ExtractFeats(para_handle_->asr_sample_rate, wav_feats, waves, input_finished);
+        ExtractFeats(offline_handle_->GetAsrSampleRate(), wav_feats, waves, input_finished);
         if(wav_feats.size() == 0){
             return result;
         }
diff --git a/runtime/onnxruntime/src/paraformer-online.h b/runtime/onnxruntime/src/paraformer-online.h
index 8c9bb88..8ab473d 100644
--- a/runtime/onnxruntime/src/paraformer-online.h
+++ b/runtime/onnxruntime/src/paraformer-online.h
@@ -38,7 +38,18 @@
             vector<const char*> &de_szInputNames,
             vector<const char*> &de_szOutputNames,
             vector<float> &means_list,
-            vector<float> &vars_list);
+            vector<float> &vars_list,
+            int frame_length_,
+            int frame_shift_,
+            int n_mels_,
+            int lfr_m_,
+            int lfr_n_,
+            int encoder_size_,
+            int fsmn_layers_,
+            int fsmn_lorder_,
+            int fsmn_dims_,
+            float cif_threshold_,
+            float tail_alphas_);
 
         void StartUtterance()
         {
@@ -48,8 +59,8 @@
         {
         }
         
-        Paraformer* para_handle_ = nullptr;
-        // from para_handle_
+        Model* offline_handle_ = nullptr;
+        // from offline_handle_
         knf::FbankOptions fbank_opts_;
         std::shared_ptr<Ort::Session> encoder_session_ = nullptr;
         std::shared_ptr<Ort::Session> decoder_session_ = nullptr;
@@ -60,7 +71,7 @@
         vector<const char*> de_szOutputNames_;
         vector<float> means_list_;
         vector<float> vars_list_;
-        // configs from para_handle_
+        // configs from offline_handle_
         int frame_length = 25;
         int frame_shift = 10;
         int n_mels = 80;
@@ -100,7 +111,7 @@
         double sqrt_factor;
 
     public:
-        ParaformerOnline(Paraformer* para_handle, std::vector<int> chunk_size);
+        ParaformerOnline(Model* offline_handle, std::vector<int> chunk_size, std::string model_type=MODEL_PARA);
         ~ParaformerOnline();
         void Reset();
         void ResetCache();
@@ -112,7 +123,7 @@
         string Forward(float* din, int len, bool input_finished, const std::vector<std::vector<float>> &hw_emb={{0.0}}, void* wfst_decoder=nullptr);
         string Rescoring();
 
-        int GetAsrSampleRate() { return para_handle_->asr_sample_rate; };
+        int GetAsrSampleRate() { return offline_handle_->GetAsrSampleRate(); };
 
         // 2pass
         std::string online_res;
diff --git a/runtime/onnxruntime/src/paraformer.cpp b/runtime/onnxruntime/src/paraformer.cpp
index 24f5152..7e1fe96 100644
--- a/runtime/onnxruntime/src/paraformer.cpp
+++ b/runtime/onnxruntime/src/paraformer.cpp
@@ -131,9 +131,10 @@
 }
 
 // 2pass
-void Paraformer::InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){
+void Paraformer::InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, 
+    const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, const std::string &online_token_file, int thread_num){
     // online
-    InitAsr(en_model, de_model, am_cmvn, am_config, token_file, thread_num);
+    InitAsr(en_model, de_model, am_cmvn, am_config, online_token_file, thread_num);
 
     // offline
     try {
@@ -144,28 +145,6 @@
         exit(-1);
     }
 
-    // string strName;
-    // GetInputName(m_session_.get(), strName);
-    // m_strInputNames.push_back(strName.c_str());
-    // GetInputName(m_session_.get(), strName,1);
-    // m_strInputNames.push_back(strName);
-
-    // if (use_hotword) {
-    //     GetInputName(m_session_.get(), strName, 2);
-    //     m_strInputNames.push_back(strName);
-    // }
-    
-    // // support time stamp
-    // size_t numOutputNodes = m_session_->GetOutputCount();
-    // for(int index=0; index<numOutputNodes; index++){
-    //     GetOutputName(m_session_.get(), strName, index);
-    //     m_strOutputNames.push_back(strName);
-    // }
-
-    // for (auto& item : m_strInputNames)
-    //     m_szInputNames.push_back(item.c_str());
-    // for (auto& item : m_strOutputNames)
-    //     m_szOutputNames.push_back(item.c_str());
     GetInputNames(m_session_.get(), m_strInputNames, m_szInputNames);
     GetOutputNames(m_session_.get(), m_strOutputNames, m_szOutputNames);
 }
diff --git a/runtime/onnxruntime/src/paraformer.h b/runtime/onnxruntime/src/paraformer.h
index 571b2ba..41e71f5 100644
--- a/runtime/onnxruntime/src/paraformer.h
+++ b/runtime/onnxruntime/src/paraformer.h
@@ -46,7 +46,8 @@
         // online
         void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
         // 2pass
-        void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
+        void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, 
+            const std::string &am_config, const std::string &token_file, const std::string &online_token_file, int thread_num);
         void InitHwCompiler(const std::string &hw_model, int thread_num);
         void InitSegDict(const std::string &seg_dict_model);
         std::vector<std::vector<float>> CompileHotwordEmbedding(std::string &hotwords);
diff --git a/runtime/onnxruntime/src/sensevoice-small.cpp b/runtime/onnxruntime/src/sensevoice-small.cpp
index 9fa72a0..5cb1042 100644
--- a/runtime/onnxruntime/src/sensevoice-small.cpp
+++ b/runtime/onnxruntime/src/sensevoice-small.cpp
@@ -48,6 +48,145 @@
     LoadCmvn(am_cmvn.c_str());
 }
 
+// online
+void SenseVoiceSmall::InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num){
+    
+    LoadOnlineConfigFromYaml(am_config.c_str());
+    // knf options
+    fbank_opts_.frame_opts.dither = 0;
+    fbank_opts_.mel_opts.num_bins = n_mels;
+    fbank_opts_.frame_opts.samp_freq = asr_sample_rate;
+    fbank_opts_.frame_opts.window_type = window_type;
+    fbank_opts_.frame_opts.frame_shift_ms = frame_shift;
+    fbank_opts_.frame_opts.frame_length_ms = frame_length;
+    fbank_opts_.energy_floor = 0;
+    fbank_opts_.mel_opts.debug_mel = false;
+
+    // session_options_.SetInterOpNumThreads(1);
+    session_options_.SetIntraOpNumThreads(thread_num);
+    session_options_.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
+    // DisableCpuMemArena can improve performance
+    session_options_.DisableCpuMemArena();
+
+    try {
+        encoder_session_ = std::make_unique<Ort::Session>(env_, ORTSTRING(en_model).c_str(), session_options_);
+        LOG(INFO) << "Successfully load model from " << en_model;
+    } catch (std::exception const &e) {
+        LOG(ERROR) << "Error when load am encoder model: " << e.what();
+        exit(-1);
+    }
+
+    try {
+        decoder_session_ = std::make_unique<Ort::Session>(env_, ORTSTRING(de_model).c_str(), session_options_);
+        LOG(INFO) << "Successfully load model from " << de_model;
+    } catch (std::exception const &e) {
+        LOG(ERROR) << "Error when load am decoder model: " << e.what();
+        exit(-1);
+    }
+
+    // encoder
+    string strName;
+    GetInputName(encoder_session_.get(), strName);
+    en_strInputNames.push_back(strName.c_str());
+    GetInputName(encoder_session_.get(), strName,1);
+    en_strInputNames.push_back(strName);
+    
+    GetOutputName(encoder_session_.get(), strName);
+    en_strOutputNames.push_back(strName);
+    GetOutputName(encoder_session_.get(), strName,1);
+    en_strOutputNames.push_back(strName);
+    GetOutputName(encoder_session_.get(), strName,2);
+    en_strOutputNames.push_back(strName);
+
+    for (auto& item : en_strInputNames)
+        en_szInputNames_.push_back(item.c_str());
+    for (auto& item : en_strOutputNames)
+        en_szOutputNames_.push_back(item.c_str());
+
+    // decoder
+    int de_input_len = 4 + fsmn_layers;
+    int de_out_len = 2 + fsmn_layers;
+    for(int i=0;i<de_input_len; i++){
+        GetInputName(decoder_session_.get(), strName, i);
+        de_strInputNames.push_back(strName.c_str());
+    }
+
+    for(int i=0;i<de_out_len; i++){
+        GetOutputName(decoder_session_.get(), strName,i);
+        de_strOutputNames.push_back(strName);
+    }
+
+    for (auto& item : de_strInputNames)
+        de_szInputNames_.push_back(item.c_str());
+    for (auto& item : de_strOutputNames)
+        de_szOutputNames_.push_back(item.c_str());
+
+    online_vocab = new Vocab(token_file.c_str());
+    phone_set_ = new PhoneSet(token_file.c_str());
+    LoadCmvn(am_cmvn.c_str());
+}
+
+// 2pass
+void SenseVoiceSmall::InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, 
+    const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, const std::string &online_token_file, int thread_num){
+    // online
+    InitAsr(en_model, de_model, am_cmvn, am_config, online_token_file, thread_num);
+
+    // offline
+    try {
+        m_session_ = std::make_unique<Ort::Session>(env_, ORTSTRING(am_model).c_str(), session_options_);
+        LOG(INFO) << "Successfully load model from " << am_model;
+    } catch (std::exception const &e) {
+        LOG(ERROR) << "Error when load am onnx model: " << e.what();
+        exit(-1);
+    }
+
+    GetInputNames(m_session_.get(), m_strInputNames, m_szInputNames);
+    GetOutputNames(m_session_.get(), m_strOutputNames, m_szOutputNames);
+    vocab = new Vocab(token_file.c_str());
+}
+
+void SenseVoiceSmall::LoadOnlineConfigFromYaml(const char* filename){
+
+    YAML::Node config;
+    try{
+        config = YAML::LoadFile(filename);
+    }catch(exception const &e){
+        LOG(ERROR) << "Error loading file, yaml file error or not exist.";
+        exit(-1);
+    }
+
+    try{
+        YAML::Node frontend_conf = config["frontend_conf"];
+        YAML::Node encoder_conf = config["encoder_conf"];
+        YAML::Node decoder_conf = config["decoder_conf"];
+        YAML::Node predictor_conf = config["predictor_conf"];
+
+        this->window_type = frontend_conf["window"].as<string>();
+        this->n_mels = frontend_conf["n_mels"].as<int>();
+        this->frame_length = frontend_conf["frame_length"].as<int>();
+        this->frame_shift = frontend_conf["frame_shift"].as<int>();
+        this->lfr_m = frontend_conf["lfr_m"].as<int>();
+        this->lfr_n = frontend_conf["lfr_n"].as<int>();
+
+        this->encoder_size = encoder_conf["output_size"].as<int>();
+        this->fsmn_dims = encoder_conf["output_size"].as<int>();
+
+        this->fsmn_layers = decoder_conf["num_blocks"].as<int>();
+        this->fsmn_lorder = decoder_conf["kernel_size"].as<int>()-1;
+
+        this->cif_threshold = predictor_conf["threshold"].as<double>();
+        this->tail_alphas = predictor_conf["tail_threshold"].as<double>();
+
+        this->asr_sample_rate = frontend_conf["fs"].as<int>();
+
+
+    }catch(exception const &e){
+        LOG(ERROR) << "Error when load argument from vad config YAML.";
+        exit(-1);
+    }
+}
+
 void SenseVoiceSmall::LoadConfigFromYaml(const char* filename){
 
     YAML::Node config;
@@ -83,6 +222,9 @@
 {
     if(vocab){
         delete vocab;
+    }
+    if(online_vocab){
+        delete online_vocab;
     }
     if(lm_vocab){
         delete lm_vocab;
@@ -212,6 +354,30 @@
     return str_lang + str_emo + str_event + " " + text;
 }
 
+string SenseVoiceSmall::GreedySearch(float * in, int n_len,  int64_t token_nums, bool is_stamp, std::vector<float> us_alphas, std::vector<float> us_cif_peak)
+{
+    vector<int> hyps;
+    int Tmax = n_len;
+    for (int i = 0; i < Tmax; i++) {
+        int max_idx;
+        float max_val;
+        FindMax(in + i * token_nums, token_nums, max_val, max_idx);
+        hyps.push_back(max_idx);
+    }
+    if(!is_stamp){
+        return online_vocab->Vector2StringV2(hyps, language);
+    }else{
+        std::vector<string> char_list;
+        std::vector<std::vector<float>> timestamp_list;
+        std::string res_str;
+        online_vocab->Vector2String(hyps, char_list);
+        std::vector<string> raw_char(char_list);
+        TimestampOnnx(us_alphas, us_cif_peak, char_list, res_str, timestamp_list);
+
+        return PostProcess(raw_char, timestamp_list);
+    }
+}
+
 void SenseVoiceSmall::LfrCmvn(std::vector<std::vector<float>> &asr_feats) {
 
     std::vector<std::vector<float>> out_feats;
diff --git a/runtime/onnxruntime/src/sensevoice-small.h b/runtime/onnxruntime/src/sensevoice-small.h
index f987f38..75cbc92 100644
--- a/runtime/onnxruntime/src/sensevoice-small.h
+++ b/runtime/onnxruntime/src/sensevoice-small.h
@@ -12,12 +12,14 @@
     class SenseVoiceSmall : public Model {
     private:
         Vocab* vocab = nullptr;
+        Vocab* online_vocab = nullptr;
         Vocab* lm_vocab = nullptr;
         SegDict* seg_dict = nullptr;
         PhoneSet* phone_set_ = nullptr;
         const float scale = 1.0;
 
         void LoadConfigFromYaml(const char* filename);
+        void LoadOnlineConfigFromYaml(const char* filename);
         void LoadCmvn(const char *filename);
         void LfrCmvn(std::vector<std::vector<float>> &asr_feats);
 
@@ -34,9 +36,10 @@
         ~SenseVoiceSmall();
         void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
         // online
-        // void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
+        void InitAsr(const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
         // 2pass
-        // void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, const std::string &token_file, int thread_num);
+        void InitAsr(const std::string &am_model, const std::string &en_model, const std::string &de_model, const std::string &am_cmvn, const std::string &am_config, 
+            const std::string &token_file, const std::string &online_token_file, int thread_num);
         // void InitHwCompiler(const std::string &hw_model, int thread_num);
         // void InitSegDict(const std::string &seg_dict_model);
         std::vector<std::vector<float>> CompileHotwordEmbedding(std::string &hotwords);
@@ -44,7 +47,8 @@
         void FbankKaldi(float sample_rate, const float* waves, int len, std::vector<std::vector<float>> &asr_feats);
         std::vector<std::string> Forward(float** din, int* len, bool input_finished=true, std::string svs_lang="auto", bool svs_itn=true, int batch_in=1);
         string CTCSearch( float * in, std::vector<int32_t> paraformer_length, std::vector<int64_t> outputShape);
-
+        string GreedySearch( float* in, int n_len, int64_t token_nums,
+                             bool is_stamp=false, std::vector<float> us_alphas={0}, std::vector<float> us_cif_peak={0});
         string Rescoring();
         string GetLang(){return language;};
         int GetAsrSampleRate() { return asr_sample_rate; };
@@ -100,6 +104,8 @@
         int asr_sample_rate = MODEL_SAMPLE_RATE;
         int batch_size_ = 1;
         int blank_id = 0;
+        float cif_threshold = 1.0;
+        float tail_alphas = 0.45;
         //dict
         std::map<std::string, int> lid_map = {
             {"auto", 0},
diff --git a/runtime/onnxruntime/src/tpass-online-stream.cpp b/runtime/onnxruntime/src/tpass-online-stream.cpp
index 7788e0b..338bb2b 100644
--- a/runtime/onnxruntime/src/tpass-online-stream.cpp
+++ b/runtime/onnxruntime/src/tpass-online-stream.cpp
@@ -11,7 +11,7 @@
     }
 
     if(tpass_obj->asr_handle){
-        asr_online_handle = make_unique<ParaformerOnline>((Paraformer*)(tpass_obj->asr_handle).get(), chunk_size);
+        asr_online_handle = make_unique<ParaformerOnline>((tpass_obj->asr_handle).get(), chunk_size, tpass_stream->GetModelType());
     }else{
         LOG(ERROR)<<"asr_handle is null";
         exit(-1);
diff --git a/runtime/onnxruntime/src/tpass-stream.cpp b/runtime/onnxruntime/src/tpass-stream.cpp
index 7681a4d..ff502de 100644
--- a/runtime/onnxruntime/src/tpass-stream.cpp
+++ b/runtime/onnxruntime/src/tpass-stream.cpp
@@ -36,10 +36,17 @@
         string am_cmvn_path;
         string am_config_path;
         string token_path;
+        string online_token_path;
         string hw_compile_model_path;
         string seg_dict_path;
         
-        asr_handle = make_unique<Paraformer>();
+        if (model_path.at(MODEL_DIR).find(MODEL_SVS) != std::string::npos)
+        {
+            asr_handle = make_unique<SenseVoiceSmall>();
+            model_type = MODEL_SVS;
+        }else{
+            asr_handle = make_unique<Paraformer>();
+        }
 
         bool enable_hotword = false;
         hw_compile_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_EB_NAME);
@@ -54,6 +61,7 @@
         am_model_path = PathAppend(model_path.at(OFFLINE_MODEL_DIR), MODEL_NAME);
         en_model_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), ENCODER_NAME);
         de_model_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), DECODER_NAME);
+        online_token_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), TOKEN_PATH);
         if(model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true"){
             am_model_path = PathAppend(model_path.at(OFFLINE_MODEL_DIR), QUANT_MODEL_NAME);
             en_model_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), QUANT_ENCODER_NAME);
@@ -63,7 +71,7 @@
         am_config_path = PathAppend(model_path.at(ONLINE_MODEL_DIR), AM_CONFIG_NAME);
         token_path = PathAppend(model_path.at(MODEL_DIR), TOKEN_PATH);
 
-        asr_handle->InitAsr(am_model_path, en_model_path, de_model_path, am_cmvn_path, am_config_path, token_path, thread_num);
+        asr_handle->InitAsr(am_model_path, en_model_path, de_model_path, am_cmvn_path, am_config_path, token_path, online_token_path, thread_num);
     }else{
         LOG(ERROR) <<"Can not find offline-model-dir or online-model-dir";
         exit(-1);
diff --git a/runtime/readme.md b/runtime/readme.md
index 960ebe3..6a4b22e 100644
--- a/runtime/readme.md
+++ b/runtime/readme.md
@@ -59,7 +59,8 @@
 The FunASR real-time speech-to-text service software package not only performs real-time speech-to-text conversion, but also allows high-precision transcription text correction at the end of each sentence and outputs text with punctuation, supporting high-concurrency multiple requests.
 In order to meet the needs of different users for different scenarios, different tutorials are prepared:
 
-### Whats-new
+### Whats-new
+- 2024/10/29: Real-time Transcription Service 1.12 released，The 2pass-offline mode supports the SensevoiceSmall model, docker image version funasr-runtime-sdk-online-cpu-0.1.12 (f5febc5cf13a)
 - 2024/09/26: Real-time Transcription Service 1.11 released锛孎ix memory leak, docker image version funasr-runtime-sdk-online-cpu-0.1.11 (e51a36c42771)
 - 2024/05/15: Real-time Transcription Service 1.10 released锛宎dapting to FunASR 1.0 model structure, docker image version funasr-runtime-sdk-online-cpu-0.1.10 (1c2adfcff84d)
 - 2024/03/05: Real-time Transcription Service 1.9 released锛宒ocker image supports ARM64 platform, update modelscope, docker image version funasr-runtime-sdk-online-cpu-0.1.9 (4a875e08c7a2)
diff --git a/runtime/readme_cn.md b/runtime/readme_cn.md
index 17c9a1a..122f9a5 100644
--- a/runtime/readme_cn.md
+++ b/runtime/readme_cn.md
@@ -53,6 +53,7 @@
 涓轰簡鏀寔涓嶅悓鐢ㄦ埛鐨勯渶姹傦紝閽堝涓嶅悓鍦烘櫙锛屽噯澶囦簡涓嶅悓鐨勫浘鏂囨暀绋嬶細
 
 ### 鏈�鏂板姩鎬�
+- 2024/10/29:   中文实时语音听写服务 1.12 发布，2pass-offline模式支持SensevoiceSmall模型，docker镜像版本funasr-runtime-sdk-online-cpu-0.1.12 (f5febc5cf13a)
 - 2024/09/26:   涓枃瀹炴椂璇煶鍚啓鏈嶅姟 1.11 鍙戝竷锛屼慨澶嶅唴瀛樻硠婕忥紝docker闀滃儚鐗堟湰funasr-runtime-sdk-online-cpu-0.1.11 (e51a36c42771)
 - 2024/05/15:   涓枃瀹炴椂璇煶鍚啓鏈嶅姟 1.10 鍙戝竷锛岄�傞厤FunASR 1.0妯″瀷缁撴瀯锛宒ocker闀滃儚鐗堟湰funasr-runtime-sdk-online-cpu-0.1.10 (1c2adfcff84d)
 - 2024/03/05:   涓枃瀹炴椂璇煶鍚啓鏈嶅姟 1.9 鍙戝竷锛宒ocker闀滃儚鏀寔arm64骞冲彴锛屽崌绾odelscope鐗堟湰锛宒ocker闀滃儚鐗堟湰funasr-runtime-sdk-online-cpu-0.1.9 (4a875e08c7a2)
diff --git a/runtime/websocket/bin/funasr-wss-client-2pass.cpp b/runtime/websocket/bin/funasr-wss-client-2pass.cpp
index 6c3a4dd..e8bbfc1 100644
--- a/runtime/websocket/bin/funasr-wss-client-2pass.cpp
+++ b/runtime/websocket/bin/funasr-wss-client-2pass.cpp
@@ -124,7 +124,7 @@
   void run(const std::string& uri, const std::vector<string>& wav_list,
            const std::vector<string>& wav_ids, int audio_fs, std::string asr_mode,
            std::vector<int> chunk_size, const std::unordered_map<std::string, int>& hws_map,
-           bool is_record=false, int use_itn=1) {
+           bool is_record=false, int use_itn=1, int svs_itn=1) {
     // Create a new connection to the given URI
     websocketpp::lib::error_code ec;
     typename websocketpp::client<T>::connection_ptr con =
@@ -146,9 +146,9 @@
     websocketpp::lib::thread asio_thread(&websocketpp::client<T>::run,
                                          &m_client);
     if(is_record){
-      send_rec_data(asr_mode, chunk_size, hws_map, use_itn);
+      send_rec_data(asr_mode, chunk_size, hws_map, use_itn, svs_itn);
     }else{
-      send_wav_data(wav_list[0], wav_ids[0], audio_fs, asr_mode, chunk_size, hws_map, use_itn);
+      send_wav_data(wav_list[0], wav_ids[0], audio_fs, asr_mode, chunk_size, hws_map, use_itn, svs_itn);
     }
 
     WaitABit();
@@ -185,7 +185,7 @@
   // send wav to server
   void send_wav_data(string wav_path, string wav_id, int audio_fs, std::string asr_mode,
                      std::vector<int> chunk_vector, const std::unordered_map<std::string, int>& hws_map,
-                     int use_itn) {
+                     int use_itn, int svs_itn) {
     uint64_t count = 0;
     std::stringstream val;
 
@@ -241,8 +241,12 @@
     jsonbegin["audio_fs"] = sampling_rate;
     jsonbegin["is_speaking"] = true;
     jsonbegin["itn"] = true;
+    jsonbegin["svs_itn"] = true;
     if(use_itn == 0){
       jsonbegin["itn"] = false;
+    }
+    if(svs_itn == 0){
+        jsonbegin["svs_itn"] = false;
     }
     if(!hws_map.empty()){
         LOG(INFO) << "hotwords: ";
@@ -335,7 +339,7 @@
   }
 
   void send_rec_data(std::string asr_mode, std::vector<int> chunk_vector, 
-                     const std::unordered_map<std::string, int>& hws_map, int use_itn) {
+                     const std::unordered_map<std::string, int>& hws_map, int use_itn, int svs_itn) {
     // first message
     bool wait = false;
     while (1) {
@@ -374,8 +378,12 @@
     jsonbegin["audio_fs"] = sample_rate;
     jsonbegin["is_speaking"] = true;
     jsonbegin["itn"] = true;
+    jsonbegin["svs_itn"] = true;
     if(use_itn == 0){
       jsonbegin["itn"] = false;
+    }
+    if(svs_itn == 0){
+        jsonbegin["svs_itn"] = false;
     }
     if(!hws_map.empty()){
         LOG(INFO) << "hotwords: ";
@@ -513,6 +521,9 @@
       "", "use-itn",
       "use-itn is 1 means use itn, 0 means not use itn", false, 1,
       "int");
+  TCLAP::ValueArg<int> svs_itn_(
+      "", "svs-itn",
+      "svs-itn is 1 means use itn and punc, 0 means not use", false, 1, "int");
   TCLAP::ValueArg<std::string> hotword_("", HOTWORD,
       "the hotword file, one hotword perline, Format: Hotword Weight (could be: 闃块噷宸村反 20)", false, "", "string");
 
@@ -526,6 +537,7 @@
   cmd.add(thread_num_);
   cmd.add(is_ssl_);
   cmd.add(use_itn_);
+  cmd.add(svs_itn_);
   cmd.add(hotword_);
   cmd.parse(argc, argv);
 
@@ -535,6 +547,7 @@
   std::string asr_mode = asr_mode_.getValue();
   std::string chunk_size_str = chunk_size_.getValue();
   int use_itn = use_itn_.getValue();
+  int svs_itn = svs_itn_.getValue();
   // get chunk_size
   std::vector<int> chunk_size;
   std::stringstream ss(chunk_size_str);
@@ -577,11 +590,11 @@
 
         c.m_client.set_tls_init_handler(bind(&OnTlsInit, ::_1));
 
-        c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, true, use_itn);
+        c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, true, use_itn, svs_itn);
       } else {
         WebsocketClient<websocketpp::config::asio_client> c(is_ssl);
 
-        c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, true, use_itn);
+        c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, true, use_itn, svs_itn);
       }
 
   }else{
@@ -622,17 +635,17 @@
         tmp_wav_ids.emplace_back(wav_ids[wav_i + i]);
 
         client_threads.emplace_back(
-            [uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, is_ssl, hws_map, use_itn]() {
+            [uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, is_ssl, hws_map, use_itn, svs_itn]() {
               if (is_ssl == 1) {
                 WebsocketClient<websocketpp::config::asio_tls_client> c(is_ssl);
 
                 c.m_client.set_tls_init_handler(bind(&OnTlsInit, ::_1));
 
-                c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, false, use_itn);
+                c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, false, use_itn, svs_itn);
               } else {
                 WebsocketClient<websocketpp::config::asio_client> c(is_ssl);
 
-                c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, false, use_itn);
+                c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, false, use_itn, svs_itn);
               }
             });
       }
diff --git a/runtime/websocket/bin/funasr-wss-server-2pass.cpp b/runtime/websocket/bin/funasr-wss-server-2pass.cpp
index 9c59254..edf614a 100644
--- a/runtime/websocket/bin/funasr-wss-server-2pass.cpp
+++ b/runtime/websocket/bin/funasr-wss-server-2pass.cpp
@@ -276,6 +276,12 @@
             s_itn_path="";
             s_lm_path="";
         }
+        found = s_offline_asr_path.find(MODEL_SVS);
+        if (found != std::string::npos) {
+            model_path["model-revision"]="v2.0.5";
+            s_lm_path="";
+            model_path[LM_DIR]="";
+        }        
 
         if (access(s_offline_asr_path.c_str(), F_OK) == 0) {
           // local
diff --git a/runtime/websocket/bin/websocket-server-2pass.cpp b/runtime/websocket/bin/websocket-server-2pass.cpp
index 8c8cab4..ff23e9d 100644
--- a/runtime/websocket/bin/websocket-server-2pass.cpp
+++ b/runtime/websocket/bin/websocket-server-2pass.cpp
@@ -111,7 +111,9 @@
     int audio_fs,
     std::string wav_format,
     FUNASR_HANDLE& tpass_online_handle,
-    FUNASR_DEC_HANDLE& decoder_handle) {
+    FUNASR_DEC_HANDLE& decoder_handle,
+    std::string svs_lang,
+    bool sys_itn) {
   // lock for each connection
   if(!tpass_online_handle){
     scoped_lock guard(thread_lock);
@@ -140,7 +142,8 @@
                                        subvector.data(), subvector.size(),
                                        punc_cache, false, audio_fs,
                                        wav_format, (ASR_TYPE)asr_mode_,
-                                       hotwords_embedding, itn, decoder_handle);
+                                       hotwords_embedding, itn, decoder_handle,
+                                       svs_lang, sys_itn);
 
         } else {
           scoped_lock guard(thread_lock);
@@ -177,7 +180,8 @@
                                        buffer.data(), buffer.size(), punc_cache,
                                        is_final, audio_fs,
                                        wav_format, (ASR_TYPE)asr_mode_,
-                                       hotwords_embedding, itn, decoder_handle);
+                                       hotwords_embedding, itn, decoder_handle,
+                                       svs_lang, sys_itn);
         } else {
           scoped_lock guard(thread_lock);
           msg["access_num"]=(int)msg["access_num"]-1;	 
@@ -250,6 +254,8 @@
     data_msg->msg["audio_fs"] = 16000; // default is 16k
     data_msg->msg["access_num"] = 0; // the number of access for this object, when it is 0, we can free it saftly
     data_msg->msg["is_eof"]=false; // if this connection is closed
+    data_msg->msg["svs_lang"]="auto";
+    data_msg->msg["svs_itn"]=true;
     FUNASR_DEC_HANDLE decoder_handle =
       FunASRWfstDecoderInit(tpass_handle, ASR_TWO_PASS, global_beam_, lattice_beam_, am_scale_);
     data_msg->decoder_handle = decoder_handle;
@@ -475,6 +481,12 @@
       if (jsonresult.contains("itn")) {
         msg_data->msg["itn"] = jsonresult["itn"];
       }
+      if (jsonresult.contains("svs_lang")) {
+        msg_data->msg["svs_lang"] = jsonresult["svs_lang"];
+      }
+      if (jsonresult.contains("svs_itn")) {
+        msg_data->msg["svs_itn"] = jsonresult["svs_itn"];
+      }
       LOG(INFO) << "jsonresult=" << jsonresult
                 << ", msg_data->msg=" << msg_data->msg;
       if ((jsonresult["is_speaking"] == false ||
@@ -499,7 +511,9 @@
                         msg_data->msg["audio_fs"],
                         msg_data->msg["wav_format"],
                         std::ref(msg_data->tpass_online_handle),
-                        std::ref(msg_data->decoder_handle)));
+                        std::ref(msg_data->decoder_handle),
+                        msg_data->msg["svs_lang"],
+                        msg_data->msg["svs_itn"]));
 		      msg_data->msg["access_num"]=(int)(msg_data->msg["access_num"])+1;
         }
         catch (std::exception const &e)
@@ -547,7 +561,9 @@
                                   msg_data->msg["audio_fs"],
                                   msg_data->msg["wav_format"],
                                   std::ref(msg_data->tpass_online_handle),
-                                  std::ref(msg_data->decoder_handle)));
+                                  std::ref(msg_data->decoder_handle),
+                                  msg_data->msg["svs_lang"],
+                                  msg_data->msg["svs_itn"]));
               msg_data->msg["access_num"]=(int)(msg_data->msg["access_num"])+1;
             }
           }
diff --git a/runtime/websocket/bin/websocket-server-2pass.h b/runtime/websocket/bin/websocket-server-2pass.h
index 7938f88..e61a93b 100644
--- a/runtime/websocket/bin/websocket-server-2pass.h
+++ b/runtime/websocket/bin/websocket-server-2pass.h
@@ -125,7 +125,9 @@
                   int audio_fs,
                   std::string wav_format,
                   FUNASR_HANDLE& tpass_online_handle,
-                  FUNASR_DEC_HANDLE& decoder_handle);
+                  FUNASR_DEC_HANDLE& decoder_handle,
+                  std::string svs_lang,
+                  bool sys_itn);
 
   void initAsr(std::map<std::string, std::string>& model_path, int thread_num);
   void on_message(websocketpp::connection_hdl hdl, message_ptr msg);

--
Gitblit v1.9.1