From 5b7c0c17b476b5cf659ca7371a0c3af5d13e8b50 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期三, 26 七月 2023 14:54:17 +0800
Subject: [PATCH] Merge branch 'main' of github.com:alibaba-damo-academy/FunASR add

---
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt           |    3 
 funasr/runtime/python/onnxruntime/demo_punc_offline.py                                                          |    3 
 egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/README.md                         |    1 
 funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py                                                       |   13 ++
 egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.sh                          |   68 +++++++++++++
 funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py                                                    |   61 ++++++++++++
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py |    2 
 egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt             |    3 
 funasr/datasets/preprocessor.py                                                                                 |    3 
 egs_modelscope/punctuation/TEMPLATE/infer.sh                                                                    |    4 
 egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/utils                             |    1 
 funasr/runtime/websocket/readme.md                                                                              |    2 
 egs/aishell/branchformer/README.md                                                                              |   15 +++
 egs/aishell/e_branchformer/README.md                                                                            |   15 +++
 egs_modelscope/punctuation/TEMPLATE/infer.py                                                                    |    4 
 egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.py                          |   25 +++++
 egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/demo.py                           |   22 ++++
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/utils                           |    1 
 18 files changed, 234 insertions(+), 12 deletions(-)

diff --git a/egs/aishell/branchformer/README.md b/egs/aishell/branchformer/README.md
new file mode 100644
index 0000000..90bba97
--- /dev/null
+++ b/egs/aishell/branchformer/README.md
@@ -0,0 +1,15 @@
+# Branchformer Result
+
+## Training Config
+- Feature info: using raw speech, extracting 80 dims fbank online, global cmvn, speed perturb(0.9, 1.0, 1.1), specaugment
+- Train info: lr 0.001, batch_size 10000, 4 gpu(Tesla V100), acc_grad 1, 180 epochs
+- Train config: conf/train_asr_branchformer.yaml
+- LM config: LM was not used
+
+## Results (CER)
+- Decode config: conf/decode_asr_transformer.yaml (ctc weight:0.4)
+
+|   testset   | CER(%)  |
+|:-----------:|:-------:|
+|     dev     |  4.15   |
+|    test     |  4.51   |
\ No newline at end of file
diff --git a/egs/aishell/e_branchformer/README.md b/egs/aishell/e_branchformer/README.md
new file mode 100644
index 0000000..9b6b238
--- /dev/null
+++ b/egs/aishell/e_branchformer/README.md
@@ -0,0 +1,15 @@
+# E-Branchformer Result
+
+## Training Config
+- Feature info: using raw speech, extracting 80 dims fbank online, global cmvn, speed perturb(0.9, 1.0, 1.1), specaugment
+- Train info: lr 0.001, batch_size 10000, 4 gpu(Tesla V100), acc_grad 1, 180 epochs
+- Train config: conf/train_asr_e_branchformer.yaml
+- LM config: LM was not used
+
+## Results (CER)
+- Decode config: conf/decode_asr_transformer.yaml (ctc weight:0.4)
+
+|   testset   | CER(%)  |
+|:-----------:|:-------:|
+|     dev     |  4.10   |
+|    test     |  4.52   |
\ No newline at end of file
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
index 369c863..510e5ed 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
@@ -8,7 +8,7 @@
         task=Tasks.auto_speech_recognition,
         model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
         vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
-        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
+        punc_model='damo/punc_ct-transformer_cn-en-common-vocab471067-large',
         output_dir=output_dir,
     )
     rec_result = inference_pipeline(audio_in=audio_in, batch_size_token=5000, batch_size_token_threshold_s=40)
diff --git a/egs_modelscope/punctuation/TEMPLATE/infer.py b/egs_modelscope/punctuation/TEMPLATE/infer.py
index edcefbe..eeea170 100644
--- a/egs_modelscope/punctuation/TEMPLATE/infer.py
+++ b/egs_modelscope/punctuation/TEMPLATE/infer.py
@@ -15,9 +15,9 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model', type=str, default="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
+    parser.add_argument('--model', type=str, default="damo/punc_ct-transformer_cn-en-common-vocab471067-large")
     parser.add_argument('--text_in', type=str, default="./data/test/punc.txt")
     parser.add_argument('--output_dir', type=str, default="./results/")
     parser.add_argument('--gpuid', type=str, default="0")
     args = parser.parse_args()
-    modelscope_infer(args)
\ No newline at end of file
+    modelscope_infer(args)
diff --git a/egs_modelscope/punctuation/TEMPLATE/infer.sh b/egs_modelscope/punctuation/TEMPLATE/infer.sh
index 0af502e..03bf3f4 100644
--- a/egs_modelscope/punctuation/TEMPLATE/infer.sh
+++ b/egs_modelscope/punctuation/TEMPLATE/infer.sh
@@ -7,7 +7,7 @@
 stage=1
 stop_stage=2
 model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-data_dir="./data/test"
+data_dir="./data"
 output_dir="./results"
 gpu_inference=true    # whether to perform gpu decoding
 gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
@@ -32,7 +32,7 @@
 for JOB in $(seq ${nj}); do
     split_scps="$split_scps $output_dir/split/text.$JOB.scp"
 done
-perl utils/split_scp.pl ${data_dir}/punc.txt ${split_scps}
+perl utils/split_scp.pl ${data_dir}/punc_example.txt ${split_scps}
 
 if [ -n "${checkpoint_dir}" ]; then
   python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/README.md b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/README.md
new file mode 120000
index 0000000..92088a2
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt
new file mode 100644
index 0000000..367be79
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt
@@ -0,0 +1,3 @@
+1	璺ㄥ娌虫祦鏄吇鑲叉部宀镐汉姘戠殑鐢熷懡涔嬫簮闀挎湡浠ユ潵涓哄府鍔╀笅娓稿湴鍖洪槻鐏惧噺鐏句腑鏂规妧鏈汉鍛樺湪涓婃父鍦板尯鏋佷负鎭跺姡鐨勮嚜鐒舵潯浠朵笅鍏嬫湇宸ㄥぇ鍥伴毦鐢氳嚦鍐掔潃鐢熷懡鍗遍櫓鍚戝嵃鏂规彁渚涙睕鏈熸按鏂囪祫鏂欏鐞嗙揣鎬ヤ簨浠朵腑鏂归噸瑙嗗嵃鏂瑰湪璺ㄥ娌虫祦闂涓婄殑鍏冲垏鎰挎剰杩涗竴姝ュ畬鍠勫弻鏂硅仈鍚堝伐浣滄満鍒跺嚒鏄腑鏂硅兘鍋氱殑鎴戜滑閮戒細鍘诲仛鑰屼笖浼氬仛寰楁洿濂芥垜璇峰嵃搴︽湅鍙嬩滑鏀惧績涓浗鍦ㄤ笂娓哥殑浠讳綍寮�鍙戝埄鐢ㄩ兘浼氱粡杩囩瀛﹁鍒掑拰璁鸿瘉鍏奸【涓婁笅娓哥殑鍒╃泭
+2	浠庡瓨鍌ㄤ笂鏉ヨ浠呬粎鏄叏鏅浘鐗囧畠灏变細鏄浘鐗囩殑鍥涘�嶇殑瀹归噺鐒跺悗鍏ㄦ櫙鐨勮棰戜細鏄櫘閫氳棰戝叓鍊嶇殑杩欎釜瀛樺偍鐨勫瑕佹眰鑰屼笁d鐨勬ā鍨嬩細鏄浘鐗囩殑鍗佸�嶈繖閮藉鎴戜滑浠婂ぉ杩愯鍦ㄧ殑浜戣绠楃殑骞冲彴瀛樺偍鐨勫钩鍙版彁鍑轰簡鏇撮珮鐨勮姹�
+3	閭d粖澶╃殑浼氬氨鍒拌繖閲屽惂 happy new year 鏄庡勾瑙�
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/demo.py b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/demo.py
new file mode 100644
index 0000000..070f2ee
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/demo.py
@@ -0,0 +1,22 @@
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.punctuation,
+    model='damo/punc_ct-transformer_cn-en-common-vocab471067-large',
+    model_revision="v1.0.0",
+    output_dir="./tmp/"
+)
+
+##################text.scp###################
+# inputs = "./egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt"
+
+##################text#####################
+#inputs = "鎴戜滑閮芥槸鏈ㄥご浜轰笉浼氳璇濅笉浼氬姩"
+
+##################text file url#######################
+inputs = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt"
+
+rec_result = inference_pipeline(text_in=inputs)
+print(rec_result)
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.py
new file mode 100644
index 0000000..47503c2
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.py
@@ -0,0 +1,25 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+    inference_pipeline = pipeline(
+        task=Tasks.punctuation,
+        model=args.model,
+        model_revision=args.model_revision,        
+        output_dir=args.output_dir,
+    )
+    inference_pipeline(text_in=args.text_in)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, default="damo/punc_ct-transformer_cn-en-common-vocab471067-large")
+    parser.add_argument('--text_in', type=str, default="./data/test/punc.txt")
+    parser.add_argument('--model_revision', type=str, default=None)
+    parser.add_argument('--output_dir', type=str, default="./results/")
+    parser.add_argument('--gpuid', type=str, default="0")
+    args = parser.parse_args()
+    modelscope_infer(args)
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.sh b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.sh
new file mode 100644
index 0000000..93296a5
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/punc_ct-transformer_cn-en-common-vocab471067-large"
+model_revision="v1.0.0"
+data_dir="./data"
+output_dir="./results"
+gpu_inference=true    # whether to perform gpu decoding
+gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
+njob=64    # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+checkpoint_dir=
+checkpoint_name="punc.pb"
+
+. utils/parse_options.sh || exit 1;
+
+if [ "${gpu_inference}" == "true" ]; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/text.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/punc_example.txt ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+  model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        python infer.py \
+            --model ${model} \
+            --text_in ${output_dir}/split/text.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --model_revision ${model_revision} \
+            --gpuid ${gpuid}
+        }&
+    done
+    wait
+
+    mkdir -p ${output_dir}/final_res
+    if [ -f "${output_dir}/output.1/infer.out" ]; then
+      for i in $(seq "${nj}"); do
+          cat "${output_dir}/output.${i}/infer.out"
+      done | sort -k1 >"${output_dir}/final_res/infer.out"
+    fi
+fi
+
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/utils b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/utils
new file mode 120000
index 0000000..dc7d417
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/utils
@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt
new file mode 100644
index 0000000..367be79
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt
@@ -0,0 +1,3 @@
+1	璺ㄥ娌虫祦鏄吇鑲叉部宀镐汉姘戠殑鐢熷懡涔嬫簮闀挎湡浠ユ潵涓哄府鍔╀笅娓稿湴鍖洪槻鐏惧噺鐏句腑鏂规妧鏈汉鍛樺湪涓婃父鍦板尯鏋佷负鎭跺姡鐨勮嚜鐒舵潯浠朵笅鍏嬫湇宸ㄥぇ鍥伴毦鐢氳嚦鍐掔潃鐢熷懡鍗遍櫓鍚戝嵃鏂规彁渚涙睕鏈熸按鏂囪祫鏂欏鐞嗙揣鎬ヤ簨浠朵腑鏂归噸瑙嗗嵃鏂瑰湪璺ㄥ娌虫祦闂涓婄殑鍏冲垏鎰挎剰杩涗竴姝ュ畬鍠勫弻鏂硅仈鍚堝伐浣滄満鍒跺嚒鏄腑鏂硅兘鍋氱殑鎴戜滑閮戒細鍘诲仛鑰屼笖浼氬仛寰楁洿濂芥垜璇峰嵃搴︽湅鍙嬩滑鏀惧績涓浗鍦ㄤ笂娓哥殑浠讳綍寮�鍙戝埄鐢ㄩ兘浼氱粡杩囩瀛﹁鍒掑拰璁鸿瘉鍏奸【涓婁笅娓哥殑鍒╃泭
+2	浠庡瓨鍌ㄤ笂鏉ヨ浠呬粎鏄叏鏅浘鐗囧畠灏变細鏄浘鐗囩殑鍥涘�嶇殑瀹归噺鐒跺悗鍏ㄦ櫙鐨勮棰戜細鏄櫘閫氳棰戝叓鍊嶇殑杩欎釜瀛樺偍鐨勫瑕佹眰鑰屼笁d鐨勬ā鍨嬩細鏄浘鐗囩殑鍗佸�嶈繖閮藉鎴戜滑浠婂ぉ杩愯鍦ㄧ殑浜戣绠楃殑骞冲彴瀛樺偍鐨勫钩鍙版彁鍑轰簡鏇撮珮鐨勮姹�
+3	閭d粖澶╃殑浼氬氨鍒拌繖閲屽惂 happy new year 鏄庡勾瑙�
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/utils b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/utils
new file mode 120000
index 0000000..dc7d417
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/utils
@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils
\ No newline at end of file
diff --git a/funasr/datasets/preprocessor.py b/funasr/datasets/preprocessor.py
index bd2c972..c6623f8 100644
--- a/funasr/datasets/preprocessor.py
+++ b/funasr/datasets/preprocessor.py
@@ -11,7 +11,7 @@
 import numpy as np
 import scipy.signal
 import soundfile
-
+import jieba
 
 from funasr.text.build_tokenizer import build_tokenizer
 from funasr.text.cleaner import TextCleaner
@@ -659,7 +659,6 @@
         self.split_text_name = split_text_name
         self.seg_jieba = seg_jieba
         if self.seg_jieba:
-            import jieba
             jieba.load_userdict(seg_dict_file)
 
     @classmethod
diff --git a/funasr/runtime/python/onnxruntime/demo_punc_offline.py b/funasr/runtime/python/onnxruntime/demo_punc_offline.py
index dabbb8e..c56b484 100644
--- a/funasr/runtime/python/onnxruntime/demo_punc_offline.py
+++ b/funasr/runtime/python/onnxruntime/demo_punc_offline.py
@@ -1,6 +1,7 @@
 from funasr_onnx import CT_Transformer
 
-model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+#model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+model_dir = "damo/punc_ct-transformer_cn-en-common-vocab471067-large"
 model = CT_Transformer(model_dir)
 
 text_in="璺ㄥ娌虫祦鏄吇鑲叉部宀镐汉姘戠殑鐢熷懡涔嬫簮闀挎湡浠ユ潵涓哄府鍔╀笅娓稿湴鍖洪槻鐏惧噺鐏句腑鏂规妧鏈汉鍛樺湪涓婃父鍦板尯鏋佷负鎭跺姡鐨勮嚜鐒舵潯浠朵笅鍏嬫湇宸ㄥぇ鍥伴毦鐢氳嚦鍐掔潃鐢熷懡鍗遍櫓鍚戝嵃鏂规彁渚涙睕鏈熸按鏂囪祫鏂欏鐞嗙揣鎬ヤ簨浠朵腑鏂归噸瑙嗗嵃鏂瑰湪璺ㄥ娌虫祦闂涓婄殑鍏冲垏鎰挎剰杩涗竴姝ュ畬鍠勫弻鏂硅仈鍚堝伐浣滄満鍒跺嚒鏄腑鏂硅兘鍋氱殑鎴戜滑閮戒細鍘诲仛鑰屼笖浼氬仛寰楁洿濂芥垜璇峰嵃搴︽湅鍙嬩滑鏀惧績涓浗鍦ㄤ笂娓哥殑浠讳綍寮�鍙戝埄鐢ㄩ兘浼氱粡杩囩瀛﹁鍒掑拰璁鸿瘉鍏奸【涓婁笅娓哥殑鍒╃泭"
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py b/funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py
index 777de4f..cc5daa8 100644
--- a/funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py
@@ -10,7 +10,7 @@
 from .utils.utils import (ONNXRuntimeError,
                           OrtInferSession, get_logger,
                           read_yaml)
-from .utils.utils import (TokenIDConverter, split_to_mini_sentence,code_mix_split_words)
+from .utils.utils import (TokenIDConverter, split_to_mini_sentence,code_mix_split_words,code_mix_split_words_jieba)
 logging = get_logger()
 
 
@@ -65,9 +65,18 @@
                 self.punc_list[i] = "锛�"
             elif self.punc_list[i] == "銆�":
                 self.period = i
+        if "seg_jieba" in config:
+            self.seg_jieba = True
+            self.jieba_usr_dict_path = os.path.join(model_dir, 'jieba_usr_dict')
+            self.code_mix_split_words_jieba = code_mix_split_words_jieba(self.jieba_usr_dict_path)
+        else:
+            self.seg_jieba = False
 
     def __call__(self, text: Union[list, str], split_size=20):
-        split_text = code_mix_split_words(text)
+        if self.seg_jieba:
+            split_text = self.code_mix_split_words_jieba(text)
+        else:
+            split_text = code_mix_split_words(text)
         split_text_id = self.converter.tokens2ids(split_text)
         mini_sentences = split_to_mini_sentence(split_text, split_size)
         mini_sentences_id = split_to_mini_sentence(split_text_id, split_size)
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
index 9284943..170126d 100644
--- a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
@@ -6,11 +6,12 @@
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
 
+import re
 import numpy as np
 import yaml
 from onnxruntime import (GraphOptimizationLevel, InferenceSession,
                          SessionOptions, get_available_providers, get_device)
-
+import jieba
 import warnings
 
 root_dir = Path(__file__).resolve().parent
@@ -230,6 +231,64 @@
             words.append(current_word)
     return words
 
+def isEnglish(text:str):
+    if re.search('^[a-zA-Z\']+$', text):
+        return True
+    else:
+        return False
+
+def join_chinese_and_english(input_list):
+    line = ''
+    for token in input_list:
+        if isEnglish(token):
+            line = line + ' ' + token
+        else:
+            line = line + token
+
+    line = line.strip()
+    return line
+
+def code_mix_split_words_jieba(seg_dict_file: str):
+    jieba.load_userdict(seg_dict_file)
+
+    def _fn(text: str):
+        input_list = text.split()
+        token_list_all = []
+        language_list = []
+        token_list_tmp = []
+        language_flag = None
+        for token in input_list:
+            if isEnglish(token) and language_flag == 'Chinese':
+                token_list_all.append(token_list_tmp)
+                language_list.append('Chinese')
+                token_list_tmp = []
+            elif not isEnglish(token) and language_flag == 'English':
+                token_list_all.append(token_list_tmp)
+                language_list.append('English')
+                token_list_tmp = []
+    
+            token_list_tmp.append(token)
+    
+            if isEnglish(token):
+                language_flag = 'English'
+            else:
+                language_flag = 'Chinese'
+    
+        if token_list_tmp:
+            token_list_all.append(token_list_tmp)
+            language_list.append(language_flag)
+    
+        result_list = []
+        for token_list_tmp, language_flag in zip(token_list_all, language_list):
+            if language_flag == 'English':
+                result_list.extend(token_list_tmp)
+            else:
+                seg_list = jieba.cut(join_chinese_and_english(token_list_tmp), HMM=False)
+                result_list.extend(seg_list)
+    
+        return result_list
+    return _fn
+
 def read_yaml(yaml_path: Union[str, Path]) -> Dict:
     if not Path(yaml_path).exists():
         raise FileExistsError(f'The {yaml_path} does not exist.')
diff --git a/funasr/runtime/websocket/readme.md b/funasr/runtime/websocket/readme.md
index 3dc28ae..12d255c 100644
--- a/funasr/runtime/websocket/readme.md
+++ b/funasr/runtime/websocket/readme.md
@@ -38,7 +38,7 @@
 wget https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/sample/funasr_samples.tar.gz
 ```
 
-We take the Python language client as an example to explain. It supports various audio formats (.wav, .pcm, .mp3, etc.), video input (.mp4, etc.), and multi-file list wav.scp input. For other versions of clients, please refer to the ([docs](##client-usage)).
+We take the Python language client as an example to explain. It supports various audio formats (.wav, .pcm, .mp3, etc.), video input (.mp4, etc.), and multi-file list wav.scp input. For other versions of clients, please refer to the ([docs](#client-usage)).
 
 ```shell
 python3 wss_client_asr.py --host "127.0.0.1" --port 10095 --mode offline --audio_in "../audio/asr_example.wav"

--
Gitblit v1.9.1