From edcd1a7292ce5f7c4a05d4144cfb15ab102f93db Mon Sep 17 00:00:00 2001
From: mengzhe.cmz <mengzhe.cmz@alibaba-inc.com>
Date: Tue, 25 Jul 2023 15:41:48 +0800
Subject: [PATCH] large punc model modelscope pipeline
---
egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt | 3 +
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt | 3 +
funasr/datasets/preprocessor.py | 3
egs_modelscope/punctuation/TEMPLATE/infer.sh | 4
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/utils | 1
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/README.md | 1
egs_modelscope/punctuation/TEMPLATE/infer.py | 4
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.py | 25 ++++++++
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/demo.py | 22 +++++++
egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/utils | 1
egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.sh | 68 ++++++++++++++++++++++
11 files changed, 129 insertions(+), 6 deletions(-)
diff --git a/egs_modelscope/punctuation/TEMPLATE/infer.py b/egs_modelscope/punctuation/TEMPLATE/infer.py
index edcefbe..eeea170 100644
--- a/egs_modelscope/punctuation/TEMPLATE/infer.py
+++ b/egs_modelscope/punctuation/TEMPLATE/infer.py
@@ -15,9 +15,9 @@
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--model', type=str, default="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
+ parser.add_argument('--model', type=str, default="damo/punc_ct-transformer_cn-en-common-vocab471067-large")
parser.add_argument('--text_in', type=str, default="./data/test/punc.txt")
parser.add_argument('--output_dir', type=str, default="./results/")
parser.add_argument('--gpuid', type=str, default="0")
args = parser.parse_args()
- modelscope_infer(args)
\ No newline at end of file
+ modelscope_infer(args)
diff --git a/egs_modelscope/punctuation/TEMPLATE/infer.sh b/egs_modelscope/punctuation/TEMPLATE/infer.sh
index 0af502e..03bf3f4 100644
--- a/egs_modelscope/punctuation/TEMPLATE/infer.sh
+++ b/egs_modelscope/punctuation/TEMPLATE/infer.sh
@@ -7,7 +7,7 @@
stage=1
stop_stage=2
model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-data_dir="./data/test"
+data_dir="./data"
output_dir="./results"
gpu_inference=true # whether to perform gpu decoding
gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
@@ -32,7 +32,7 @@
for JOB in $(seq ${nj}); do
split_scps="$split_scps $output_dir/split/text.$JOB.scp"
done
-perl utils/split_scp.pl ${data_dir}/punc.txt ${split_scps}
+perl utils/split_scp.pl ${data_dir}/punc_example.txt ${split_scps}
if [ -n "${checkpoint_dir}" ]; then
python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/README.md b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/README.md
new file mode 120000
index 0000000..92088a2
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt
new file mode 100644
index 0000000..367be79
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt
@@ -0,0 +1,3 @@
+1 璺ㄥ娌虫祦鏄吇鑲叉部宀镐汉姘戠殑鐢熷懡涔嬫簮闀挎湡浠ユ潵涓哄府鍔╀笅娓稿湴鍖洪槻鐏惧噺鐏句腑鏂规妧鏈汉鍛樺湪涓婃父鍦板尯鏋佷负鎭跺姡鐨勮嚜鐒舵潯浠朵笅鍏嬫湇宸ㄥぇ鍥伴毦鐢氳嚦鍐掔潃鐢熷懡鍗遍櫓鍚戝嵃鏂规彁渚涙睕鏈熸按鏂囪祫鏂欏鐞嗙揣鎬ヤ簨浠朵腑鏂归噸瑙嗗嵃鏂瑰湪璺ㄥ娌虫祦闂涓婄殑鍏冲垏鎰挎剰杩涗竴姝ュ畬鍠勫弻鏂硅仈鍚堝伐浣滄満鍒跺嚒鏄腑鏂硅兘鍋氱殑鎴戜滑閮戒細鍘诲仛鑰屼笖浼氬仛寰楁洿濂芥垜璇峰嵃搴︽湅鍙嬩滑鏀惧績涓浗鍦ㄤ笂娓哥殑浠讳綍寮�鍙戝埄鐢ㄩ兘浼氱粡杩囩瀛﹁鍒掑拰璁鸿瘉鍏奸【涓婁笅娓哥殑鍒╃泭
+2 浠庡瓨鍌ㄤ笂鏉ヨ浠呬粎鏄叏鏅浘鐗囧畠灏变細鏄浘鐗囩殑鍥涘�嶇殑瀹归噺鐒跺悗鍏ㄦ櫙鐨勮棰戜細鏄櫘閫氳棰戝叓鍊嶇殑杩欎釜瀛樺偍鐨勫瑕佹眰鑰屼笁d鐨勬ā鍨嬩細鏄浘鐗囩殑鍗佸�嶈繖閮藉鎴戜滑浠婂ぉ杩愯鍦ㄧ殑浜戣绠楃殑骞冲彴瀛樺偍鐨勫钩鍙版彁鍑轰簡鏇撮珮鐨勮姹�
+3 閭d粖澶╃殑浼氬氨鍒拌繖閲屽惂 happy new year 鏄庡勾瑙�
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/demo.py b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/demo.py
new file mode 100644
index 0000000..070f2ee
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/demo.py
@@ -0,0 +1,22 @@
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+ task=Tasks.punctuation,
+ model='damo/punc_ct-transformer_cn-en-common-vocab471067-large',
+ model_revision="v1.0.0",
+ output_dir="./tmp/"
+)
+
+##################text.scp###################
+# inputs = "./egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/data/punc_example.txt"
+
+##################text#####################
+#inputs = "鎴戜滑閮芥槸鏈ㄥご浜轰笉浼氳璇濅笉浼氬姩"
+
+##################text file url#######################
+inputs = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt"
+
+rec_result = inference_pipeline(text_in=inputs)
+print(rec_result)
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.py
new file mode 100644
index 0000000..47503c2
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.py
@@ -0,0 +1,25 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+ inference_pipeline = pipeline(
+ task=Tasks.punctuation,
+ model=args.model,
+ model_revision=args.model_revision,
+ output_dir=args.output_dir,
+ )
+ inference_pipeline(text_in=args.text_in)
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model', type=str, default="damo/punc_ct-transformer_cn-en-common-vocab471067-large")
+ parser.add_argument('--text_in', type=str, default="./data/test/punc.txt")
+ parser.add_argument('--model_revision', type=str, default=None)
+ parser.add_argument('--output_dir', type=str, default="./results/")
+ parser.add_argument('--gpuid', type=str, default="0")
+ args = parser.parse_args()
+ modelscope_infer(args)
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.sh b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.sh
new file mode 100644
index 0000000..93296a5
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/infer.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/punc_ct-transformer_cn-en-common-vocab471067-large"
+model_revision="v1.0.0"
+data_dir="./data"
+output_dir="./results"
+gpu_inference=true # whether to perform gpu decoding
+gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
+njob=64 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+checkpoint_dir=
+checkpoint_name="punc.pb"
+
+. utils/parse_options.sh || exit 1;
+
+if [ "${gpu_inference}" = "true" ]; then
+ nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+ nj=$njob
+ gpuid_list=""
+ for JOB in $(seq ${nj}); do
+ gpuid_list=$gpuid_list"-1,"
+ done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+ split_scps="$split_scps $output_dir/split/text.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/punc_example.txt ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+ python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+ model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+ echo "Decoding ..."
+ gpuid_list_array=(${gpuid_list//,/ })
+ for JOB in $(seq ${nj}); do
+ {
+ id=$((JOB-1))
+ gpuid=${gpuid_list_array[$id]}
+ mkdir -p ${output_dir}/output.$JOB
+ python infer.py \
+ --model ${model} \
+ --text_in ${output_dir}/split/text.$JOB.scp \
+ --output_dir ${output_dir}/output.$JOB \
+                  --model_revision ${model_revision} \
+                  --gpuid ${gpuid}
+ }&
+ done
+ wait
+
+ mkdir -p ${output_dir}/final_res
+ if [ -f "${output_dir}/output.1/infer.out" ]; then
+ for i in $(seq "${nj}"); do
+ cat "${output_dir}/output.${i}/infer.out"
+ done | sort -k1 >"${output_dir}/final_res/infer.out"
+ fi
+fi
+
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/utils b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/utils
new file mode 120000
index 0000000..dc7d417
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_cn-en-common-vocab471067-large/utils
@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt
new file mode 100644
index 0000000..367be79
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt
@@ -0,0 +1,3 @@
+1 璺ㄥ娌虫祦鏄吇鑲叉部宀镐汉姘戠殑鐢熷懡涔嬫簮闀挎湡浠ユ潵涓哄府鍔╀笅娓稿湴鍖洪槻鐏惧噺鐏句腑鏂规妧鏈汉鍛樺湪涓婃父鍦板尯鏋佷负鎭跺姡鐨勮嚜鐒舵潯浠朵笅鍏嬫湇宸ㄥぇ鍥伴毦鐢氳嚦鍐掔潃鐢熷懡鍗遍櫓鍚戝嵃鏂规彁渚涙睕鏈熸按鏂囪祫鏂欏鐞嗙揣鎬ヤ簨浠朵腑鏂归噸瑙嗗嵃鏂瑰湪璺ㄥ娌虫祦闂涓婄殑鍏冲垏鎰挎剰杩涗竴姝ュ畬鍠勫弻鏂硅仈鍚堝伐浣滄満鍒跺嚒鏄腑鏂硅兘鍋氱殑鎴戜滑閮戒細鍘诲仛鑰屼笖浼氬仛寰楁洿濂芥垜璇峰嵃搴︽湅鍙嬩滑鏀惧績涓浗鍦ㄤ笂娓哥殑浠讳綍寮�鍙戝埄鐢ㄩ兘浼氱粡杩囩瀛﹁鍒掑拰璁鸿瘉鍏奸【涓婁笅娓哥殑鍒╃泭
+2 浠庡瓨鍌ㄤ笂鏉ヨ浠呬粎鏄叏鏅浘鐗囧畠灏变細鏄浘鐗囩殑鍥涘�嶇殑瀹归噺鐒跺悗鍏ㄦ櫙鐨勮棰戜細鏄櫘閫氳棰戝叓鍊嶇殑杩欎釜瀛樺偍鐨勫瑕佹眰鑰屼笁d鐨勬ā鍨嬩細鏄浘鐗囩殑鍗佸�嶈繖閮藉鎴戜滑浠婂ぉ杩愯鍦ㄧ殑浜戣绠楃殑骞冲彴瀛樺偍鐨勫钩鍙版彁鍑轰簡鏇撮珮鐨勮姹�
+3 閭d粖澶╃殑浼氬氨鍒拌繖閲屽惂 happy new year 鏄庡勾瑙�
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/utils b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/utils
new file mode 120000
index 0000000..dc7d417
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/utils
@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils
\ No newline at end of file
diff --git a/funasr/datasets/preprocessor.py b/funasr/datasets/preprocessor.py
index bd2c972..c6623f8 100644
--- a/funasr/datasets/preprocessor.py
+++ b/funasr/datasets/preprocessor.py
@@ -11,7 +11,7 @@
import numpy as np
import scipy.signal
import soundfile
-
+import jieba
from funasr.text.build_tokenizer import build_tokenizer
from funasr.text.cleaner import TextCleaner
@@ -659,7 +659,6 @@
self.split_text_name = split_text_name
self.seg_jieba = seg_jieba
if self.seg_jieba:
- import jieba
jieba.load_userdict(seg_dict_file)
@classmethod
--
Gitblit v1.9.1