From d8b586e02cd14f7eed6b330bd4f110cb1e7f24ad Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Tue, 09 Jan 2024 20:33:12 +0800
Subject: [PATCH] funasr1.0: load ModelScope models by id/revision and support URL inputs in examples
---
examples/industrial_data_pretraining/paraformer/infer.sh | 13 -
examples/industrial_data_pretraining/ct_transformer/demo.py | 4
examples/industrial_data_pretraining/fsmn_vad/infer.sh | 12 -
funasr/download/download_from_hub.py | 6
funasr/models/ct_transformer/model.py | 7
examples/industrial_data_pretraining/fsmn_vad/demo.py | 4
examples/industrial_data_pretraining/ct_transformer/infer.sh | 13 -
funasr/download/file.py | 16 ++
examples/industrial_data_pretraining/contextual_paraformer/demo.py | 4
examples/industrial_data_pretraining/monotonic_aligner/demo.py | 4
funasr/utils/load_utils.py | 54 ++------
examples/industrial_data_pretraining/bicif_paraformer/infer.sh | 34 ++---
funasr/bin/inference.py | 17 +-
examples/industrial_data_pretraining/emotion2vec/demo.py | 4
examples/industrial_data_pretraining/paraformer/demo.py | 8
examples/industrial_data_pretraining/contextual_paraformer/infer.sh | 13 -
examples/industrial_data_pretraining/emotion2vec/infer.sh | 14 -
examples/industrial_data_pretraining/bicif_paraformer/demo.py | 11 +
funasr/models/monotonic_aligner/model.py | 11 +
examples/industrial_data_pretraining/seaco_paraformer/infer.sh | 30 ++--
examples/industrial_data_pretraining/seaco_paraformer/demo.py | 12 +
examples/industrial_data_pretraining/monotonic_aligner/infer.sh | 13 -
22 files changed, 137 insertions(+), 167 deletions(-)
diff --git a/examples/industrial_data_pretraining/bicif_paraformer/demo.py b/examples/industrial_data_pretraining/bicif_paraformer/demo.py
index 83e024e..4a5e333 100644
--- a/examples/industrial_data_pretraining/bicif_paraformer/demo.py
+++ b/examples/industrial_data_pretraining/bicif_paraformer/demo.py
@@ -5,10 +5,13 @@
from funasr import AutoModel
-model = AutoModel(model="../modelscope_models/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
- vad_model="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch",
- punc_model="../modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+model = AutoModel(model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+ model_revision="v2.0.0",
+ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+ vad_model_revision="v2.0.0",
+ punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+ punc_model_revision="v2.0.0",
)
-res = model(input="../modelscope_models/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", batch_size_s=300, batch_size_threshold_s=60)
+res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav", batch_size_s=300, batch_size_threshold_s=60)
print(res)
\ No newline at end of file
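
The demo above now resolves models from the ModelScope hub by id and revision instead of a pre-cloned local directory. If a local copy is still wanted (as the old demo assumed), it can be fetched explicitly first; the sketch below is only an illustration and assumes the separate modelscope package and its snapshot_download helper are installed, which this patch does not require.

# Hedged sketch: pre-download a ModelScope model, then point AutoModel at the local copy.
# snapshot_download comes from the modelscope package, not from this patch.
from modelscope.hub.snapshot_download import snapshot_download
from funasr import AutoModel

local_dir = snapshot_download(
    "damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    revision="v2.0.0")
model = AutoModel(model=local_dir)
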
diff --git a/examples/industrial_data_pretraining/bicif_paraformer/infer.sh b/examples/industrial_data_pretraining/bicif_paraformer/infer.sh
index 2ec237b..04cb6f2 100644
--- a/examples/industrial_data_pretraining/bicif_paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/bicif_paraformer/infer.sh
@@ -1,27 +1,21 @@
-# download model
-local_path_root=../modelscope_models
-mkdir -p ${local_path_root}
-
-local_path=${local_path_root}/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
-git clone https://www.modelscope.cn/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path}
-
-local_path_vad=${local_path_root}/speech_fsmn_vad_zh-cn-16k-common-pytorch
-git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git ${local_path_vad}
-
-local_path_punc=${local_path_root}/punc_ct-transformer_zh-cn-common-vocab272727-pytorch
-git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git ${local_path_punc}
-
+model="damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model_revision="v2.0.0"
+vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+vad_model_revision="v2.0.0"
+punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+punc_model_revision="v2.0.0"
python funasr/bin/inference.py \
-+model="${local_path}" \
-+vad_model="${local_path_vad}" \
-+punc_model="${local_path_punc}" \
-+input="${local_path}/example/asr_example.wav" \
++model=${model} \
++model_revision=${model_revision} \
++vad_model=${vad_model} \
++vad_model_revision=${vad_model_revision} \
++punc_model=${punc_model} \
++punc_model_revision=${punc_model_revision} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_vad_punc_example.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
+batch_size_s=300 \
-+batch_size_threshold_s=60 \
-+debug="true" \
-+"hotword='杈炬懇闄� 榄旀惌'"
++batch_size_threshold_s=60
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/demo.py b/examples/industrial_data_pretraining/contextual_paraformer/demo.py
index 2e6d203..c705ca8 100644
--- a/examples/industrial_data_pretraining/contextual_paraformer/demo.py
+++ b/examples/industrial_data_pretraining/contextual_paraformer/demo.py
@@ -5,8 +5,8 @@
from funasr import AutoModel
-model = AutoModel(model="../modelscope_models/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404")
+model = AutoModel(model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404", model_revision="v2.0.0")
-res = model(input="../modelscope_models/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/example/asr_example.wav",
+res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
            hotword='达摩院 魔搭')
print(res)
\ No newline at end of file
diff --git a/examples/industrial_data_pretraining/contextual_paraformer/infer.sh b/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
index 92d1a8b..158ce8a 100644
--- a/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/contextual_paraformer/infer.sh
@@ -1,14 +1,11 @@
-# download model
-local_path_root=../modelscope_models
-mkdir -p ${local_path_root}
-local_path=${local_path_root}/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404
-git clone https://www.modelscope.cn/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404.git ${local_path}
-
+model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
+model_revision="v2.0.0"
python funasr/bin/inference.py \
-+model="${local_path}" \
-+input="${local_path}/example/asr_example.wav" \
++model=${model} \
++model_revision=${model_revision} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
+"hotword='杈炬懇闄� 榄旀惌'"
diff --git a/examples/industrial_data_pretraining/ct_transformer/demo.py b/examples/industrial_data_pretraining/ct_transformer/demo.py
index d3b63db..58ebd2a 100644
--- a/examples/industrial_data_pretraining/ct_transformer/demo.py
+++ b/examples/industrial_data_pretraining/ct_transformer/demo.py
@@ -5,7 +5,7 @@
from funasr import AutoModel
-model = AutoModel(model="/Users/zhifu/Downloads/modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
+model = AutoModel(model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch", model_revision="v2.0.0")
-res = model(input="/Users/zhifu/Downloads/modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/example/punc_example.txt")
+res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt")
print(res)
\ No newline at end of file
diff --git a/examples/industrial_data_pretraining/ct_transformer/infer.sh b/examples/industrial_data_pretraining/ct_transformer/infer.sh
index bd8ac05..a48d562 100644
--- a/examples/industrial_data_pretraining/ct_transformer/infer.sh
+++ b/examples/industrial_data_pretraining/ct_transformer/infer.sh
@@ -1,13 +1,10 @@
-# download model
-local_path_root=../modelscope_models
-mkdir -p ${local_path_root}
-local_path=${local_path_root}/punc_ct-transformer_zh-cn-common-vocab272727-pytorch
-git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git ${local_path}
-
+model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+model_revision="v2.0.0"
python funasr/bin/inference.py \
-+model="${local_path}" \
-+input="${local_path}/example/punc_example.txt" \
++model=${model} \
++model_revision=${model_revision} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt" \
+output_dir="./outputs/debug" \
+device="cpu"
diff --git a/examples/industrial_data_pretraining/emotion2vec/demo.py b/examples/industrial_data_pretraining/emotion2vec/demo.py
index 4ce7025..3653313 100644
--- a/examples/industrial_data_pretraining/emotion2vec/demo.py
+++ b/examples/industrial_data_pretraining/emotion2vec/demo.py
@@ -5,7 +5,7 @@
from funasr import AutoModel
-model = AutoModel(model="../modelscope_models/emotion2vec_base")
+model = AutoModel(model="damo/emotion2vec_base", model_revision="v2.0.0")
-res = model(input="../modelscope_models/emotion2vec_base/example/test.wav", output_dir="./outputs")
+res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", output_dir="./outputs")
print(res)
\ No newline at end of file
diff --git a/examples/industrial_data_pretraining/emotion2vec/infer.sh b/examples/industrial_data_pretraining/emotion2vec/infer.sh
index 99600ca..9b98715 100644
--- a/examples/industrial_data_pretraining/emotion2vec/infer.sh
+++ b/examples/industrial_data_pretraining/emotion2vec/infer.sh
@@ -1,14 +1,10 @@
-# download model
-local_path_root=../modelscope_models
-mkdir -p ${local_path_root}
-local_path=${local_path_root}/emotion2vec_base
-git clone https://www.modelscope.cn/damo/emotion2vec_base.git ${local_path}
-#local_path=/Users/zhifu/Downloads/modelscope_models/emotion2vec_base
+model="damo/emotion2vec_base"
+model_revision="v2.0.0"
python funasr/bin/inference.py \
-+model="${local_path}" \
-+input="${local_path}/example/test.wav" \
++model=${model} \
++model_revision=${model_revision} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
-+debug=true
diff --git a/examples/industrial_data_pretraining/fsmn_vad/demo.py b/examples/industrial_data_pretraining/fsmn_vad/demo.py
index 6c112e2..2a157ee 100644
--- a/examples/industrial_data_pretraining/fsmn_vad/demo.py
+++ b/examples/industrial_data_pretraining/fsmn_vad/demo.py
@@ -5,7 +5,7 @@
from funasr import AutoModel
-model = AutoModel(model="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch")
+model = AutoModel(model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", model_revision="v2.0.0")
-res = model(input="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav")
+res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav")
print(res)
\ No newline at end of file
diff --git a/examples/industrial_data_pretraining/fsmn_vad/infer.sh b/examples/industrial_data_pretraining/fsmn_vad/infer.sh
index 94e1b3d..dedd14a 100644
--- a/examples/industrial_data_pretraining/fsmn_vad/infer.sh
+++ b/examples/industrial_data_pretraining/fsmn_vad/infer.sh
@@ -1,13 +1,11 @@
-# download model
-local_path_root=../modelscope_models
-mkdir -p ${local_path_root}
-local_path=${local_path_root}/speech_fsmn_vad_zh-cn-16k-common-pytorch
-git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git ${local_path}
+model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+model_revision="v2.0.0"
python funasr/bin/inference.py \
-+model="${local_path}" \
-+input="${local_path}/example/vad_example.wav" \
++model=${model} \
++model_revision=${model_revision} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
diff --git a/examples/industrial_data_pretraining/monotonic_aligner/demo.py b/examples/industrial_data_pretraining/monotonic_aligner/demo.py
index 149d488..f5df457 100644
--- a/examples/industrial_data_pretraining/monotonic_aligner/demo.py
+++ b/examples/industrial_data_pretraining/monotonic_aligner/demo.py
@@ -5,9 +5,9 @@
from funasr import AutoModel
-model = AutoModel(model="../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+model = AutoModel(model="damo/speech_timestamp_prediction-v1-16k-offline", model_revision="v2.0.0")
-res = model(input=("../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav",
+res = model(input=("https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
"娆㈣繋澶у鏉ュ埌榄旀惌绀惧尯杩涜浣撻獙"),
data_type=("sound", "text"),
batch_size=2,
diff --git a/examples/industrial_data_pretraining/monotonic_aligner/infer.sh b/examples/industrial_data_pretraining/monotonic_aligner/infer.sh
index 179e714..34fd1f9 100644
--- a/examples/industrial_data_pretraining/monotonic_aligner/infer.sh
+++ b/examples/industrial_data_pretraining/monotonic_aligner/infer.sh
@@ -1,14 +1,11 @@
-# download model
-local_path_root=../modelscope_models
-mkdir -p ${local_path_root}
-local_path=${local_path_root}/speech_timestamp_prediction-v1-16k-offline
- git clone https://www.modelscope.cn/damo/speech_timestamp_prediction-v1-16k-offline.git ${local_path}
-
+model="damo/speech_timestamp_prediction-v1-16k-offline"
+model_revision="v2.0.0"
python funasr/bin/inference.py \
-+model="${local_path}" \
-+input='["../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", "娆㈣繋澶у鏉ュ埌榄旀惌绀惧尯杩涜浣撻獙"]' \
++model=${model} \
++model_revision=${model_revision} \
++input='["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", "娆㈣繋澶у鏉ュ埌榄旀惌绀惧尯杩涜浣撻獙"]' \
+data_type='["sound", "text"]' \
+output_dir="../outputs/debug" \
+device="cpu" \
diff --git a/examples/industrial_data_pretraining/paraformer/demo.py b/examples/industrial_data_pretraining/paraformer/demo.py
index 119e14f..1f3b9a1 100644
--- a/examples/industrial_data_pretraining/paraformer/demo.py
+++ b/examples/industrial_data_pretraining/paraformer/demo.py
@@ -5,17 +5,17 @@
from funasr import AutoModel
-model = AutoModel(model="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+model = AutoModel(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revison="v2.0.0")
-res = model(input="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav")
+res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
print(res)
from funasr import AutoFrontend
-frontend = AutoFrontend(model="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+frontend = AutoFrontend(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.0")
-fbanks = frontend(input="../modelscope_models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav", batch_size=2)
+fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", batch_size=2)
for batch_idx, fbank_dict in enumerate(fbanks):
res = model(**fbank_dict)
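
The AutoFrontend loop above extracts fbank batches and decodes each one, but the per-batch output is discarded. A minimal sketch of accumulating the results (the results list is illustrative and not part of the patch):

results = []
for batch_idx, fbank_dict in enumerate(fbanks):
    res = model(**fbank_dict)   # decode one pre-computed fbank batch
    results.extend(res)
print(results)
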
diff --git a/examples/industrial_data_pretraining/paraformer/infer.sh b/examples/industrial_data_pretraining/paraformer/infer.sh
index a9bd8cd..9436628 100644
--- a/examples/industrial_data_pretraining/paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/paraformer/infer.sh
@@ -1,14 +1,11 @@
-# download model
-local_path_root=../modelscope_models
-mkdir -p ${local_path_root}
-local_path=${local_path_root}/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
-git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path}
-
+model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model_revision="v2.0.0"
python funasr/bin/inference.py \
-+model="${local_path}" \
-+input="${local_path}/example/asr_example.wav" \
++model=${model} \
++model_revision=${model_revision} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
diff --git a/examples/industrial_data_pretraining/seaco_paraformer/demo.py b/examples/industrial_data_pretraining/seaco_paraformer/demo.py
index 9aec94a..84be0d8 100644
--- a/examples/industrial_data_pretraining/seaco_paraformer/demo.py
+++ b/examples/industrial_data_pretraining/seaco_paraformer/demo.py
@@ -5,12 +5,14 @@
from funasr import AutoModel
-model = AutoModel(model="../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+model = AutoModel(model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+ model_revision="v2.0.0",
+ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+ vad_model_revision="v2.0.0",
+ punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+ punc_model_revision="v2.0.0",
)
-#vad_model="../modelscope_models/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-#punc_model="../modelscope_models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
-
-res = model(input="../modelscope_models/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav",
+res = model(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
            hotword='达摩院 磨搭')
print(res)
\ No newline at end of file
diff --git a/examples/industrial_data_pretraining/seaco_paraformer/infer.sh b/examples/industrial_data_pretraining/seaco_paraformer/infer.sh
index 121c610..e92d598 100644
--- a/examples/industrial_data_pretraining/seaco_paraformer/infer.sh
+++ b/examples/industrial_data_pretraining/seaco_paraformer/infer.sh
@@ -1,23 +1,19 @@
-# download model
-local_path_root=../modelscope_models
-mkdir -p ${local_path_root}
-
-local_path=${local_path_root}/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
-git clone https://www.modelscope.cn/damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path}
-
-#local_path_vad=${local_path_root}/speech_fsmn_vad_zh-cn-16k-common-pytorch
-#git clone https://www.modelscope.cn/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch.git ${local_path_vad}
-
-#local_path_punc=${local_path_root}/punc_ct-transformer_zh-cn-common-vocab272727-pytorch
-#git clone https://www.modelscope.cn/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch.git ${local_path_punc}
+model="damo/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model_revision="v2.0.0"
+vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+vad_model_revision="v2.0.0"
+punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+punc_model_revision="v2.0.0"
python funasr/bin/inference.py \
-+model="${local_path}" \
-+input="${local_path}/example/asr_example.wav" \
++model=${model} \
++model_revision=${model_revision} \
++vad_model=${vad_model} \
++vad_model_revision=${vad_model_revision} \
++punc_model=${punc_model} \
++punc_model_revision=${punc_model_revision} \
++input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" \
+output_dir="./outputs/debug" \
+device="cpu" \
+"hotword='杈炬懇闄� 榄旀惌'"
-
-#+vad_model="${local_path_vad}" \
-#+punc_model="${local_path_punc}" \
\ No newline at end of file
diff --git a/funasr/bin/inference.py b/funasr/bin/inference.py
index 5b58907..dedaf7d 100644
--- a/funasr/bin/inference.py
+++ b/funasr/bin/inference.py
@@ -20,6 +20,7 @@
from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils.vad_utils import slice_padding_audio_samples
from funasr.utils.timestamp_tools import time_stamp_sentence
+from funasr.download.file import download_from_url
def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
"""
@@ -35,7 +36,8 @@
filelist = [".scp", ".txt", ".json", ".jsonl"]
chars = string.ascii_letters + string.digits
-
+ if isinstance(data_in, str) and data_in.startswith('http'): # url
+ data_in = download_from_url(data_in)
if isinstance(data_in, str) and os.path.exists(data_in): # wav_path; filelist: wav.scp, file.jsonl;text.txt;
_, file_extension = os.path.splitext(data_in)
file_extension = file_extension.lower()
@@ -59,7 +61,7 @@
data_list = [data_in]
key_list = [key]
elif isinstance(data_in, (list, tuple)):
- if data_type is not None and isinstance(data_type, (list, tuple)):
+        if data_type is not None and isinstance(data_type, (list, tuple)): # multiple inputs
data_list_tmp = []
for data_in_i, data_type_i in zip(data_in, data_type):
key_list, data_list_i = prepare_data_iterator(data_in=data_in_i, data_type=data_type_i)
@@ -68,7 +70,7 @@
for item in zip(*data_list_tmp):
data_list.append(item)
else:
- # [audio sample point, fbank]
+ # [audio sample point, fbank, text]
data_list = data_in
key_list = ["rand_key_" + ''.join(random.choice(chars) for _ in range(13)) for _ in range(len(data_in))]
else: # raw text; audio sample point, fbank; bytes
@@ -198,13 +200,12 @@
kwargs = self.kwargs if kwargs is None else kwargs
kwargs.update(cfg)
model = self.model if model is None else model
-
- data_type = kwargs.get("data_type", "sound")
+
batch_size = kwargs.get("batch_size", 1)
# if kwargs.get("device", "cpu") == "cpu":
# batch_size = 1
- key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=data_type, key=key)
+ key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None), key=key)
speed_stats = {}
asr_result_list = []
@@ -268,8 +269,8 @@
batch_size = int(kwargs.get("batch_size_s", 300))*1000
batch_size_threshold_ms = int(kwargs.get("batch_size_threshold_s", 60))*1000
kwargs["batch_size"] = batch_size
- data_type = kwargs.get("data_type", "sound")
- key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=data_type)
+
+ key_list, data_list = prepare_data_iterator(input, input_len=input_len, data_type=kwargs.get("data_type", None))
results_ret_list = []
time_speech_total_all_samples = 0.0
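
The inference.py hunks above route http(s) inputs through the new download helper before the existing path and list handling. A simplified, standalone sketch of that dispatch order (the real prepare_data_iterator also handles .scp/.jsonl filelists, key lists and raw arrays):

import os
from funasr.download.file import download_from_url

def resolve_input(data_in):
    # Sketch only: mirrors the order added above, URL first, then local path.
    if isinstance(data_in, str) and data_in.startswith("http"):
        data_in = download_from_url(data_in)      # fetch to a temp file
    if isinstance(data_in, str) and os.path.exists(data_in):
        return data_in                            # wav path or filelist
    return data_in                                # raw text, samples, fbank, bytes, ...
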
diff --git a/funasr/download/download_from_hub.py b/funasr/download/download_from_hub.py
index abf3ba0..8a4044d 100644
--- a/funasr/download/download_from_hub.py
+++ b/funasr/download/download_from_hub.py
@@ -21,8 +21,6 @@
config = os.path.join(model_or_path, "config.yaml")
if os.path.exists(config) and os.path.exists(os.path.join(model_or_path, "model.pb")):
- # config = os.path.join(model_or_path, "config.yaml")
- # assert os.path.exists(config), "{} is not exist!".format(config)
cfg = OmegaConf.load(config)
kwargs = OmegaConf.merge(cfg, kwargs)
init_param = os.path.join(model_or_path, "model.pb")
@@ -42,10 +40,10 @@
assert os.path.exists(os.path.join(model_or_path, "configuration.json"))
with open(os.path.join(model_or_path, "configuration.json"), 'r', encoding='utf-8') as f:
conf_json = json.load(f)
- config = os.path.join(model_or_path, conf_json["model"]["model_config"])
+ config = os.path.join(model_or_path, conf_json["model_config"])
cfg = OmegaConf.load(config)
kwargs = OmegaConf.merge(cfg, kwargs)
- init_param = os.path.join(model_or_path, conf_json["model"]["model_name"])
+ init_param = os.path.join(model_or_path, conf_json["model_file"])
kwargs["init_param"] = init_param
kwargs["model"] = cfg["model"]
return OmegaConf.to_container(kwargs, resolve=True)
diff --git a/funasr/download/file.py b/funasr/download/file.py
index d93f24c..da4958a 100644
--- a/funasr/download/file.py
+++ b/funasr/download/file.py
@@ -8,7 +8,23 @@
from typing import Generator, Union
import requests
+from urllib.parse import urlparse
+def download_from_url(url):
+ result = urlparse(url)
+ file_path = None
+ if result.scheme is not None and len(result.scheme) > 0:
+ storage = HTTPStorage()
+ # bytes
+ data = storage.read(url)
+ work_dir = tempfile.TemporaryDirectory().name
+ if not os.path.exists(work_dir):
+ os.makedirs(work_dir)
+ file_path = os.path.join(work_dir, os.path.basename(url))
+ with open(file_path, 'wb') as fb:
+ fb.write(data)
+ assert file_path is not None, f"failed to download: {url}"
+ return file_path
class Storage(metaclass=ABCMeta):
"""Abstract class of storage.
diff --git a/funasr/models/ct_transformer/model.py b/funasr/models/ct_transformer/model.py
index 24a6aea..e32aa25 100644
--- a/funasr/models/ct_transformer/model.py
+++ b/funasr/models/ct_transformer/model.py
@@ -11,6 +11,7 @@
import torch
import torch.nn as nn
from funasr.models.ct_transformer.utils import split_to_mini_sentence, split_words
+from funasr.utils.load_utils import load_audio_text_image_video
from funasr.register import tables
@@ -219,10 +220,10 @@
**kwargs,
):
assert len(data_in) == 1
-
+        text = load_audio_text_image_video(data_in, data_type=kwargs.get("data_type", "text"))[0]
vad_indexes = kwargs.get("vad_indexes", None)
- text = data_in[0]
- text_lengths = data_lengths[0] if data_lengths is not None else None
+ # text = data_in[0]
+ # text_lengths = data_lengths[0] if data_lengths is not None else None
split_size = kwargs.get("split_size", 20)
tokens = split_words(text)
diff --git a/funasr/models/monotonic_aligner/model.py b/funasr/models/monotonic_aligner/model.py
index a0d745f..584b692 100644
--- a/funasr/models/monotonic_aligner/model.py
+++ b/funasr/models/monotonic_aligner/model.py
@@ -188,9 +188,12 @@
text_postprocessed, time_stamp_postprocessed, _ = postprocess_utils.sentence_postprocess(token, timestamp)
result_i = {"key": key[i], "text": text_postprocessed,
"timestamp": time_stamp_postprocessed,
- }
- # ibest_writer["token"][key[i]] = " ".join(token)
- ibest_writer["timestamp_list"][key[i]] = time_stamp_postprocessed
- ibest_writer["timestamp_str"][key[i]] = timestamp_str
+ }
results.append(result_i)
+
+ if ibest_writer:
+ # ibest_writer["token"][key[i]] = " ".join(token)
+ ibest_writer["timestamp_list"][key[i]] = time_stamp_postprocessed
+ ibest_writer["timestamp_str"][key[i]] = timestamp_str
+
return results, meta_data
\ No newline at end of file
diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py
index c5c3ffc..c82987f 100644
--- a/funasr/utils/load_utils.py
+++ b/funasr/utils/load_utils.py
@@ -10,29 +10,13 @@
import logging
from torch.nn.utils.rnn import pad_sequence
try:
- from urllib.parse import urlparse
- from funasr.download.file import HTTPStorage
- import tempfile
+ from funasr.download.file import download_from_url
except:
print("urllib is not installed, if you infer from url, please install it first.")
-# def load_audio(data_or_path_or_list, fs: int=16000, audio_fs: int=16000):
-#
-# if isinstance(data_or_path_or_list, (list, tuple)):
-# return [load_audio(audio, fs=fs, audio_fs=audio_fs) for audio in data_or_path_or_list]
-#
-# if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list):
-# data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list)
-# data_or_path_or_list = data_or_path_or_list[0, :]
-# elif isinstance(data_or_path_or_list, np.ndarray): # audio sample point
-# data_or_path_or_list = np.squeeze(data_or_path_or_list) #[n_samples,]
-#
-# if audio_fs != fs:
-# resampler = torchaudio.transforms.Resample(audio_fs, fs)
-# data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :]
-# return data_or_path_or_list
-def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type=None, tokenizer=None):
+
+def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type="sound", tokenizer=None):
if isinstance(data_or_path_or_list, (list, tuple)):
if data_type is not None and isinstance(data_type, (list, tuple)):
@@ -47,16 +31,22 @@
return data_or_path_or_list_ret
else:
- return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs) for audio in data_or_path_or_list]
+ return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type) for audio in data_or_path_or_list]
if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'):
data_or_path_or_list = download_from_url(data_or_path_or_list)
if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list):
- data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list)
- data_or_path_or_list = data_or_path_or_list[0, :]
+ if data_type is None or data_type == "sound":
+ data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list)
+ data_or_path_or_list = data_or_path_or_list[0, :]
+ # elif data_type == "text" and tokenizer is not None:
+ # data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
+ elif isinstance(data_or_path_or_list, str) and data_type == "text" and tokenizer is not None:
+ data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
elif isinstance(data_or_path_or_list, np.ndarray): # audio sample point
data_or_path_or_list = np.squeeze(data_or_path_or_list) # [n_samples,]
- elif isinstance(data_or_path_or_list, str) and data_type is not None and data_type == "text" and tokenizer is not None:
- data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
+ else:
+ pass
+ # print(f"unsupport data type: {data_or_path_or_list}, return raw data")
if audio_fs != fs and data_type != "text":
resampler = torchaudio.transforms.Resample(audio_fs, fs)
@@ -107,19 +97,3 @@
data_len = torch.tensor([data_len])
return data.to(torch.float32), data_len.to(torch.int32)
-def download_from_url(url):
-
- result = urlparse(url)
- file_path = None
- if result.scheme is not None and len(result.scheme) > 0:
- storage = HTTPStorage()
- # bytes
- data = storage.read(url)
- work_dir = tempfile.TemporaryDirectory().name
- if not os.path.exists(work_dir):
- os.makedirs(work_dir)
- file_path = os.path.join(work_dir, os.path.basename(url))
- with open(file_path, 'wb') as fb:
- fb.write(data)
- assert file_path is not None, f"failed to download: {url}"
- return file_path
\ No newline at end of file
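
load_audio_text_image_video now defaults data_type to "sound" and forwards it when recursing over list inputs, so text elements are no longer decoded as audio. A hedged sketch of the two paths; without a tokenizer the text element is returned unchanged (models pass their own tokenizer when they need token ids):

from funasr.utils.load_utils import load_audio_text_image_video

# Single audio input (default data_type="sound"): downloaded and decoded with torchaudio.
waveform = load_audio_text_image_video(
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")

# Paired (audio, text) input, as in the monotonic_aligner example.
pair = load_audio_text_image_video(
    ["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
     "欢迎大家来到魔搭社区进行体验"],
    data_type=["sound", "text"])
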
--
Gitblit v1.9.1