From 8ab670afc3b7a9ae4da4043e21e89024321d5b5b Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: 星期二, 05 三月 2024 11:40:26 +0800
Subject: [PATCH] Dev gzf (#1424)

---
 funasr/models/whisper/model.py                                   |   15 +++++--
 examples/industrial_data_pretraining/whisper/demo.py             |    2 
 funasr/download/name_maps_from_hub.py                            |    3 +
 README_zh.md                                                     |   26 +++++++------
 examples/industrial_data_pretraining/whisper/demo_from_openai.py |    4 +-
 examples/industrial_data_pretraining/whisper/infer_from_local.sh |   12 ++++-
 README.md                                                        |   26 +++++++------
 7 files changed, 53 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index d436d5e..507306f 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@
 
 <a name="whats-new"></a>
 ## What's new:
+- 2024/03/05：Added support for the Whisper-large-v3 model, a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. It can be downloaded from [modelscope](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary) and [openai](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining/whisper).
 - 2024/03/03: Offline File Transcription Service 4.4, Offline File Transcription Service of English 1.5，Real-time Transcription Service 1.9 released，Docker image supports ARM64 platform，([docs](runtime/readme.md))
 - 2024/01/30锛歠unasr-1.0 has been released ([docs](https://github.com/alibaba-damo-academy/FunASR/discussions/1319))
 - 2024/01/30锛歟motion recognition models are new supported. [model link](https://www.modelscope.cn/models/iic/emotion2vec_base_finetuned/summary), modified from [repo](https://github.com/ddlBoJack/emotion2vec).
@@ -67,20 +68,21 @@
 ## Model Zoo
 FunASR has open-sourced a large number of pre-trained models on industrial data. You are free to use, copy, modify, and share FunASR models under the [Model License Agreement](./MODEL_LICENSE). Below are some representative models, for more models please refer to the [Model Zoo]().
 
-(Note: 猸� represents the ModelScope model zoo link, 馃 represents the Huggingface model zoo link)
+(Note: ⭐ represents the ModelScope model zoo, 🤗 represents the Huggingface model zoo, 🍎 represents the OpenAI model zoo)
 
 
-|                                                                                                         Model Name                                                                                                         |                    Task Details                    |          Training Data           | Parameters |
-|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------:|:--------------------------------:|:----------:|
-|          paraformer-zh <br> ([猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [馃](https://huggingface.co/funasr/paraformer-tp) )           | speech recognition, with timestamps, non-streaming |      60000 hours, Mandarin       |    220M    |
-| <nobr>paraformer-zh-streaming <br> ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [馃](https://huggingface.co/funasr/paraformer-zh-streaming) )</nobr> |           speech recognition, streaming            |      60000 hours, Mandarin       |    220M    |
-|               paraformer-en <br> ( [猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [馃](https://huggingface.co/funasr/paraformer-en) )                | speech recognition, with timestamps, non-streaming |       50000 hours, English       |    220M    |
-|                            conformer-en <br> ( [猸怾(https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [馃](https://huggingface.co/funasr/conformer-en) )                             |         speech recognition, non-streaming          |       50000 hours, English       |    220M    |
-|                               ct-punc <br> ( [猸怾(https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [馃](https://huggingface.co/funasr/ct-punc) )                               |              punctuation restoration               |    100M, Mandarin and English    |    1.1G    | 
-|                                   fsmn-vad <br> ( [猸怾(https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [馃](https://huggingface.co/funasr/fsmn-vad) )                                   |              voice activity detection              | 5000 hours, Mandarin and English |    0.4M    | 
-|                                     fa-zh <br> ( [猸怾(https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [馃](https://huggingface.co/funasr/fa-zh) )                                     |                timestamp prediction                |       5000 hours, Mandarin       |    38M     | 
-|                                       cam++ <br> ( [猸怾(https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [馃](https://huggingface.co/funasr/campplus) )                                        |        speaker verification/diarization            |            5000 hours            |    7.2M    | 
-|                                                 whisper-large-v2 <br> ([猸怾(https://www.modelscope.cn/models/iic/speech_whisper-large_asr_multilingual/summary)  [馃]() )                                                   | speech recognition, with timestamps, non-streaming |          multilingual            |     1G     |
+|                                                                                                         Model Name                                                                                                         |                     Task Details                      |          Training Data           | Parameters |
+|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------:|:--------------------------------:|:----------:|
+|          paraformer-zh <br> ([猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [馃](https://huggingface.co/funasr/paraformer-tp) )           |  speech recognition, with timestamps, non-streaming   |      60000 hours, Mandarin       |    220M    |
+| <nobr>paraformer-zh-streaming <br> ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [馃](https://huggingface.co/funasr/paraformer-zh-streaming) )</nobr> |             speech recognition, streaming             |      60000 hours, Mandarin       |    220M    |
+|               paraformer-en <br> ( [猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [馃](https://huggingface.co/funasr/paraformer-en) )                | speech recognition, without timestamps, non-streaming |       50000 hours, English       |    220M    |
+|                            conformer-en <br> ( [猸怾(https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [馃](https://huggingface.co/funasr/conformer-en) )                             |           speech recognition, non-streaming           |       50000 hours, English       |    220M    |
+|                               ct-punc <br> ( [猸怾(https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [馃](https://huggingface.co/funasr/ct-punc) )                               |                punctuation restoration                |    100M, Mandarin and English    |    1.1G    | 
+|                                   fsmn-vad <br> ( [猸怾(https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [馃](https://huggingface.co/funasr/fsmn-vad) )                                   |               voice activity detection                | 5000 hours, Mandarin and English |    0.4M    | 
+|                                     fa-zh <br> ( [猸怾(https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [馃](https://huggingface.co/funasr/fa-zh) )                                     |                 timestamp prediction                  |       5000 hours, Mandarin       |    38M     | 
+|                                       cam++ <br> ( [猸怾(https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [馃](https://huggingface.co/funasr/campplus) )                                        |           speaker verification/diarization            |            5000 hours            |    7.2M    | 
+|                                                  Whisper-large-v2 <br> ([猸怾(https://www.modelscope.cn/models/iic/speech_whisper-large_asr_multilingual/summary)  [馃崁](https://github.com/openai/whisper) )                                                  |  speech recognition, with timestamps, non-streaming   |          multilingual            |    1.5G    |
+|                                                Whisper-large-v3 <br> ([猸怾(https://www.modelscope.cn/models/iic/Whisper-large-v3/summary)  [馃崁](https://github.com/openai/whisper) )                                                 |  speech recognition, with timestamps, non-streaming   |          multilingual            |    1.5G    |
 
 
 
diff --git a/README_zh.md b/README_zh.md
index 8a34c82..a6e1ac7 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -29,6 +29,7 @@
 
 <a name="鏈�鏂板姩鎬�"></a>
 ## 鏈�鏂板姩鎬�
+- 2024/03/05：新增加Whisper-large-v3模型支持，多语言语音识别/翻译/语种识别，支持从[modelscope](https://www.modelscope.cn/models/iic/Whisper-large-v3/summary)仓库下载，也支持从[openai](https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining/whisper)仓库下载模型。
 - 2024/03/03: 涓枃绂荤嚎鏂囦欢杞啓鏈嶅姟 4.4銆佽嫳鏂囩绾挎枃浠惰浆鍐欐湇鍔� 1.5銆佷腑鏂囧疄鏃惰闊冲惉鍐欐湇鍔� 1.9 鍙戝竷锛宒ocker闀滃儚鏀寔arm64骞冲彴锛涜缁嗕俊鎭弬闃�([閮ㄧ讲鏂囨。](runtime/readme_cn.md))
 - 2024/01/30锛歠unasr-1.0鍙戝竷锛屾洿鏂拌鏄嶽鏂囨。](https://github.com/alibaba-damo-academy/FunASR/discussions/1319)
 - 2024/01/30锛氭柊澧炲姞鎯呮劅璇嗗埆 [妯″瀷閾炬帴](https://www.modelscope.cn/models/iic/emotion2vec_base_finetuned/summary)锛屽師濮嬫ā鍨� [repo](https://github.com/ddlBoJack/emotion2vec).
@@ -69,20 +70,21 @@
 
 FunASR寮�婧愪簡澶ч噺鍦ㄥ伐涓氭暟鎹笂棰勮缁冩ā鍨嬶紝鎮ㄥ彲浠ュ湪[妯″瀷璁稿彲鍗忚](./MODEL_LICENSE)涓嬭嚜鐢变娇鐢ㄣ�佸鍒躲�佷慨鏀瑰拰鍒嗕韩FunASR妯″瀷锛屼笅闈㈠垪涓句唬琛ㄦ�х殑妯″瀷锛屾洿澶氭ā鍨嬭鍙傝�僛妯″瀷浠撳簱]()銆�
 
-锛堟敞锛氣瓙 琛ㄧずModelScope妯″瀷浠撳簱閾炬帴锛岎煠� 琛ㄧずHuggingface妯″瀷浠撳簱閾炬帴锛�
+（注：⭐ 表示ModelScope模型仓库，🤗 表示Huggingface模型仓库，🍎 表示OpenAI模型仓库）
 
 
-|                                         妯″瀷鍚嶅瓧                                                                                                                 |      浠诲姟璇︽儏       |     璁粌鏁版嵁     | 鍙傛暟閲�  |
-|:------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------:|:------------:|:----:|
-| paraformer-zh <br> ([猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [馃](https://huggingface.co/funasr/paraformer-tp) ) | 璇煶璇嗗埆锛屽甫鏃堕棿鎴宠緭鍑猴紝闈炲疄鏃� |  60000灏忔椂锛屼腑鏂�  | 220M |
-|   paraformer-zh-streaming <br> ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [馃](https://huggingface.co/funasr/paraformer-zh-streaming) )   |     璇煶璇嗗埆锛屽疄鏃�     |  60000灏忔椂锛屼腑鏂�  | 220M |
-|      paraformer-en <br> ( [猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [馃](https://huggingface.co/funasr/paraformer-en) )      |    璇煶璇嗗埆锛岄潪瀹炴椂     |  50000灏忔椂锛岃嫳鏂�  | 220M |
-|                  conformer-en <br> ( [猸怾(https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [馃](https://huggingface.co/funasr/conformer-en) )                   |    璇煶璇嗗埆锛岄潪瀹炴椂     |  50000灏忔椂锛岃嫳鏂�  | 220M |
-|                  ct-punc <br> ( [猸怾(https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [馃](https://huggingface.co/funasr/ct-punc) )                   |      鏍囩偣鎭㈠       |  100M锛屼腑鏂囦笌鑻辨枃  | 1.1G | 
-|                       fsmn-vad <br> ( [猸怾(https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [馃](https://huggingface.co/funasr/fsmn-vad) )                       |    璇煶绔偣妫�娴嬶紝瀹炴椂    | 5000灏忔椂锛屼腑鏂囦笌鑻辨枃 | 0.4M | 
-|                       fa-zh <br> ( [猸怾(https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [馃](https://huggingface.co/funasr/fa-zh) )                        |    瀛楃骇鍒椂闂存埑棰勬祴     |  50000灏忔椂锛屼腑鏂�  | 38M  |
-|                           cam++ <br> ( [猸怾(https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [馃](https://huggingface.co/funasr/campplus) )                            |    璇磋瘽浜虹‘璁�/鍒嗗壊     |    5000灏忔椂    | 7.2M | 
-| whisper-large-v2 <br> ([猸怾(https://www.modelscope.cn/models/iic/speech_whisper-large_asr_multilingual/summary)  [馃]() ) | 璇煶璇嗗埆锛屽甫鏃堕棿鎴宠緭鍑猴紝闈炲疄鏃� |     澶氳瑷�      |  1G  |
+|                                                                                                     妯″瀷鍚嶅瓧                                                                                                      |      浠诲姟璇︽儏       |     璁粌鏁版嵁     | 鍙傛暟閲�  | 
+|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------:|:------------:|:----:|
+|    paraformer-zh <br> ([猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)  [馃](https://huggingface.co/funasr/paraformer-tp) )    | 璇煶璇嗗埆锛屽甫鏃堕棿鎴宠緭鍑猴紝闈炲疄鏃� |  60000灏忔椂锛屼腑鏂�  | 220M |
+| paraformer-zh-streaming <br> ( [猸怾(https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) [馃](https://huggingface.co/funasr/paraformer-zh-streaming) ) |     璇煶璇嗗埆锛屽疄鏃�     |  60000灏忔椂锛屼腑鏂�  | 220M |
+|         paraformer-en <br> ( [猸怾(https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary) [馃](https://huggingface.co/funasr/paraformer-en) )         |    璇煶璇嗗埆锛岄潪瀹炴椂     |  50000灏忔椂锛岃嫳鏂�  | 220M |
+|                      conformer-en <br> ( [猸怾(https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary) [馃](https://huggingface.co/funasr/conformer-en) )                      |    璇煶璇嗗埆锛岄潪瀹炴椂     |  50000灏忔椂锛岃嫳鏂�  | 220M |
+|                        ct-punc <br> ( [猸怾(https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary) [馃](https://huggingface.co/funasr/ct-punc) )                         |      鏍囩偣鎭㈠       |  100M锛屼腑鏂囦笌鑻辨枃  | 1.1G | 
+|                            fsmn-vad <br> ( [猸怾(https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) [馃](https://huggingface.co/funasr/fsmn-vad) )                             |    璇煶绔偣妫�娴嬶紝瀹炴椂    | 5000灏忔椂锛屼腑鏂囦笌鑻辨枃 | 0.4M | 
+|                              fa-zh <br> ( [猸怾(https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) [馃](https://huggingface.co/funasr/fa-zh) )                               |    瀛楃骇鍒椂闂存埑棰勬祴     |  50000灏忔椂锛屼腑鏂�  | 38M  |
+|                                 cam++ <br> ( [猸怾(https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) [馃](https://huggingface.co/funasr/campplus) )                                 |    璇磋瘽浜虹‘璁�/鍒嗗壊     |    5000灏忔椂    | 7.2M | 
+|                           Whisper-large-v2 <br> ([猸怾(https://www.modelscope.cn/models/iic/speech_whisper-large_asr_multilingual/summary)  [馃崁](https://github.com/openai/whisper) )                           | 璇煶璇嗗埆锛屽甫鏃堕棿鎴宠緭鍑猴紝闈炲疄鏃� |     澶氳瑷�      |  1G  |
+|                         Whisper-large-v3 <br> ([猸怾(https://www.modelscope.cn/models/iic/Whisper-large-v3/summary)  [馃崁](https://github.com/openai/whisper) )                          | 璇煶璇嗗埆锛屽甫鏃堕棿鎴宠緭鍑猴紝闈炲疄鏃� |     澶氳瑷�      |  1G  |
 
 
 <a name="蹇�熷紑濮�"></a>
diff --git a/examples/industrial_data_pretraining/whisper/demo.py b/examples/industrial_data_pretraining/whisper/demo.py
index f010ea2..db8d92c 100644
--- a/examples/industrial_data_pretraining/whisper/demo.py
+++ b/examples/industrial_data_pretraining/whisper/demo.py
@@ -5,7 +5,7 @@
 
 from funasr import AutoModel
 
-model = AutoModel(model="iic/speech_whisper-large_asr_multilingual",
+model = AutoModel(model="iic/Whisper-large-v3",
                   model_revision="v2.0.4",
                   )
 
diff --git a/examples/industrial_data_pretraining/whisper/demo_from_openai.py b/examples/industrial_data_pretraining/whisper/demo_from_openai.py
index 0b88a95..046e9c6 100644
--- a/examples/industrial_data_pretraining/whisper/demo_from_openai.py
+++ b/examples/industrial_data_pretraining/whisper/demo_from_openai.py
@@ -7,8 +7,8 @@
 
 # model = AutoModel(model="Whisper-small", hub="openai")
 # model = AutoModel(model="Whisper-medium", hub="openai")
-model = AutoModel(model="Whisper-large-v2", hub="openai")
-# model = AutoModel(model="Whisper-large-v3", hub="openai")
+# model = AutoModel(model="Whisper-large-v2", hub="openai")
+model = AutoModel(model="Whisper-large-v3", hub="openai")
 
 res = model.generate(
 	language=None,
diff --git a/examples/industrial_data_pretraining/whisper/infer_from_local.sh b/examples/industrial_data_pretraining/whisper/infer_from_local.sh
index 14ccbb6..885dfc6 100644
--- a/examples/industrial_data_pretraining/whisper/infer_from_local.sh
+++ b/examples/industrial_data_pretraining/whisper/infer_from_local.sh
@@ -13,13 +13,19 @@
 # download model
 local_path_root=${workspace}/modelscope_models
 mkdir -p ${local_path_root}
-local_path=${local_path_root}/speech_whisper-large_asr_multilingual
-git clone https://www.modelscope.cn/iic/speech_whisper-large_asr_multilingual.git ${local_path}
+#Whisper-large-v2
+#local_path=${local_path_root}/speech_whisper-large_asr_multilingual
+#git clone https://www.modelscope.cn/iic/speech_whisper-large_asr_multilingual.git ${local_path}
+#init_param="${local_path}/large-v2.pt"
+#Whisper-large-v3
+local_path=${local_path_root}/Whisper-large-v3
+git clone https://www.modelscope.cn/iic/Whisper-large-v3.git ${local_path}
+init_param="${local_path}/large-v3.pt"
 
 device="cuda:0" # "cuda:0" for gpu0, "cuda:1" for gpu1, "cpu"
 
 config="config.yaml"
-init_param="${local_path}/large-v2.pt"
+
 
 python -m funasr.bin.inference \
 --config-path "${local_path}" \
diff --git a/funasr/download/name_maps_from_hub.py b/funasr/download/name_maps_from_hub.py
index e1bc295..07cf6a0 100644
--- a/funasr/download/name_maps_from_hub.py
+++ b/funasr/download/name_maps_from_hub.py
@@ -8,7 +8,8 @@
     "ct-punc-c": "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
     "fa-zh": "damo/speech_timestamp_prediction-v1-16k-offline",
     "cam++": "damo/speech_campplus_sv_zh-cn_16k-common",
-    "whisper-large-v2": "iic/speech_whisper-large_asr_multilingual",
+    "Whisper-large-v2": "iic/speech_whisper-large_asr_multilingual",
+    "Whisper-large-v3": "iic/Whisper-large-v3",
 }
 
 name_maps_hf = {
diff --git a/funasr/models/whisper/model.py b/funasr/models/whisper/model.py
index 1eac2ff..73d70d7 100644
--- a/funasr/models/whisper/model.py
+++ b/funasr/models/whisper/model.py
@@ -24,7 +24,7 @@
 @tables.register("model_classes", "Whisper-large-v1")
 @tables.register("model_classes", "Whisper-large-v2")
 @tables.register("model_classes", "Whisper-large-v3")
-@tables.register("model_classes", "Whisper-WhisperWarp")
+@tables.register("model_classes", "WhisperWarp")
 class WhisperWarp(nn.Module):
     def __init__(self, *args, **kwargs):
         super().__init__()
@@ -35,8 +35,8 @@
                 model_or_path = model_or_path.replace("Whisper-", "")
             model = whisper.load_model(model_or_path)
         else:
-            whisper_dims = kwargs.get("whisper_dims", {})
-            dims = whisper.model.ModelDimensions(**whisper_dims)
+            dims = kwargs.get("dims", {})
+            dims = whisper.model.ModelDimensions(**dims)
             model = whisper.model.Whisper(dims=dims)
         
         self.model = model
@@ -55,6 +55,13 @@
         if kwargs.get("batch_size", 1) > 1:
             raise NotImplementedError("batch decoding is not implemented")
 
+        if frontend is None and not hasattr(self, "frontend"):
+            frontend_class = tables.frontend_classes.get("WhisperFrontend")
+            frontend = frontend_class(n_mels=self.model.dims.n_mels, do_pad_trim=kwargs.get("do_pad_trim", True))
+            self.frontend = frontend
+        else:
+            frontend = frontend if frontend is not None else self.frontend
+
         meta_data = {}
         if isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank":  # fbank
             speech, speech_lengths = data_in, data_lengths
@@ -65,7 +72,7 @@
         else:
             # extract fbank feats
             time1 = time.perf_counter()
-            audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000),
+            audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs if hasattr(frontend, "fs") else 16000, audio_fs=kwargs.get("fs", 16000),
                                                             data_type=kwargs.get("data_type", "sound"),
                                                             tokenizer=tokenizer)
             time2 = time.perf_counter()

--
Gitblit v1.9.1