From 4f98546f3693482f8f34aa5f11ced31381c58724 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期五, 05 一月 2024 16:55:07 +0800
Subject: [PATCH] load_audio_text_image_video
---
funasr/bin/inference.py | 6 +++---
funasr/models/paraformer/model.py | 4 ++--
funasr/models/seaco_paraformer/model.py | 4 ++--
funasr/models/fsmn_vad/model.py | 4 ++--
funasr/models/contextual_paraformer/model.py | 4 ++--
funasr/models/paraformer_streaming/model.py | 6 +++---
funasr/models/bicif_paraformer/model.py | 4 ++--
funasr/models/transducer/model.py | 4 ++--
funasr/utils/load_utils.py | 6 +++---
funasr/models/monotonic_aligner/model.py | 4 ++--
funasr/models/transformer/model.py | 4 ++--
funasr/datasets/audio_datasets/datasets.py | 2 +-
12 files changed, 26 insertions(+), 26 deletions(-)
diff --git a/funasr/bin/inference.py b/funasr/bin/inference.py
index be7cf88..1676c30 100644
--- a/funasr/bin/inference.py
+++ b/funasr/bin/inference.py
@@ -17,7 +17,7 @@
import string
from funasr.register import tables
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils.vad_utils import slice_padding_audio_samples
from funasr.utils.timestamp_tools import time_stamp_sentence
@@ -278,7 +278,7 @@
key = res[i]["key"]
vadsegments = res[i]["value"]
input_i = data_list[i]
- speech = load_audio_and_text_image_video(input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000))
+ speech = load_audio_text_image_video(input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000))
speech_lengths = len(speech)
n = len(vadsegments)
data_with_index = [(vadsegments[i], i) for i in range(n)]
@@ -417,7 +417,7 @@
# extract fbank feats
time1 = time.perf_counter()
- audio_sample_list = load_audio_and_text_image_video(data_batch, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
+ audio_sample_list = load_audio_text_image_video(data_batch, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
diff --git a/funasr/datasets/audio_datasets/datasets.py b/funasr/datasets/audio_datasets/datasets.py
index 0139c93..ff82856 100644
--- a/funasr/datasets/audio_datasets/datasets.py
+++ b/funasr/datasets/audio_datasets/datasets.py
@@ -8,7 +8,7 @@
import time
import logging
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.register import tables
@tables.register("dataset_classes", "AudioDataset")
diff --git a/funasr/models/bicif_paraformer/model.py b/funasr/models/bicif_paraformer/model.py
index aced088..aea0597 100644
--- a/funasr/models/bicif_paraformer/model.py
+++ b/funasr/models/bicif_paraformer/model.py
@@ -23,7 +23,7 @@
from funasr.models.paraformer.search import Hypothesis
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
@@ -243,7 +243,7 @@
else:
# extract fbank feats
time1 = time.perf_counter()
- audio_sample_list = load_audio_and_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
+ audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
diff --git a/funasr/models/contextual_paraformer/model.py b/funasr/models/contextual_paraformer/model.py
index c277ffc..6fdf2dc 100644
--- a/funasr/models/contextual_paraformer/model.py
+++ b/funasr/models/contextual_paraformer/model.py
@@ -46,7 +46,7 @@
@contextmanager
def autocast(enabled=True):
yield
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
@@ -337,7 +337,7 @@
# extract fbank feats
time1 = time.perf_counter()
- audio_sample_list = load_audio_and_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
+ audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
diff --git a/funasr/models/fsmn_vad/model.py b/funasr/models/fsmn_vad/model.py
index ee19558..f6e0488 100644
--- a/funasr/models/fsmn_vad/model.py
+++ b/funasr/models/fsmn_vad/model.py
@@ -9,7 +9,7 @@
from typing import Optional
import time
from funasr.register import tables
-from funasr.utils.load_utils import load_audio_and_text_image_video,extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video,extract_fbank
from funasr.utils.datadir_writer import DatadirWriter
from torch.nn.utils.rnn import pad_sequence
@@ -544,7 +544,7 @@
else:
# extract fbank feats
time1 = time.perf_counter()
- audio_sample_list = load_audio_and_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
+ audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
diff --git a/funasr/models/monotonic_aligner/model.py b/funasr/models/monotonic_aligner/model.py
index 368d1a4..a0d745f 100644
--- a/funasr/models/monotonic_aligner/model.py
+++ b/funasr/models/monotonic_aligner/model.py
@@ -13,7 +13,7 @@
from funasr.utils.datadir_writer import DatadirWriter
from funasr.register import tables
from funasr.models.ctc.ctc import CTC
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
@tables.register("model_classes", "monotonicaligner")
@@ -154,7 +154,7 @@
meta_data = {}
# extract fbank feats
time1 = time.perf_counter()
- audio_list, text_token_int_list = load_audio_and_text_image_video(data_in,
+ audio_list, text_token_int_list = load_audio_text_image_video(data_in,
fs=frontend.fs,
audio_fs=kwargs.get("fs", 16000),
data_type=kwargs.get("data_type", "sound"),
diff --git a/funasr/models/paraformer/model.py b/funasr/models/paraformer/model.py
index 84571a4..9ee4dfc 100644
--- a/funasr/models/paraformer/model.py
+++ b/funasr/models/paraformer/model.py
@@ -22,7 +22,7 @@
from torch.cuda.amp import autocast
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
from funasr.register import tables
@@ -466,7 +466,7 @@
else:
# extract fbank feats
time1 = time.perf_counter()
- audio_sample_list = load_audio_and_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer)
+ audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer)
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend)
diff --git a/funasr/models/paraformer_streaming/model.py b/funasr/models/paraformer_streaming/model.py
index a57c927..e57bc34 100644
--- a/funasr/models/paraformer_streaming/model.py
+++ b/funasr/models/paraformer_streaming/model.py
@@ -40,7 +40,7 @@
@contextmanager
def autocast(enabled=True):
yield
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
@@ -483,7 +483,7 @@
meta_data = {}
# extract fbank feats
time1 = time.perf_counter()
- audio_sample_list = load_audio_and_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
+ audio_sample_list = load_audio_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=self.frontend)
@@ -761,7 +761,7 @@
meta_data = {}
# extract fbank feats
time1 = time.perf_counter()
- audio_sample_list = load_audio_and_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
+ audio_sample_list = load_audio_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
diff --git a/funasr/models/seaco_paraformer/model.py b/funasr/models/seaco_paraformer/model.py
index 2de125a..070b622 100644
--- a/funasr/models/seaco_paraformer/model.py
+++ b/funasr/models/seaco_paraformer/model.py
@@ -35,7 +35,7 @@
@contextmanager
def autocast(enabled=True):
yield
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
@@ -327,7 +327,7 @@
# extract fbank feats
time1 = time.perf_counter()
- audio_sample_list = load_audio_and_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
+ audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
diff --git a/funasr/models/transducer/model.py b/funasr/models/transducer/model.py
index 9d9ae4b..1b33b6c 100644
--- a/funasr/models/transducer/model.py
+++ b/funasr/models/transducer/model.py
@@ -45,7 +45,7 @@
@contextmanager
def autocast(enabled=True):
yield
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
from funasr.models.transformer.utils.nets_utils import get_transducer_task_io
@@ -517,7 +517,7 @@
meta_data = {}
# extract fbank feats
time1 = time.perf_counter()
- audio_sample_list = load_audio_and_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
+ audio_sample_list = load_audio_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=self.frontend)
diff --git a/funasr/models/transformer/model.py b/funasr/models/transformer/model.py
index f09f460..4e91751 100644
--- a/funasr/models/transformer/model.py
+++ b/funasr/models/transformer/model.py
@@ -12,7 +12,7 @@
from funasr.metrics.compute_acc import th_accuracy
# from funasr.models.e2e_asr_common import ErrorCalculator
from funasr.train_utils.device_funcs import force_gatherable
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
from funasr.register import tables
@@ -392,7 +392,7 @@
meta_data = {}
# extract fbank feats
time1 = time.perf_counter()
- audio_sample_list = load_audio_and_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
+ audio_sample_list = load_audio_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=self.frontend)
diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py
index 7f1b850..637e1d2 100644
--- a/funasr/utils/load_utils.py
+++ b/funasr/utils/load_utils.py
@@ -27,7 +27,7 @@
# return audio_or_path_or_list
-def load_audio_and_text_image_video(audio_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type=None, tokenizer=None):
+def load_audio_text_image_video(audio_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type=None, tokenizer=None):
if isinstance(audio_or_path_or_list, (list, tuple)):
if data_type is not None and isinstance(data_type, (list, tuple)):
@@ -37,12 +37,12 @@
for j, (data_type_j, audio_or_path_or_list_j) in enumerate(zip(data_type_i, audio_or_path_or_list_i)):
- audio_or_path_or_list_j = load_audio_and_text_image_video(audio_or_path_or_list_j, fs=fs, audio_fs=audio_fs, data_type=data_type_j, tokenizer=tokenizer)
+ audio_or_path_or_list_j = load_audio_text_image_video(audio_or_path_or_list_j, fs=fs, audio_fs=audio_fs, data_type=data_type_j, tokenizer=tokenizer)
audio_or_path_or_list_ret[j].append(audio_or_path_or_list_j)
return audio_or_path_or_list_ret
else:
- return [load_audio_and_text_image_video(audio, fs=fs, audio_fs=audio_fs) for audio in audio_or_path_or_list]
+ return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs) for audio in audio_or_path_or_list]
if isinstance(audio_or_path_or_list, str) and os.path.exists(audio_or_path_or_list):
audio_or_path_or_list, audio_fs = torchaudio.load(audio_or_path_or_list)
--
Gitblit v1.9.1