游雁
2024-01-05 4f98546f3693482f8f34aa5f11ced31381c58724
load_audio_text_image_video
12 files changed, 52 lines modified
funasr/bin/inference.py                      | 6 +++---
funasr/datasets/audio_datasets/datasets.py   | 2 +-
funasr/models/bicif_paraformer/model.py      | 4 ++--
funasr/models/contextual_paraformer/model.py | 4 ++--
funasr/models/fsmn_vad/model.py              | 4 ++--
funasr/models/monotonic_aligner/model.py     | 4 ++--
funasr/models/paraformer/model.py            | 4 ++--
funasr/models/paraformer_streaming/model.py  | 6 +++---
funasr/models/seaco_paraformer/model.py      | 4 ++--
funasr/models/transducer/model.py            | 4 ++--
funasr/models/transformer/model.py           | 4 ++--
funasr/utils/load_utils.py                   | 6 +++---
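This commit is a pure rename: the loader load_audio_and_text_image_video becomes load_audio_text_image_video. The definition lives in funasr/utils/load_utils.py, and every import and call site is updated in lockstep; the signature and behavior are unchanged. A minimal before/after sketch (assuming a FunASR checkout at this commit; the wav path is illustrative):

    from funasr.utils.load_utils import load_audio_text_image_video

    # Before this commit the same call was:
    #   from funasr.utils.load_utils import load_audio_and_text_image_video
    #   samples = load_audio_and_text_image_video("utt1.wav", fs=16000)
    # After: identical signature, new name.
    samples = load_audio_text_image_video("utt1.wav", fs=16000, audio_fs=16000)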
funasr/bin/inference.py
@@ -17,7 +17,7 @@
import string
from funasr.register import tables
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils.vad_utils import slice_padding_audio_samples
from funasr.utils.timestamp_tools import time_stamp_sentence
@@ -278,7 +278,7 @@
            key = res[i]["key"]
            vadsegments = res[i]["value"]
            input_i = data_list[i]
-            speech = load_audio_and_text_image_video(input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000))
+            speech = load_audio_text_image_video(input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000))
            speech_lengths = len(speech)
            n = len(vadsegments)
            data_with_index = [(vadsegments[i], i) for i in range(n)]
@@ -417,7 +417,7 @@
            # extract fbank feats
            time1 = time.perf_counter()
-            audio_sample_list = load_audio_and_text_image_video(data_batch, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
+            audio_sample_list = load_audio_text_image_video(data_batch, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
            time2 = time.perf_counter()
            meta_data["load_data"] = f"{time2 - time1:0.3f}"
            speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
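Note the argument convention shared by all call sites in this commit: fs is the sample rate the model frontend expects (frontend.fs), while audio_fs is the rate of the incoming raw audio, defaulting to kwargs.get("fs", 16000). When the two differ, the loader is responsible for resampling; a hedged sketch of that step using torchaudio (the helper name _resample_if_needed is illustrative, not part of this commit):

    import torch
    import torchaudio

    def _resample_if_needed(waveform: torch.Tensor, audio_fs: int, fs: int) -> torch.Tensor:
        # Resample from the source rate (audio_fs) to the frontend's rate (fs).
        if audio_fs != fs:
            waveform = torchaudio.transforms.Resample(orig_freq=audio_fs, new_freq=fs)(waveform)
        return waveform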
funasr/datasets/audio_datasets/datasets.py
@@ -8,7 +8,7 @@
import time
import logging
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.register import tables
@tables.register("dataset_classes", "AudioDataset")
funasr/models/bicif_paraformer/model.py
@@ -23,7 +23,7 @@
from funasr.models.paraformer.search import Hypothesis
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
@@ -243,7 +243,7 @@
        else:
            # extract fbank feats
            time1 = time.perf_counter()
-            audio_sample_list = load_audio_and_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
+            audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
            time2 = time.perf_counter()
            meta_data["load_data"] = f"{time2 - time1:0.3f}"
            speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
funasr/models/contextual_paraformer/model.py
@@ -46,7 +46,7 @@
    @contextmanager
    def autocast(enabled=True):
        yield
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
@@ -337,7 +337,7 @@
        
        # extract fbank feats
        time1 = time.perf_counter()
-        audio_sample_list = load_audio_and_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
+        audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
        time2 = time.perf_counter()
        meta_data["load_data"] = f"{time2 - time1:0.3f}"
        speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
funasr/models/fsmn_vad/model.py
@@ -9,7 +9,7 @@
from typing import Optional
import time
from funasr.register import tables
-from funasr.utils.load_utils import load_audio_and_text_image_video,extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video,extract_fbank
from funasr.utils.datadir_writer import DatadirWriter
from torch.nn.utils.rnn import pad_sequence
@@ -544,7 +544,7 @@
        else:
            # extract fbank feats
            time1 = time.perf_counter()
-            audio_sample_list = load_audio_and_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
+            audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
            time2 = time.perf_counter()
            meta_data["load_data"] = f"{time2 - time1:0.3f}"
            speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
funasr/models/monotonic_aligner/model.py
@@ -13,7 +13,7 @@
from funasr.utils.datadir_writer import DatadirWriter
from funasr.register import tables
from funasr.models.ctc.ctc import CTC
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
@tables.register("model_classes", "monotonicaligner")
@@ -154,7 +154,7 @@
        meta_data = {}
        # extract fbank feats
        time1 = time.perf_counter()
-        audio_list, text_token_int_list = load_audio_and_text_image_video(data_in,
+        audio_list, text_token_int_list = load_audio_text_image_video(data_in,
                                                                            fs=frontend.fs, 
                                                                            audio_fs=kwargs.get("fs", 16000), 
                                                                            data_type=kwargs.get("data_type", "sound"), 
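The aligner is the one call site that unpacks two return values: given a data_type spec and a tokenizer, the loader yields audio samples and tokenized text side by side. A hedged call sketch (paths and text are illustrative; the nesting follows the recursive branch shown in funasr/utils/load_utils.py at the end of this patch):

    # one [wav, transcript] pair per utterance; data_type names each slot
    audio_list, text_token_int_list = load_audio_text_image_video(
        [["utt1.wav", "hello world"]],
        fs=16000,
        audio_fs=16000,
        data_type=["sound", "text"],
        tokenizer=tokenizer,  # assumed: a FunASR tokenizer instance in scope
    )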
funasr/models/paraformer/model.py
@@ -22,7 +22,7 @@
from torch.cuda.amp import autocast
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
from funasr.register import tables
@@ -466,7 +466,7 @@
        else:
            # extract fbank feats
            time1 = time.perf_counter()
-            audio_sample_list = load_audio_and_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer)
+            audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer)
            time2 = time.perf_counter()
            meta_data["load_data"] = f"{time2 - time1:0.3f}"
            speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend)
funasr/models/paraformer_streaming/model.py
@@ -40,7 +40,7 @@
    @contextmanager
    def autocast(enabled=True):
        yield
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
@@ -483,7 +483,7 @@
        meta_data = {}
        # extract fbank feats
        time1 = time.perf_counter()
-        audio_sample_list = load_audio_and_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
+        audio_sample_list = load_audio_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
        time2 = time.perf_counter()
        meta_data["load_data"] = f"{time2 - time1:0.3f}"
        speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=self.frontend)
@@ -761,7 +761,7 @@
        meta_data = {}
        # extract fbank feats
        time1 = time.perf_counter()
-        audio_sample_list = load_audio_and_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
+        audio_sample_list = load_audio_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
        time2 = time.perf_counter()
        meta_data["load_data"] = f"{time2 - time1:0.3f}"
        speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
funasr/models/seaco_paraformer/model.py
@@ -35,7 +35,7 @@
    @contextmanager
    def autocast(enabled=True):
        yield
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
@@ -327,7 +327,7 @@
        
        # extract fbank feats
        time1 = time.perf_counter()
-        audio_sample_list = load_audio_and_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
+        audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
        time2 = time.perf_counter()
        meta_data["load_data"] = f"{time2 - time1:0.3f}"
        speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
funasr/models/transducer/model.py
@@ -45,7 +45,7 @@
    @contextmanager
    def autocast(enabled=True):
        yield
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
from funasr.models.transformer.utils.nets_utils import get_transducer_task_io
@@ -517,7 +517,7 @@
        meta_data = {}
        # extract fbank feats
        time1 = time.perf_counter()
-        audio_sample_list = load_audio_and_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
+        audio_sample_list = load_audio_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
        time2 = time.perf_counter()
        meta_data["load_data"] = f"{time2 - time1:0.3f}"
        speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=self.frontend)
funasr/models/transformer/model.py
@@ -12,7 +12,7 @@
from funasr.metrics.compute_acc import th_accuracy
# from funasr.models.e2e_asr_common import ErrorCalculator
from funasr.train_utils.device_funcs import force_gatherable
-from funasr.utils.load_utils import load_audio_and_text_image_video, extract_fbank
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
from funasr.utils import postprocess_utils
from funasr.utils.datadir_writer import DatadirWriter
from funasr.register import tables
@@ -392,7 +392,7 @@
        meta_data = {}
        # extract fbank feats
        time1 = time.perf_counter()
-        audio_sample_list = load_audio_and_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
+        audio_sample_list = load_audio_text_image_video(data_in, fs=self.frontend.fs, audio_fs=kwargs.get("fs", 16000))
        time2 = time.perf_counter()
        meta_data["load_data"] = f"{time2 - time1:0.3f}"
        speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=self.frontend)
funasr/utils/load_utils.py
@@ -27,7 +27,7 @@
#     return audio_or_path_or_list
-def load_audio_and_text_image_video(audio_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type=None, tokenizer=None):
+def load_audio_text_image_video(audio_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type=None, tokenizer=None):
    if isinstance(audio_or_path_or_list, (list, tuple)):
        if data_type is not None and isinstance(data_type, (list, tuple)):
@@ -37,12 +37,12 @@
                
                for j, (data_type_j, audio_or_path_or_list_j) in enumerate(zip(data_type_i, audio_or_path_or_list_i)):
                    
-                    audio_or_path_or_list_j = load_audio_and_text_image_video(audio_or_path_or_list_j, fs=fs, audio_fs=audio_fs, data_type=data_type_j, tokenizer=tokenizer)
+                    audio_or_path_or_list_j = load_audio_text_image_video(audio_or_path_or_list_j, fs=fs, audio_fs=audio_fs, data_type=data_type_j, tokenizer=tokenizer)
                    audio_or_path_or_list_ret[j].append(audio_or_path_or_list_j)
            return audio_or_path_or_list_ret
        else:
-            return [load_audio_and_text_image_video(audio, fs=fs, audio_fs=audio_fs) for audio in audio_or_path_or_list]
+            return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs) for audio in audio_or_path_or_list]
    
    if isinstance(audio_or_path_or_list, str) and os.path.exists(audio_or_path_or_list):
        audio_or_path_or_list, audio_fs = torchaudio.load(audio_or_path_or_list)
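The definition makes the accepted inputs explicit: a plain path is loaded with torchaudio.load, which also overwrites audio_fs with the file's true sample rate; a flat list is mapped element-wise through a recursive call; and a nested list paired with a data_type list is regrouped by slot, so each modality comes back as its own list. A usage summary, hedged as a sketch rather than a spec:

    from funasr.utils.load_utils import load_audio_text_image_video

    one = load_audio_text_image_video("utt1.wav")                  # single path -> samples
    many = load_audio_text_image_video(["utt1.wav", "utt2.wav"])   # flat list   -> [samples, samples]
    # nested pairs + data_type=["sound", "text"] -> [[all sounds], [all token id lists]]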