From 55c09aeaa25b4bb88a50e09ba68fa6ff00a6d676 Mon Sep 17 00:00:00 2001
From: shixian.shi <shixian.shi@alibaba-inc.com>
Date: Mon, 15 Jan 2024 20:10:39 +0800
Subject: [PATCH] update readme, fix seaco bug
---
funasr/utils/load_utils.py | 85 +++++++++++++++++++++++-------------------
1 file changed, 46 insertions(+), 39 deletions(-)
diff --git a/funasr/utils/load_utils.py b/funasr/utils/load_utils.py
index 7f1b850..4e131a8 100644
--- a/funasr/utils/load_utils.py
+++ b/funasr/utils/load_utils.py
@@ -9,53 +9,59 @@
import time
import logging
from torch.nn.utils.rnn import pad_sequence
-
-# def load_audio(audio_or_path_or_list, fs: int=16000, audio_fs: int=16000):
-#
-# if isinstance(audio_or_path_or_list, (list, tuple)):
-# return [load_audio(audio, fs=fs, audio_fs=audio_fs) for audio in audio_or_path_or_list]
-#
-# if isinstance(audio_or_path_or_list, str) and os.path.exists(audio_or_path_or_list):
-# audio_or_path_or_list, audio_fs = torchaudio.load(audio_or_path_or_list)
-# audio_or_path_or_list = audio_or_path_or_list[0, :]
-# elif isinstance(audio_or_path_or_list, np.ndarray): # audio sample point
-# audio_or_path_or_list = np.squeeze(audio_or_path_or_list) #[n_samples,]
-#
-# if audio_fs != fs:
-# resampler = torchaudio.transforms.Resample(audio_fs, fs)
-# audio_or_path_or_list = resampler(audio_or_path_or_list[None, :])[0, :]
-# return audio_or_path_or_list
+try:
+ from funasr.download.file import download_from_url
+except:
+ print("urllib is not installed, if you infer from url, please install it first.")
-def load_audio_and_text_image_video(audio_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type=None, tokenizer=None):
- if isinstance(audio_or_path_or_list, (list, tuple)):
+
+def load_audio_text_image_video(data_or_path_or_list, fs: int = 16000, audio_fs: int = 16000, data_type="sound", tokenizer=None, **kwargs):
+ if isinstance(data_or_path_or_list, (list, tuple)):
if data_type is not None and isinstance(data_type, (list, tuple)):
- data_types = [data_type] * len(audio_or_path_or_list)
- audio_or_path_or_list_ret = [[] for d in data_type]
- for i, (data_type_i, audio_or_path_or_list_i) in enumerate(zip(data_types, audio_or_path_or_list)):
+ data_types = [data_type] * len(data_or_path_or_list)
+ data_or_path_or_list_ret = [[] for d in data_type]
+ for i, (data_type_i, data_or_path_or_list_i) in enumerate(zip(data_types, data_or_path_or_list)):
- for j, (data_type_j, audio_or_path_or_list_j) in enumerate(zip(data_type_i, audio_or_path_or_list_i)):
+ for j, (data_type_j, data_or_path_or_list_j) in enumerate(zip(data_type_i, data_or_path_or_list_i)):
- audio_or_path_or_list_j = load_audio_and_text_image_video(audio_or_path_or_list_j, fs=fs, audio_fs=audio_fs, data_type=data_type_j, tokenizer=tokenizer)
- audio_or_path_or_list_ret[j].append(audio_or_path_or_list_j)
+ data_or_path_or_list_j = load_audio_text_image_video(data_or_path_or_list_j, fs=fs, audio_fs=audio_fs, data_type=data_type_j, tokenizer=tokenizer, **kwargs)
+ data_or_path_or_list_ret[j].append(data_or_path_or_list_j)
- return audio_or_path_or_list_ret
+ return data_or_path_or_list_ret
else:
- return [load_audio_and_text_image_video(audio, fs=fs, audio_fs=audio_fs) for audio in audio_or_path_or_list]
+ return [load_audio_text_image_video(audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs) for audio in data_or_path_or_list]
- if isinstance(audio_or_path_or_list, str) and os.path.exists(audio_or_path_or_list):
- audio_or_path_or_list, audio_fs = torchaudio.load(audio_or_path_or_list)
- audio_or_path_or_list = audio_or_path_or_list[0, :]
- elif isinstance(audio_or_path_or_list, np.ndarray): # audio sample point
- audio_or_path_or_list = np.squeeze(audio_or_path_or_list) # [n_samples,]
- elif isinstance(audio_or_path_or_list, str) and data_type is not None and data_type == "text" and tokenizer is not None:
- audio_or_path_or_list = tokenizer.encode(audio_or_path_or_list)
+ if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith('http'): # download url to local file
+ data_or_path_or_list = download_from_url(data_or_path_or_list)
+
+ if isinstance(data_or_path_or_list, str) and os.path.exists(data_or_path_or_list): # local file
+ if data_type is None or data_type == "sound":
+ data_or_path_or_list, audio_fs = torchaudio.load(data_or_path_or_list)
+ data_or_path_or_list = data_or_path_or_list[0, :]
+ elif data_type == "text" and tokenizer is not None:
+ data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
+ elif data_type == "image": # TODO: image loading is not implemented yet
+ pass
+ elif data_type == "video": # TODO: video loading is not implemented yet
+ pass
+
+ # if data_in is a file or url, set is_final=True
+ if "cache" in kwargs:
+ kwargs["cache"]["is_final"] = True
+ elif isinstance(data_or_path_or_list, str) and data_type == "text" and tokenizer is not None:
+ data_or_path_or_list = tokenizer.encode(data_or_path_or_list)
+ elif isinstance(data_or_path_or_list, np.ndarray): # audio sample point
+ data_or_path_or_list = torch.from_numpy(data_or_path_or_list).squeeze() # [n_samples,]
+ else:
+ pass
+ # print(f"unsupport data type: {data_or_path_or_list}, return raw data")
if audio_fs != fs and data_type != "text":
resampler = torchaudio.transforms.Resample(audio_fs, fs)
- audio_or_path_or_list = resampler(audio_or_path_or_list[None, :])[0, :]
- return audio_or_path_or_list
+ data_or_path_or_list = resampler(data_or_path_or_list[None, :])[0, :]
+ return data_or_path_or_list
def load_bytes(input):
middle_data = np.frombuffer(input, dtype=np.int16)
@@ -72,7 +78,7 @@
array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
return array
-def extract_fbank(data, data_len = None, data_type: str="sound", frontend=None):
+def extract_fbank(data, data_len = None, data_type: str="sound", frontend=None, **kwargs):
# import pdb;
# pdb.set_trace()
if isinstance(data, np.ndarray):
@@ -87,7 +93,7 @@
elif isinstance(data, (list, tuple)):
data_list, data_len = [], []
for data_i in data:
- if isinstance(data, np.ndarray):
+ if isinstance(data_i, np.ndarray):
data_i = torch.from_numpy(data_i)
data_list.append(data_i)
data_len.append(data_i.shape[0])
@@ -95,8 +101,9 @@
# import pdb;
# pdb.set_trace()
# if data_type == "sound":
- data, data_len = frontend(data, data_len)
+ data, data_len = frontend(data, data_len, **kwargs)
if isinstance(data_len, (list, tuple)):
data_len = torch.tensor([data_len])
- return data.to(torch.float32), data_len.to(torch.int32)
\ No newline at end of file
+ return data.to(torch.float32), data_len.to(torch.int32)
+
--
Gitblit v1.9.1