From 55c09aeaa25b4bb88a50e09ba68fa6ff00a6d676 Mon Sep 17 00:00:00 2001
From: shixian.shi <shixian.shi@alibaba-inc.com>
Date: Mon, 15 Jan 2024 20:10:39 +0800
Subject: [PATCH] update readme, fix seaco bug

---
 funasr/datasets/audio_datasets/datasets.py |   27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/funasr/datasets/audio_datasets/datasets.py b/funasr/datasets/audio_datasets/datasets.py
index 353a3a0..7839ff9 100644
--- a/funasr/datasets/audio_datasets/datasets.py
+++ b/funasr/datasets/audio_datasets/datasets.py
@@ -8,11 +8,14 @@
 import time
 import logging
 
-from funasr.datasets.audio_datasets.load_audio_extract_fbank import load_audio, extract_fbank
-from funasr.utils.register import register_class, registry_tables
+from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
+from funasr.register import tables
 
-@register_class("dataset_classes", "AudioDataset")
+@tables.register("dataset_classes", "AudioDataset")
 class AudioDataset(torch.utils.data.Dataset):
+	"""
+	AudioDataset
+	"""
 	def __init__(self,
 	             path,
 	             index_ds: str = None,
@@ -22,8 +25,19 @@
 	             float_pad_value: float = 0.0,
 	              **kwargs):
 		super().__init__()
-		index_ds_class = registry_tables.index_ds_classes.get(index_ds.lower())
+		index_ds_class = tables.index_ds_classes.get(index_ds)
 		self.index_ds = index_ds_class(path)
+		preprocessor_speech = kwargs.get("preprocessor_speech", None)
+		if preprocessor_speech:
+			preprocessor_speech_class = tables.preprocessor_speech_classes.get(preprocessor_speech)
+			preprocessor_speech = preprocessor_speech_class(**kwargs.get("preprocessor_speech_conf"))
+		self.preprocessor_speech = preprocessor_speech
+		preprocessor_text = kwargs.get("preprocessor_text", None)
+		if preprocessor_text:
+			preprocessor_text_class = tables.preprocessor_text_classes.get(preprocessor_text)
+			preprocessor_text = preprocessor_text_class(**kwargs.get("preprocessor_text_conf"))
+		self.preprocessor_text = preprocessor_text
+		
 		self.frontend = frontend
 		self.fs = 16000 if frontend is None else frontend.fs
 		self.data_type = "sound"
@@ -49,8 +63,13 @@
 		# pdb.set_trace()
 		source = item["source"]
 		data_src = load_audio(source, fs=self.fs)
+		if self.preprocessor_speech:
+			data_src = self.preprocessor_speech(data_src)
 		speech, speech_lengths = extract_fbank(data_src, data_type=self.data_type, frontend=self.frontend) # speech: [b, T, d]
+
 		target = item["target"]
+		if self.preprocessor_text:
+			target = self.preprocessor_text(target)
 		ids = self.tokenizer.encode(target)
 		ids_lengths = len(ids)
 		text, text_lengths = torch.tensor(ids, dtype=torch.int64), torch.tensor([ids_lengths], dtype=torch.int32)

--
Gitblit v1.9.1