From 4afdd97df4f8859a6c64814676dd2795c330437c Mon Sep 17 00:00:00 2001
From: zhuyunfeng <10596244@qq.com>
Date: Sat, 11 Mar 2023 13:23:34 +0800
Subject: [PATCH] Add finetune resampling function under small data type.
---
funasr/fileio/sound_scp.py | 11 ++++++++---
funasr/tasks/abs_task.py | 1 +
funasr/datasets/dataset.py | 10 +++++++---
3 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/funasr/datasets/dataset.py b/funasr/datasets/dataset.py
index 2af93d0..d1777a3 100644
--- a/funasr/datasets/dataset.py
+++ b/funasr/datasets/dataset.py
@@ -107,7 +107,7 @@
return value[()]
-def sound_loader(path, float_dtype=None):
+def sound_loader(path, dest_sample_rate=16000, float_dtype=None):
# The file is as follows:
# utterance_id_A /some/where/a.wav
# utterance_id_B /some/where/a.flac
@@ -115,7 +115,7 @@
# NOTE(kamo): SoundScpReader doesn't support pipe-fashion
# like Kaldi e.g. "cat a.wav |".
# NOTE(kamo): The audio signal is normalized to [-1,1] range.
- loader = SoundScpReader(path, normalize=True, always_2d=False)
+ loader = SoundScpReader(path, dest_sample_rate=dest_sample_rate, normalize=True, always_2d=False)
# SoundScpReader.__getitem__() returns Tuple[int, ndarray],
# but ndarray is desired, so Adapter class is inserted here
@@ -139,7 +139,7 @@
DATA_TYPES = {
"sound": dict(
func=sound_loader,
- kwargs=["float_dtype"],
+ kwargs=["dest_sample_rate","float_dtype"],
help="Audio format types which supported by sndfile wav, flac, etc."
"\n\n"
" utterance_id_a a.wav\n"
@@ -282,6 +282,7 @@
int_dtype: str = "long",
max_cache_size: Union[float, int, str] = 0.0,
max_cache_fd: int = 0,
+ dest_sample_rate: int = 16000,
):
assert check_argument_types()
if len(path_name_type_list) == 0:
@@ -295,6 +296,7 @@
self.float_dtype = float_dtype
self.int_dtype = int_dtype
self.max_cache_fd = max_cache_fd
+ self.dest_sample_rate = dest_sample_rate
self.loader_dict = {}
self.debug_info = {}
@@ -335,6 +337,8 @@
for key2 in dic["kwargs"]:
if key2 == "loader_type":
kwargs["loader_type"] = loader_type
+ elif key2 == "dest_sample_rate" and loader_type=="sound":
+ kwargs["dest_sample_rate"] = self.dest_sample_rate
elif key2 == "float_dtype":
kwargs["float_dtype"] = self.float_dtype
elif key2 == "int_dtype":
diff --git a/funasr/fileio/sound_scp.py b/funasr/fileio/sound_scp.py
index 459369e..dc872b0 100644
--- a/funasr/fileio/sound_scp.py
+++ b/funasr/fileio/sound_scp.py
@@ -4,6 +4,7 @@
import numpy as np
import soundfile
+import librosa
from typeguard import check_argument_types
from funasr.fileio.read_text import read_2column_text
@@ -30,6 +31,7 @@
dtype=np.int16,
always_2d: bool = False,
normalize: bool = False,
+ dest_sample_rate: int = 16000,
):
assert check_argument_types()
self.fname = fname
@@ -37,15 +39,18 @@
self.always_2d = always_2d
self.normalize = normalize
self.data = read_2column_text(fname)
+ self.dest_sample_rate = dest_sample_rate
def __getitem__(self, key):
wav = self.data[key]
if self.normalize:
# soundfile.read normalizes data to [-1,1] if dtype is not given
- array, rate = soundfile.read(wav, always_2d=self.always_2d)
+ array, rate = librosa.load(
+ wav, sr=self.dest_sample_rate, mono=not self.always_2d
+ )
else:
- array, rate = soundfile.read(
- wav, dtype=self.dtype, always_2d=self.always_2d
+ array, rate = librosa.load(
+ wav, sr=self.dest_sample_rate, mono=not self.always_2d, dtype=self.dtype
)
return rate, array
diff --git a/funasr/tasks/abs_task.py b/funasr/tasks/abs_task.py
index 723a67c..e0884ce 100644
--- a/funasr/tasks/abs_task.py
+++ b/funasr/tasks/abs_task.py
@@ -1576,6 +1576,7 @@
preprocess=iter_options.preprocess_fn,
max_cache_size=iter_options.max_cache_size,
max_cache_fd=iter_options.max_cache_fd,
+ dest_sample_rate=(getattr(args, "frontend_conf", None) or {}).get("fs", 16000),
)
cls.check_task_requirements(
dataset, args.allow_variable_data_keys, train=iter_options.train
--
Gitblit v1.9.1