From c542eacb0aadcbc49c63db40429fca4e08f807a4 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期五, 21 七月 2023 10:27:35 +0800
Subject: [PATCH] Merge branch 'main' of github.com:alibaba-damo-academy/FunASR add
---
funasr/fileio/sound_scp.py | 103 +++++++++++++++++++++++++++++++++++++++++++++++----
1 files changed, 95 insertions(+), 8 deletions(-)
diff --git a/funasr/fileio/sound_scp.py b/funasr/fileio/sound_scp.py
index 459369e..b912f1e 100644
--- a/funasr/fileio/sound_scp.py
+++ b/funasr/fileio/sound_scp.py
@@ -1,12 +1,84 @@
import collections.abc
from pathlib import Path
-from typing import Union
+from typing import List, Tuple, Union
+import random
import numpy as np
import soundfile
-from typeguard import check_argument_types
+import librosa
+
+import torch
+import torchaudio
from funasr.fileio.read_text import read_2column_text
+
+def soundfile_read(
+ wavs: Union[str, List[str]],
+ dtype=None,
+ always_2d: bool = False,
+ concat_axis: int = 1,
+ start: int = 0,
+ end: int = None,
+ return_subtype: bool = False,
+) -> Tuple[np.array, int]:
+ if isinstance(wavs, str):
+ wavs = [wavs]
+
+ arrays = []
+ subtypes = []
+ prev_rate = None
+ prev_wav = None
+ for wav in wavs:
+ with soundfile.SoundFile(wav) as f:
+ f.seek(start)
+ if end is not None:
+ frames = end - start
+ else:
+ frames = -1
+ if dtype == "float16":
+ array = f.read(
+ frames,
+ dtype="float32",
+ always_2d=always_2d,
+ ).astype(dtype)
+ else:
+ array = f.read(frames, dtype=dtype, always_2d=always_2d)
+ rate = f.samplerate
+ subtype = f.subtype
+ subtypes.append(subtype)
+
+ if len(wavs) > 1 and array.ndim == 1 and concat_axis == 1:
+ # array: (Time, Channel)
+ array = array[:, None]
+
+ if prev_wav is not None:
+ if prev_rate != rate:
+ raise RuntimeError(
+ f"'{prev_wav}' and '{wav}' have mismatched sampling rate: "
+ f"{prev_rate} != {rate}"
+ )
+
+ dim1 = arrays[0].shape[1 - concat_axis]
+ dim2 = array.shape[1 - concat_axis]
+ if dim1 != dim2:
+ raise RuntimeError(
+ "Shapes must match with "
+ f"{1 - concat_axis} axis, but gut {dim1} and {dim2}"
+ )
+
+ prev_rate = rate
+ prev_wav = wav
+ arrays.append(array)
+
+ if len(arrays) == 1:
+ array = arrays[0]
+ else:
+ array = np.concatenate(arrays, axis=concat_axis)
+
+ if return_subtype:
+ return array, rate, subtypes
+ else:
+ return array, rate
class SoundScpReader(collections.abc.Mapping):
@@ -30,23 +102,39 @@
dtype=np.int16,
always_2d: bool = False,
normalize: bool = False,
+ dest_sample_rate: int = 16000,
+ speed_perturb: Union[list, tuple] = None,
):
- assert check_argument_types()
self.fname = fname
self.dtype = dtype
self.always_2d = always_2d
self.normalize = normalize
self.data = read_2column_text(fname)
+ self.dest_sample_rate = dest_sample_rate
+ self.speed_perturb = speed_perturb
def __getitem__(self, key):
wav = self.data[key]
if self.normalize:
# soundfile.read normalizes data to [-1,1] if dtype is not given
- array, rate = soundfile.read(wav, always_2d=self.always_2d)
- else:
- array, rate = soundfile.read(
- wav, dtype=self.dtype, always_2d=self.always_2d
+ array, rate = librosa.load(
+ wav, sr=self.dest_sample_rate, mono=self.always_2d
)
+ else:
+ array, rate = librosa.load(
+ wav, sr=self.dest_sample_rate, mono=self.always_2d, dtype=self.dtype
+ )
+
+ if self.speed_perturb is not None:
+ speed = random.choice(self.speed_perturb)
+ if speed != 1.0:
+ array, _ = torchaudio.sox_effects.apply_effects_tensor(
+ torch.tensor(array).view(1, -1), rate,
+ [['speed', str(speed)], ['rate', str(rate)]])
+ array = array.view(-1).numpy()
+
+ if array.ndim==2:
+ array=array.transpose((1, 0))
return rate, array
@@ -89,7 +177,6 @@
format="wav",
dtype=None,
):
- assert check_argument_types()
self.dir = Path(outdir)
self.dir.mkdir(parents=True, exist_ok=True)
scpfile = Path(scpfile)
--
Gitblit v1.9.1