From 33d3d2084403fd34b79c835d2f2fe04f6cd8f738 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Wed, 13 Sep 2023 09:33:54 +0800
Subject: [PATCH] Merge branch 'main' of github.com:alibaba-damo-academy/FunASR add
---
funasr/datasets/large_datasets/dataset.py | 37 ++++++++++++++++++++++++-------------
1 file changed, 24 insertions(+), 13 deletions(-)
diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index 68b63e1..6c166a5 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -6,6 +6,8 @@
import torch
import torch.distributed as dist
import torchaudio
+import numpy as np
+import soundfile
from kaldiio import ReadHelper
from torch.utils.data import IterableDataset
@@ -123,7 +125,14 @@
sample_dict["key"] = key
elif data_type == "sound":
key, path = item.strip().split()
- waveform, sampling_rate = torchaudio.load(path)
+ try:
+ waveform, sampling_rate = torchaudio.load(path)
+ except:
+ waveform, sampling_rate = soundfile.read(path, dtype='float32')
+ if waveform.ndim == 2:
+ waveform = waveform[:, 0]
+ waveform = np.expand_dims(waveform, axis=0)
+ waveform = torch.tensor(waveform)
if self.frontend_conf is not None:
if sampling_rate != self.frontend_conf["fs"]:
waveform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
@@ -193,14 +202,7 @@
data_types = conf.get("data_types", "kaldi_ark,text")
pre_hwfile = conf.get("pre_hwlist", None)
- pre_prob = conf.get("pre_prob", 0) # unused yet
-
- hw_config = {"sample_rate": conf.get("sample_rate", 0.6),
- "double_rate": conf.get("double_rate", 0.1),
- "hotword_min_length": conf.get("hotword_min_length", 2),
- "hotword_max_length": conf.get("hotword_max_length", 8),
- "pre_prob": conf.get("pre_prob", 0.0)}
-
+ # pre_prob = conf.get("pre_prob", 0) # unused yet
if pre_hwfile is not None:
pre_hwlist = []
with open(pre_hwfile, 'r') as fin:
@@ -208,6 +210,15 @@
pre_hwlist.append(line.strip())
else:
pre_hwlist = None
+
+ hw_config = {"sample_rate": conf.get("sample_rate", 0.6),
+ "double_rate": conf.get("double_rate", 0.1),
+ "hotword_min_length": conf.get("hotword_min_length", 2),
+ "hotword_max_length": conf.get("hotword_max_length", 8),
+ "pre_prob": conf.get("pre_prob", 0.0),
+ "pre_hwlist": pre_hwlist}
+
+
dataset = AudioDataset(scp_lists,
data_names,
@@ -218,15 +229,15 @@
mode=mode,
)
- filter_conf = conf.get('filter_conf', {})
- filter_fn = partial(filter, **filter_conf)
- dataset = FilterIterDataPipe(dataset, fn=filter_fn)
-
if "text" in data_names:
vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict, 'bpe_tokenizer': bpe_tokenizer, 'hw_config': hw_config}
tokenize_fn = partial(tokenize, **vocab)
dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)
+ filter_conf = conf.get('filter_conf', {})
+ filter_fn = partial(filter, **filter_conf)
+ dataset = FilterIterDataPipe(dataset, fn=filter_fn)
+
if shuffle:
buffer_conf = conf.get('shuffle_conf', {})
buffer_size = buffer_conf['shuffle_size']
--
Gitblit v1.9.1