From a016617c7ec98ab9c7475ff7d3b6150b98d5beeb Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Tue, 28 Feb 2023 18:36:52 +0800
Subject: [PATCH] Merge pull request #165 from alibaba-damo-academy/dev_cmz
---
funasr/datasets/large_datasets/dataset.py | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index 55b0678..61231d2 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -1,9 +1,10 @@
import os
import random
-import soundfile
+import numpy
from functools import partial
import torch
+import torchaudio
import torch.distributed as dist
from kaldiio import ReadHelper
from torch.utils.data import IterableDataset
@@ -117,21 +118,26 @@
sample_dict["key"] = key
elif data_type == "sound":
key, path = item.strip().split()
- mat, sampling_rate = soundfile.read(path)
+ waveform, sampling_rate = torchaudio.load(path)
+ waveform = waveform.numpy()
+ mat = waveform[0]
sample_dict[data_name] = mat
sample_dict["sampling_rate"] = sampling_rate
if data_name == "speech":
sample_dict["key"] = key
else:
text = item
- sample_dict[data_name] = text.strip().split()[1:]
+ segs = text.strip().split()
+ sample_dict[data_name] = segs[1:]
+ if "key" not in sample_dict:
+ sample_dict["key"] = segs[0]
yield sample_dict
self.close_reader(reader_list)
def len_fn_example(data):
- return len(data)
+ return 1
def len_fn_token(data):
@@ -145,6 +151,7 @@
def Dataset(data_list_file,
dict,
seg_dict,
+ punc_dict,
conf,
mode="train",
batch_mode="padding"):
@@ -159,7 +166,7 @@
dataset = FilterIterDataPipe(dataset, fn=filter_fn)
if "text" in data_names:
- vocab = {'vocab': dict, 'seg_dict': seg_dict}
+ vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict}
tokenize_fn = partial(tokenize, **vocab)
dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)
@@ -188,6 +195,10 @@
sort_size=sort_size,
batch_mode=batch_mode)
- dataset = MapperIterDataPipe(dataset, fn=padding if batch_mode == "padding" else clipping)
+ int_pad_value = conf.get("int_pad_value", -1)
+ float_pad_value = conf.get("float_pad_value", 0.0)
+ padding_conf = {"int_pad_value": int_pad_value, "float_pad_value": float_pad_value}
+ padding_fn = partial(padding, **padding_conf)
+ dataset = MapperIterDataPipe(dataset, fn=padding_fn if batch_mode == "padding" else clipping)
return dataset
--
Gitblit v1.9.1