From a016617c7ec98ab9c7475ff7d3b6150b98d5beeb Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Tue, 28 Feb 2023 18:36:52 +0800
Subject: [PATCH] Merge pull request #165 from alibaba-damo-academy/dev_cmz

---
 funasr/datasets/large_datasets/dataset.py |   19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index 8a2f2e6..61231d2 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -1,6 +1,6 @@
 import os
 import random
-import soundfile
+import numpy
 from functools import partial
 
 import torch
@@ -119,6 +119,7 @@
                     elif data_type == "sound":
                         key, path = item.strip().split()
                         waveform, sampling_rate = torchaudio.load(path)
+                        waveform = waveform.numpy()
                         mat = waveform[0]
                         sample_dict[data_name] = mat
                         sample_dict["sampling_rate"] = sampling_rate
@@ -126,14 +127,17 @@
                             sample_dict["key"] = key
                     else:
                         text = item
-                        sample_dict[data_name] = text.strip().split()[1:]
+                        segs = text.strip().split()
+                        sample_dict[data_name] = segs[1:]
+                        if "key" not in sample_dict:
+                            sample_dict["key"] = segs[0]
                 yield sample_dict
 
             self.close_reader(reader_list)
 
 
 def len_fn_example(data):
-    return len(data)
+    return 1
 
 
 def len_fn_token(data):
@@ -147,6 +151,7 @@
 def Dataset(data_list_file,
             dict,
             seg_dict,
+            punc_dict,
             conf,
             mode="train",
             batch_mode="padding"):
@@ -161,7 +166,7 @@
     dataset = FilterIterDataPipe(dataset, fn=filter_fn)
 
     if "text" in data_names:
-        vocab = {'vocab': dict, 'seg_dict': seg_dict}
+        vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict}
         tokenize_fn = partial(tokenize, **vocab)
         dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)
 
@@ -190,6 +195,10 @@
                                              sort_size=sort_size,
                                              batch_mode=batch_mode)
 
-    dataset = MapperIterDataPipe(dataset, fn=padding if batch_mode == "padding" else clipping)
+    int_pad_value = conf.get("int_pad_value", -1)
+    float_pad_value = conf.get("float_pad_value", 0.0)
+    padding_conf = {"int_pad_value": int_pad_value, "float_pad_value": float_pad_value}
+    padding_fn = partial(padding, **padding_conf)
+    dataset = MapperIterDataPipe(dataset, fn=padding_fn if batch_mode == "padding" else clipping)
 
     return dataset

--
Gitblit v1.9.1