From 8f596af4be1c2e5c4e4b4a7008ba96f412d40fca Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Mon, 29 Apr 2024 14:32:43 +0800
Subject: [PATCH] batch
---
funasr/datasets/large_datasets/datapipes/batch.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/funasr/datasets/large_datasets/datapipes/batch.py b/funasr/datasets/large_datasets/datapipes/batch.py
index c980ae3..aeeb451 100644
--- a/funasr/datasets/large_datasets/datapipes/batch.py
+++ b/funasr/datasets/large_datasets/datapipes/batch.py
@@ -19,13 +19,13 @@
class MaxTokenBucketizerIterDataPipe(IterableDataset):
def __init__(
- self,
- datapipe,
- batch_size=8000,
- len_fn=_default_len_fn,
- buffer_size=10240,
- sort_size=500,
- batch_mode="padding",
+ self,
+ datapipe,
+ batch_size=8000,
+ len_fn=_default_len_fn,
+ buffer_size=10240,
+ sort_size=500,
+ batch_mode="padding",
):
assert batch_size > 0, "Batch size is required to be larger than 0!"
assert buffer_size >= -1, "Buffer size is required to be larger than -1!"
@@ -39,13 +39,14 @@
self.batch_mode = batch_mode
def set_epoch(self, epoch):
- self.epoch = epoch
+ self.datapipe.set_epoch(epoch)
def __iter__(self):
buffer = []
batch = []
bucket = []
max_lengths = 0
+ min_lengths = 999999
batch_lengths = 0
if self.batch_mode == "clipping":
--
Gitblit v1.9.1