From 1596f6f414f6f41da66506debb1dff19fffeb3ec Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Mon, 24 Jun 2024 11:55:17 +0800
Subject: [PATCH] fixbug hotwords
---
funasr/datasets/large_datasets/datapipes/batch.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/funasr/datasets/large_datasets/datapipes/batch.py b/funasr/datasets/large_datasets/datapipes/batch.py
index c980ae3..aeeb451 100644
--- a/funasr/datasets/large_datasets/datapipes/batch.py
+++ b/funasr/datasets/large_datasets/datapipes/batch.py
@@ -19,13 +19,13 @@
class MaxTokenBucketizerIterDataPipe(IterableDataset):
def __init__(
- self,
- datapipe,
- batch_size=8000,
- len_fn=_default_len_fn,
- buffer_size=10240,
- sort_size=500,
- batch_mode="padding",
+ self,
+ datapipe,
+ batch_size=8000,
+ len_fn=_default_len_fn,
+ buffer_size=10240,
+ sort_size=500,
+ batch_mode="padding",
):
assert batch_size > 0, "Batch size is required to be larger than 0!"
assert buffer_size >= -1, "Buffer size is required to be larger than -1!"
@@ -39,13 +39,14 @@
self.batch_mode = batch_mode
def set_epoch(self, epoch):
- self.epoch = epoch
+ self.datapipe.set_epoch(epoch)
def __iter__(self):
buffer = []
batch = []
bucket = []
max_lengths = 0
+ min_lengths = 999999
batch_lengths = 0
if self.batch_mode == "clipping":
--
Gitblit v1.9.1