From 26d642bfdf59a50365a9c8158acb223cae1004dc Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Tue, 23 Apr 2024 20:13:44 +0800
Subject: [PATCH] Dev gzf exp (#1651)
---
funasr/datasets/audio_datasets/samplers.py | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/funasr/datasets/audio_datasets/samplers.py b/funasr/datasets/audio_datasets/samplers.py
index c274f75..1394f7e 100644
--- a/funasr/datasets/audio_datasets/samplers.py
+++ b/funasr/datasets/audio_datasets/samplers.py
@@ -301,6 +301,7 @@
batch_type="token",
num_replicas=None,
rank=None,
+ rank_split=False,
shuffle=True,
drop_last=False,
is_training: bool = True,
@@ -314,6 +315,12 @@
except:
rank = 0
num_replicas = 1
+
+ # if rank_split:
+ # logging.info(f"Warning, rank_split: {rank_split}, batch and shuffle data in local rank")
+ # rank = 0
+ # num_replicas = 1
+
self.rank = rank
self.num_replicas = num_replicas
self.dataset = dataset
@@ -324,7 +331,7 @@
self.drop_last = drop_last
self.total_size = len(self.dataset)
- # self.num_samples = int(math.ceil(self.total_size / self.num_replicas))
+ self.num_samples = int(math.ceil(self.total_size / self.num_replicas))
self.epoch = 0
self.sort_size = sort_size * num_replicas
self.max_token_length = kwargs.get("max_token_length", 2048)
@@ -350,7 +357,7 @@
max_len_in_batch = 0
for idx in buffer:
original_sample_length = self.dataset.get_source_len(idx)
- if original_sample_length > self.max_sample_length:
+ if original_sample_length > self.max_token_length:
continue
sample_length = 1 if self.batch_type == "example" else original_sample_length
potential_batch_length = max(max_len_in_batch, sample_length) * (len(batch) + 1)
--
Gitblit v1.9.1