From e9d2cfc3a134b00f4e98271fbee3838d1ccecbcc Mon Sep 17 00:00:00 2001
From: VirtuosoQ <2416050435@qq.com>
Date: 星期五, 26 四月 2024 14:59:30 +0800
Subject: [PATCH] FunASR java http client
---
funasr/datasets/audio_datasets/espnet_samplers.py | 18 ++++++++++++------
1 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/funasr/datasets/audio_datasets/espnet_samplers.py b/funasr/datasets/audio_datasets/espnet_samplers.py
index 1524a6a..6b38bc2 100644
--- a/funasr/datasets/audio_datasets/espnet_samplers.py
+++ b/funasr/datasets/audio_datasets/espnet_samplers.py
@@ -32,8 +32,9 @@
def __init__(self, dataset,
batch_size,
batch_type="token",
- num_replicas=None,
rank=None,
+ num_replicas=None,
+ rank_split=False,
shuffle=True,
drop_last=False,
is_training: bool = True,
@@ -47,6 +48,10 @@
except:
rank = 0
num_replicas = 1
+ # if rank_split:
+ # logging.info(f"Warning, rank_split: {rank_split}, batch and shuffle data in local rank")
+ # rank = 0
+ # num_replicas = 1
self.rank = rank
self.num_replicas = num_replicas
self.dataset = dataset
@@ -56,16 +61,17 @@
self.shuffle = shuffle and is_training
self.drop_last = drop_last
- # self.total_size = len(self.dataset)
- # self.num_samples = int(math.ceil(self.total_size / self.num_replicas))
+ self.total_size = len(self.dataset)
+ self.num_samples = int(math.ceil(self.total_size / self.num_replicas))
self.epoch = 0
self.sort_size = sort_size * num_replicas
self.max_token_length = kwargs.get("max_token_length", 2048)
+ self.min_token_length = kwargs.get("min_token_length", 0)
self.length_scale_source = kwargs.get("length_scale_source", 1.0)
- super().__init__(dataset, num_replicas=num_replicas, rank=rank,
- shuffle=shuffle, drop_last=drop_last)
+ # super().__init__(dataset, num_replicas=num_replicas, rank=rank,
+ # shuffle=shuffle, drop_last=drop_last)
def __iter__(self):
if self.shuffle:
g = torch.Generator()
@@ -85,7 +91,7 @@
for idx in sorted_indices:
original_sample_length = self.dataset.get_source_len(idx)
- if original_sample_length > self.max_token_length: # Skip samples that exceed the max length
+ if original_sample_length < self.min_token_length or original_sample_length > self.max_token_length: # Skip samples whose length is below min_token_length or above max_token_length
continue
# Set sample_length based on the batch type
sample_length = 1 if self.batch_type == "example" else original_sample_length
--
Gitblit v1.9.1