From a883f2342aae400552b628bab973d8a30346a613 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期一, 29 四月 2024 22:49:35 +0800
Subject: [PATCH] batch

---
 funasr/datasets/audio_datasets/update_jsonl.py    |    4 ++--
 funasr/datasets/audio_datasets/espnet_samplers.py |    7 +++++--
 funasr/datasets/audio_datasets/index_ds.py        |    2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/funasr/datasets/audio_datasets/espnet_samplers.py b/funasr/datasets/audio_datasets/espnet_samplers.py
index cb30a28..e155cd7 100644
--- a/funasr/datasets/audio_datasets/espnet_samplers.py
+++ b/funasr/datasets/audio_datasets/espnet_samplers.py
@@ -71,7 +71,7 @@
         self.max_token_length = kwargs.get("max_token_length", 2048)
         self.min_token_length = kwargs.get("min_token_length", 0)
         self.length_scale_source = kwargs.get("length_scale_source", 1.0)
-        self.start_step = 0
+        self.start_step = start_step
         if self.start_step > 0:
             logging.info(f"Warning, start_step > 0, dataloader start from step: {self.start_step}")
         # super().__init__(dataset, num_replicas=num_replicas, rank=rank,
@@ -146,7 +146,10 @@
         start_idx = self.rank * batches_per_rank
         end_idx = start_idx + batches_per_rank
         rank_batches = buffer_batches[start_idx + self.start_step : end_idx]
-
+        if self.start_step > 0:
+            logging.info(
+                f"Warning, rank: {self.rank}, dataloader start from step: {self.start_step}, batch_num_before: {end_idx-start_idx}, now: {len(rank_batches)}"
+            )
         # Return an iterator over the batches for the current rank
         return iter(rank_batches)
 
diff --git a/funasr/datasets/audio_datasets/index_ds.py b/funasr/datasets/audio_datasets/index_ds.py
index 70581e8..385218a 100644
--- a/funasr/datasets/audio_datasets/index_ds.py
+++ b/funasr/datasets/audio_datasets/index_ds.py
@@ -35,7 +35,7 @@
             with open(path, encoding="utf-8") as fin:
                 file_list_all = fin.readlines()
 
-                num_per_slice = (len(file_list_all) - 1) // data_split_num + 1
+                num_per_slice = (len(file_list_all) - 1) // data_split_num + 1  # 16
                 file_list = file_list_all[
                     data_split_i * num_per_slice : (data_split_i + 1) * num_per_slice
                 ]
diff --git a/funasr/datasets/audio_datasets/update_jsonl.py b/funasr/datasets/audio_datasets/update_jsonl.py
index 6fe377c..561811e 100644
--- a/funasr/datasets/audio_datasets/update_jsonl.py
+++ b/funasr/datasets/audio_datasets/update_jsonl.py
@@ -50,8 +50,8 @@
     sample_num = len(waveform)
     source_len = int(sample_num / 16000 * 1000 / 10)
     source_len_old = data["source_len"]
-    if source_len_old != source_len:
-        print(f"wav: {wav_path}, old: {source_len_old}, new: {source_len}")
+    if (source_len_old - source_len) > 100 or (source_len - source_len_old) > 100:
+        print(f"old: {source_len_old}, new: {source_len}, wav: {wav_path}")
     data["source_len"] = source_len
     jsonl_line = json.dumps(data, ensure_ascii=False)
     lines[i] = jsonl_line

--
Gitblit v1.9.1