From 80bd14e6bbb7bb282ff3832194648dc4a16157ca Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Thu, 25 Apr 2024 10:41:14 +0800
Subject: [PATCH] Dev gzf exp (#1657)
---
funasr/datasets/audio_datasets/index_ds.py | 5 +++++
1 file changed, 5 insertions(+), 0 deletions(-)
diff --git a/funasr/datasets/audio_datasets/index_ds.py b/funasr/datasets/audio_datasets/index_ds.py
index d26124b..da008b4 100644
--- a/funasr/datasets/audio_datasets/index_ds.py
+++ b/funasr/datasets/audio_datasets/index_ds.py
@@ -21,6 +21,7 @@
self.min_source_length = kwargs.get("min_source_length", 0)
self.max_target_length = kwargs.get("max_target_length", 2048)
self.min_target_length = kwargs.get("min_target_length", 0)
+ self.max_token_length = kwargs.get("max_token_length", 2200)
is_training = kwargs.get("is_training", True)
if not (path.endswith(".jsonl") or path.endswith(".json")):
@@ -103,6 +104,10 @@
or target_len > self.max_target_length
):
continue
+
+ if (source_len + target_len) > self.max_token_length:
+ continue
+
contents_i = {
"source": source,
"prompt": prompt,
--
Gitblit v1.9.1