From 28ccfbfc51068a663a80764e14074df5edf2b5ba Mon Sep 17 00:00:00 2001
From: kongdeqiang <kongdeqiang960204@163.com>
Date: Fri, 13 Mar 2026 17:41:41 +0800
Subject: [PATCH] 提交: index_ds: drop over-length samples by speech_length/text_length and store source_len
---
funasr/datasets/openai_datasets/index_ds.py | 31 ++++++++++++++++++++++++++-----
1 files changed, 26 insertions(+), 5 deletions(-)
diff --git a/funasr/datasets/openai_datasets/index_ds.py b/funasr/datasets/openai_datasets/index_ds.py
index 1c48cd2..010d2d5 100644
--- a/funasr/datasets/openai_datasets/index_ds.py
+++ b/funasr/datasets/openai_datasets/index_ds.py
@@ -15,7 +15,8 @@
def __init__(self, path: str, **kwargs):
super().__init__()
- self.max_source_length = kwargs.get("max_source_length", 2048)
+
+ self.max_source_length = kwargs.get("max_source_length", 3000)
self.min_source_length = kwargs.get("min_source_length", 0)
self.max_target_length = kwargs.get("max_target_length", 2048)
self.min_target_length = kwargs.get("min_target_length", 0)
@@ -48,7 +49,19 @@
for file_json in file_list:
with open(file_json.strip(), encoding="utf-8") as fin:
for line in fin:
- data = json.loads(line.strip())["messages"]
+ data_dict = json.loads(line.strip())
+ data = data_dict["messages"]
+ speech_length = data_dict.get("speech_length", -1) // 8
+ text_length = data_dict.get("text_length", 0)
+ if speech_length > self.max_source_length:
+ logging.info(
+ f"speech_length: {speech_length} > {self.max_source_length}, drop it"
+ )
+ continue
+ if text_length > self.max_target_length:
+ continue
+
+ self.max_target_length = kwargs.get("max_target_length", 2048)
system, user, assistant = [], [], []
for i, item in enumerate(data):
@@ -63,7 +76,12 @@
system = system * len(user)
- contents_i = {"system": system, "user": user, "assistant": assistant}
+ contents_i = {
+ "system": system,
+ "user": user,
+ "assistant": assistant,
+ "source_len": speech_length + text_length,
+ }
contents.append(contents_i)
self.contents = contents
@@ -80,11 +98,14 @@
return data
def get_source_len(self, data_dict):
- return len(data_dict["system"]) + len(data_dict["user"])
+ source_len = data_dict.get("source_len", -1)
+ if source_len < 0:
+ source_len = len(data_dict["system"]) + len(data_dict["user"])
+ return source_len
def get_target_len(self, data_dict):
- return len(data_dict["assistant"])
+ return 0
if __name__ == "__main__":
--
Gitblit v1.9.1