From eaf9dda9e4d970af3d09db695e9e10c83ef94e25 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Wed, 17 Apr 2024 15:05:37 +0800
Subject: [PATCH] Dev gzf exp (#1624)

---
 funasr/datasets/audio_datasets/index_ds.py |   23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/funasr/datasets/audio_datasets/index_ds.py b/funasr/datasets/audio_datasets/index_ds.py
index 34f7b4f..5396c8a 100644
--- a/funasr/datasets/audio_datasets/index_ds.py
+++ b/funasr/datasets/audio_datasets/index_ds.py
@@ -92,7 +92,7 @@
             for line in fin:
                 data = json.loads(line.strip())
                 if "text" in data:  # for sft
-                    self.contents.append(data['text'])
+                    contents.append(data['text'])
                 if "source" in data:  # for speech lab pretrain
                     prompt = data.get("prompt", "<ASR>")
                     source = data["source"]
@@ -101,13 +101,20 @@
                     target_len = data.get("target_len", 0)
                     if "aishell" in source:
                         target = target.replace(" ", "")
-                    contents.append({"source": source,
-                                     "prompt": prompt,
-                                     "target": target,
-                                     "source_len": source_len,
-                                     "target_len": target_len,
-                                     }
-                                    )
+
+                    contents_i = {"source": source,
+                                 "prompt": prompt,
+                                 "target": target,
+                                 "source_len": source_len,
+                                 "target_len": target_len,
+                                 }
+                    text_language = data.get("text_language", None)
+                    if text_language is not None:
+                        contents_i["text_language"] = text_language
+                    audio_language = data.get("audio_language", None)
+                    if audio_language is not None:
+                        contents_i["audio_language"] = audio_language
+                    contents.append(contents_i)
 
         self.contents = contents
         

--
Gitblit v1.9.1