From 01df8f330ccc754223d5e2d688dc0a55d27f2dcc Mon Sep 17 00:00:00 2001
From: querryton <72929808+querryton@users.noreply.github.com>
Date: Sat, 20 Apr 2024 16:07:13 +0800
Subject: [PATCH] [fix] Fix a bug in seaco_paraformer model "inference" function (#1639)

---
 funasr/datasets/audio_datasets/index_ds.py |   49 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/funasr/datasets/audio_datasets/index_ds.py b/funasr/datasets/audio_datasets/index_ds.py
index 008b08f..53419e8 100644
--- a/funasr/datasets/audio_datasets/index_ds.py
+++ b/funasr/datasets/audio_datasets/index_ds.py
@@ -1,6 +1,9 @@
+import os
 import json
 import torch
 import logging
+import concurrent.futures
+import librosa
 import torch.distributed as dist
 
 from funasr.register import tables
@@ -71,29 +74,53 @@
 @tables.register("index_ds_classes", "IndexDSJsonlRankFull")
 class IndexDSJsonlRankFull(torch.utils.data.Dataset):
     
-    def __init__(self, path):
+    def __init__(self, path: str, **kwargs):
         super().__init__()
-        
+        self.max_source_length = kwargs.get("max_source_length", 2048)
+        self.min_source_length = kwargs.get("min_source_length", 0)
+        self.max_target_length = kwargs.get("max_target_length", 2048)
+        self.min_target_length = kwargs.get("min_target_length", 0)
+        if isinstance(path, (list, tuple)): # wav.scp, text.txt/text.trans
+            from funasr.datasets.audio_datasets.scp2jsonl import gen_jsonl_from_wav_text_list
+            jsonl_outdir = os.path.dirname(path[0])
+            jsonl_name = "datalist_train.jsonl" if kwargs.get("is_training", True) else "datalist_val.jsonl"
+            jsonl_file_out = os.path.join(jsonl_outdir, jsonl_name)
+            if not os.path.exists(jsonl_file_out):
+                print(f"datalist is: {path}, generate jsonl from it")
+                gen_jsonl_from_wav_text_list(path, jsonl_file_out=jsonl_file_out, **kwargs)
+            path = jsonl_file_out
+
         contents = []
         with open(path, encoding='utf-8') as fin:
             for line in fin:
                 data = json.loads(line.strip())
                 if "text" in data:  # for sft
-                    self.contents.append(data['text'])
+                    contents.append(data['text'])
                 if "source" in data:  # for speech lab pretrain
                     prompt = data.get("prompt", "<ASR>")
                     source = data["source"]
                     target = data["target"]
                     source_len = data.get("source_len", 1)
                     target_len = data.get("target_len", 0)
-                    
-                    contents.append({"source": source,
-                                     "prompt": prompt,
-                                     "target": target,
-                                     "source_len": source_len,
-                                     "target_len": target_len,
-                                     }
-                                    )
+                    if "aishell" in source:
+                        target = target.replace(" ", "")
+                    if source_len < self.min_source_length or source_len > self.max_source_length:
+                        continue
+                    if target_len < self.min_target_length or target_len > self.max_target_length:
+                        continue
+                    contents_i = {"source": source,
+                                 "prompt": prompt,
+                                 "target": target,
+                                 "source_len": source_len,
+                                 "target_len": target_len,
+                                 }
+                    text_language = data.get("text_language", None)
+                    if text_language is not None:
+                        contents_i["text_language"] = text_language
+                    audio_language = data.get("audio_language", None)
+                    if audio_language is not None:
+                        contents_i["audio_language"] = audio_language
+                    contents.append(contents_i)
 
         self.contents = contents
         

--
Gitblit v1.9.1