zhifu gao
2024-04-25 80bd14e6bbb7bb282ff3832194648dc4a16157ca
funasr/datasets/audio_datasets/index_ds.py
@@ -21,6 +21,7 @@
        self.min_source_length = kwargs.get("min_source_length", 0)
        self.max_target_length = kwargs.get("max_target_length", 2048)
        self.min_target_length = kwargs.get("min_target_length", 0)
        self.max_token_length = kwargs.get("max_token_length", 2200)
        is_training = kwargs.get("is_training", True)
        if not (path.endswith(".jsonl") or path.endswith(".json")):
@@ -103,6 +104,10 @@
                            or target_len > self.max_target_length
                        ):
                            continue
                        if (source_len + target_len) > self.max_token_length:
                            continue
                        contents_i = {
                            "source": source,
                            "prompt": prompt,