python/FunASR-XL.git

parent: 9c0735b7 | 补丁 | 提交 | ignore whitespace

游雁

2024-03-22 fcbbe8af9f22a41611d9506af17cae1e422f9fec

update

24个文件已修改

	examples/aishell/branchformer/conf/branchformer_12e_6d_2048_256.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	examples/aishell/conformer/conf/conformer_12e_6d_2048_256.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	examples/aishell/e_branchformer/conf/e_branchformer_12e_6d_2048_256.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	examples/aishell/paraformer/conf/paraformer_conformer_12e_6d_2048_256.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	examples/aishell/transformer/conf/transformer_12e_6d_2048_256.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	examples/industrial_data_pretraining/llm_asr/conf/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/compute_audio_cmvn.py	23 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/train.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/audio_datasets/jsonl2scp.py	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/audio_datasets/scp2jsonl.py	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/dataloader_entry.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/bicif_paraformer/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/branchformer/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/conformer/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/contextual_paraformer/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/e_branchformer/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/monotonic_aligner/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/paraformer/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/paraformer_streaming/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/sanm/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/scama/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/seaco_paraformer/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/transformer/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/uniasr/template.yaml	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 examples/aishell/branchformer/conf/branchformer_12e_6d_2048_256.yaml

@@ -94,7 +94,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: RankFullLocalShuffleBatchSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 32 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 examples/aishell/conformer/conf/conformer_12e_6d_2048_256.yaml

@@ -94,7 +94,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: RankFullLocalShuffleBatchSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 32 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 examples/aishell/e_branchformer/conf/e_branchformer_12e_6d_2048_256.yaml

@@ -94,7 +94,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: RankFullLocalShuffleBatchSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 32 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 examples/aishell/paraformer/conf/paraformer_conformer_12e_6d_2048_256.yaml

@@ -93,7 +93,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: RankFullLocalShuffleBatchSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 32 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 examples/aishell/transformer/conf/transformer_12e_6d_2048_256.yaml

@@ -88,7 +88,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: RankFullLocalShuffleBatchSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 32 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 examples/industrial_data_pretraining/llm_asr/conf/template.yaml

@@ -73,7 +73,7 @@
dataset: AudioLLMDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: RankFullLocalShuffleBatchSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 8 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/bin/compute_audio_cmvn.py

@@ -28,7 +28,7 @@
def main(**kwargs):
    print(kwargs)
    # set random seed
-    tables.print()
+    # tables.print()
    set_all_random_seed(kwargs.get("seed", 0))
    torch.backends.cudnn.enabled = kwargs.get("cudnn_enabled", torch.backends.cudnn.enabled)
    torch.backends.cudnn.benchmark = kwargs.get("cudnn_benchmark", torch.backends.cudnn.benchmark)
@@ -54,21 +54,14 @@
    dataset_train = dataset_class(kwargs.get("train_data_set_list"), frontend=frontend, tokenizer=None, is_training=False, **kwargs.get("dataset_conf"))

    # dataloader
-    batch_sampler = kwargs["dataset_conf"].get("batch_sampler", "DynamicBatchLocalShuffleSampler")
-    batch_sampler_train = None
-    if batch_sampler is not None:
-        batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
-        dataset_conf = kwargs.get("dataset_conf")
-        dataset_conf["batch_type"] = "example"
-        dataset_conf["batch_size"] = 1
-        batch_sampler_train = batch_sampler_class(dataset_train, is_training=False, **dataset_conf)
+    batch_sampler = kwargs["dataset_conf"].get("batch_sampler", "BatchSampler")
+    batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
+    dataset_conf = kwargs.get("dataset_conf")
+    dataset_conf["batch_type"] = "example"
+    dataset_conf["batch_size"] = 1
+    batch_sampler_train = batch_sampler_class(dataset_train, is_training=False, **dataset_conf)

    
-    dataloader_train = torch.utils.data.DataLoader(dataset_train,
-                                                collate_fn=dataset_train.collator,
-                                                batch_sampler=batch_sampler_train,
-                                                num_workers=int(kwargs.get("dataset_conf").get("num_workers", 4)),
-                                                pin_memory=True)
+    dataloader_train = torch.utils.data.DataLoader(dataset_train, collate_fn=dataset_train.collator, **batch_sampler_train)
    
    iter_stop = int(kwargs.get("scale", 1.0)*len(dataloader_train))


 funasr/bin/train.py

@@ -148,7 +148,7 @@

    # dataset
    logging.info("Build dataloader")
-    dataloader_class = tables.dataloader_classes.get( kwargs["dataset_conf"].get("dataloader", "DataloaderMapStyle"))
+    dataloader_class = tables.dataloader_classes.get(kwargs["dataset_conf"].get("dataloader", "DataloaderMapStyle"))
    dataloader_tr, dataloader_val = dataloader_class(**kwargs)

    trainer = Trainer(local_rank=local_rank,

 funasr/datasets/audio_datasets/jsonl2scp.py

@@ -40,6 +40,7 @@
def main_hydra(cfg: DictConfig):
 
    kwargs = OmegaConf.to_container(cfg, resolve=True)
    print(kwargs)

    scp_file_list = kwargs.get("scp_file_list", ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"))
    if isinstance(scp_file_list, str):

 funasr/datasets/audio_datasets/scp2jsonl.py

@@ -79,6 +79,7 @@
def main_hydra(cfg: DictConfig):
 
    kwargs = OmegaConf.to_container(cfg, resolve=True)
    print(kwargs)

    scp_file_list = kwargs.get("scp_file_list", ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"))
    if isinstance(scp_file_list, str):

 funasr/datasets/dataloader_entry.py

@@ -13,7 +13,7 @@
    dataset_val = dataset_class(kwargs.get("valid_data_set_list"), frontend=frontend, tokenizer=tokenizer, is_training=False, **kwargs.get("dataset_conf"))
    
    # dataloader
-    batch_sampler = kwargs["dataset_conf"].get("batch_sampler", "DynamicBatchLocalShuffleSampler")
+    batch_sampler = kwargs["dataset_conf"].get("batch_sampler", "BatchSampler")
    batch_sampler_val = None
    if batch_sampler is not None:
        batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)

 funasr/models/bicif_paraformer/template.yaml

@@ -112,7 +112,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/branchformer/template.yaml

@@ -94,7 +94,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/conformer/template.yaml

@@ -95,7 +95,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/contextual_paraformer/template.yaml

@@ -108,7 +108,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/e_branchformer/template.yaml

@@ -94,7 +94,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/monotonic_aligner/template.yaml

@@ -93,7 +93,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/paraformer/template.yaml

@@ -100,7 +100,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/paraformer_streaming/template.yaml

@@ -121,7 +121,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/sanm/template.yaml

@@ -98,7 +98,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/scama/template.yaml

@@ -104,7 +104,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/seaco_paraformer/template.yaml

@@ -134,7 +134,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/transformer/template.yaml

@@ -88,7 +88,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

 funasr/models/uniasr/template.yaml

@@ -184,7 +184,7 @@
dataset: AudioDataset
dataset_conf:
    index_ds: IndexDSJsonl
-    batch_sampler: DynamicBatchLocalShuffleSampler
+    batch_sampler: BatchSampler
    batch_type: example # example or length
    batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -94,7 +94,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: RankFullLocalShuffleBatchSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 32 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -93,7 +93,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: RankFullLocalShuffleBatchSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 32 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -88,7 +88,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: RankFullLocalShuffleBatchSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 32 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -73,7 +73,7 @@
			dataset: AudioLLMDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: RankFullLocalShuffleBatchSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 8 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -28,7 +28,7 @@
			def main(**kwargs):
			print(kwargs)
			# set random seed
			tables.print()
			# tables.print()
			set_all_random_seed(kwargs.get("seed", 0))
			torch.backends.cudnn.enabled = kwargs.get("cudnn_enabled", torch.backends.cudnn.enabled)
			torch.backends.cudnn.benchmark = kwargs.get("cudnn_benchmark", torch.backends.cudnn.benchmark)
			@@ -54,21 +54,14 @@
			dataset_train = dataset_class(kwargs.get("train_data_set_list"), frontend=frontend, tokenizer=None, is_training=False, **kwargs.get("dataset_conf"))

			# dataloader
			batch_sampler = kwargs["dataset_conf"].get("batch_sampler", "DynamicBatchLocalShuffleSampler")
			batch_sampler_train = None
			if batch_sampler is not None:
			batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
			dataset_conf = kwargs.get("dataset_conf")
			dataset_conf["batch_type"] = "example"
			dataset_conf["batch_size"] = 1
			batch_sampler_train = batch_sampler_class(dataset_train, is_training=False, **dataset_conf)
			batch_sampler = kwargs["dataset_conf"].get("batch_sampler", "BatchSampler")
			batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)
			dataset_conf = kwargs.get("dataset_conf")
			dataset_conf["batch_type"] = "example"
			dataset_conf["batch_size"] = 1
			batch_sampler_train = batch_sampler_class(dataset_train, is_training=False, **dataset_conf)


			dataloader_train = torch.utils.data.DataLoader(dataset_train,
			collate_fn=dataset_train.collator,
			batch_sampler=batch_sampler_train,
			num_workers=int(kwargs.get("dataset_conf").get("num_workers", 4)),
			pin_memory=True)
			dataloader_train = torch.utils.data.DataLoader(dataset_train, collate_fn=dataset_train.collator, **batch_sampler_train)

			iter_stop = int(kwargs.get("scale", 1.0)*len(dataloader_train))

			@@ -148,7 +148,7 @@

			# dataset
			logging.info("Build dataloader")
			dataloader_class = tables.dataloader_classes.get( kwargs["dataset_conf"].get("dataloader", "DataloaderMapStyle"))
			dataloader_class = tables.dataloader_classes.get(kwargs["dataset_conf"].get("dataloader", "DataloaderMapStyle"))
			dataloader_tr, dataloader_val = dataloader_class(**kwargs)

			trainer = Trainer(local_rank=local_rank,

			@@ -40,6 +40,7 @@
			def main_hydra(cfg: DictConfig):

			kwargs = OmegaConf.to_container(cfg, resolve=True)
			print(kwargs)

			scp_file_list = kwargs.get("scp_file_list", ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"))
			if isinstance(scp_file_list, str):

			@@ -79,6 +79,7 @@
			def main_hydra(cfg: DictConfig):

			kwargs = OmegaConf.to_container(cfg, resolve=True)
			print(kwargs)

			scp_file_list = kwargs.get("scp_file_list", ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"))
			if isinstance(scp_file_list, str):

			@@ -13,7 +13,7 @@
			dataset_val = dataset_class(kwargs.get("valid_data_set_list"), frontend=frontend, tokenizer=tokenizer, is_training=False, **kwargs.get("dataset_conf"))

			# dataloader
			batch_sampler = kwargs["dataset_conf"].get("batch_sampler", "DynamicBatchLocalShuffleSampler")
			batch_sampler = kwargs["dataset_conf"].get("batch_sampler", "BatchSampler")
			batch_sampler_val = None
			if batch_sampler is not None:
			batch_sampler_class = tables.batch_sampler_classes.get(batch_sampler)

			@@ -112,7 +112,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: DynamicBatchLocalShuffleSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -95,7 +95,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: DynamicBatchLocalShuffleSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -108,7 +108,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: DynamicBatchLocalShuffleSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -100,7 +100,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: DynamicBatchLocalShuffleSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -121,7 +121,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: DynamicBatchLocalShuffleSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -98,7 +98,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: DynamicBatchLocalShuffleSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -104,7 +104,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: DynamicBatchLocalShuffleSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -134,7 +134,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: DynamicBatchLocalShuffleSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,

			@@ -184,7 +184,7 @@
			dataset: AudioDataset
			dataset_conf:
			index_ds: IndexDSJsonl
			batch_sampler: DynamicBatchLocalShuffleSampler
			batch_sampler: BatchSampler
			batch_type: example # example or length
			batch_size: 1 # if batch_type is example, batch_size is the numbers of samples; if length, batch_size is source_token_len+target_token_len;
			max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length,