From 9ba0dbd98bf69c830dfcfde8f109a400cb65e4e5 Mon Sep 17 00:00:00 2001
From: 雾聪 <wucong.lyb@alibaba-inc.com>
Date: Fri, 29 Mar 2024 17:24:59 +0800
Subject: [PATCH] fix func Forward
---
funasr/datasets/audio_datasets/scp2jsonl.py | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/funasr/datasets/audio_datasets/scp2jsonl.py b/funasr/datasets/audio_datasets/scp2jsonl.py
index c60c6f5..e09a84a 100644
--- a/funasr/datasets/audio_datasets/scp2jsonl.py
+++ b/funasr/datasets/audio_datasets/scp2jsonl.py
@@ -19,7 +19,7 @@
world_size = 1
cpu_cores = os.cpu_count() or 1
-
+ print(f"convert wav.scp text to jsonl, ncpu: {cpu_cores}")
if rank == 0:
json_dict = {}
for data_type, data_file in zip(data_type_list, path):
@@ -65,29 +65,31 @@
sample_num = len(waveform)
context_len = int(sample_num//16000*1000/10)
else:
- context_len = len(line)
+ context_len = len(line.split()) if " " in line else len(line)
res[key] = {data_type: line, f"{data_type}_len": context_len}
return res
@hydra.main(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
- """
- python funasr/datasets/audio_datasets/scp2jsonl.py \
- ++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
- ++data_type_list='["source", "target"]' \
- ++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
-
- """
-
+ # Hydra entry point: read scp_file_list / data_type_list / jsonl_file_out from cfg (falling back to hard-coded local paths) and pass them to gen_jsonl_from_wav_text_list.
kwargs = OmegaConf.to_container(cfg, resolve=True)
scp_file_list = kwargs.get("scp_file_list", ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"))
+ if isinstance(scp_file_list, str):  # the override arrived as a single string rather than a list
+ scp_file_list = eval(scp_file_list)  # NOTE(review): eval() executes whatever expression the command line supplies; ast.literal_eval would parse a list literal without running code
data_type_list = kwargs.get("data_type_list", ("source", "target"))
jsonl_file_out = kwargs.get("jsonl_file_out", "/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl")
gen_jsonl_from_wav_text_list(scp_file_list, data_type_list=data_type_list, jsonl_file_out=jsonl_file_out)
+"""
+python -m funasr.datasets.audio_datasets.scp2jsonl \
+++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \
+++data_type_list='["source", "target"]' \
+++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl
+"""
+
if __name__ == "__main__":
main_hydra()
--
Gitblit v1.9.1