From 167bab54bbcc0e2b0143e0c2fedce06ee8326ad5 Mon Sep 17 00:00:00 2001
From: 嘉渊 <wangjiaming.wjm@alibaba-inc.com>
Date: Fri, 26 May 2023 11:51:07 +0800
Subject: [PATCH] update repo: add text_nospace data type and use it in the wenetspeech conformer config
---
funasr/datasets/large_datasets/dataset.py | 6 ++++++
egs/wenetspeech/conformer/conf/train_asr_conformer.yaml | 2 +-
2 files changed, 7 insertions(+), 1 deletion(-)
diff --git a/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml b/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml
index 9842fa4..a9658b8 100644
--- a/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml
+++ b/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml
@@ -90,7 +90,7 @@
dataset_conf:
data_names: speech,text
- data_types: sound,text
+ data_types: sound,text_nospace
shuffle: True
shuffle_conf:
shuffle_size: 2048
diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index 5df61fd..68b63e1 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -148,6 +148,12 @@
if "key" not in sample_dict:
sample_dict["key"] = segs[0]
sample_dict['hw_tag'] = 1
+ elif data_type == "text_nospace":
+ text = item
+ segs = text.strip().split(maxsplit=1)
+ sample_dict[data_name] = [x for x in segs[1]]
+ if "key" not in sample_dict:
+ sample_dict["key"] = segs[0]
else:
text = item
segs = text.strip().split()
--
Gitblit v1.9.1