From 4137f5cf26e7c4b40853959cd2574edfde03aa60 Mon Sep 17 00:00:00 2001
From: 志浩 <neo.dzh@alibaba-inc.com>
Date: Fri, 07 Apr 2023 21:03:34 +0800
Subject: [PATCH] Merge branch 'main' of github.com:alibaba-damo-academy/FunASR into dev_dzh
---
funasr/datasets/large_datasets/dataset.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index 1942371..b0e1b8f 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -158,6 +158,7 @@
dict,
seg_dict,
punc_dict,
+ bpe_tokenizer,
conf,
frontend_conf,
mode="train",
@@ -173,7 +174,7 @@
dataset = FilterIterDataPipe(dataset, fn=filter_fn)
if "text" in data_names:
- vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict}
+ vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict, 'bpe_tokenizer': bpe_tokenizer}
tokenize_fn = partial(tokenize, **vocab)
dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)
--
Gitblit v1.9.1