From cbbb3007436bf7cefa45af9a5aad60c696da9732 Mon Sep 17 00:00:00 2001
From: Lizerui9926 <110582652+Lizerui9926@users.noreply.github.com>
Date: Mon, 03 Apr 2023 20:01:28 +0800
Subject: [PATCH] Merge pull request #324 from alibaba-damo-academy/dev_lhn2

---
 funasr/datasets/large_datasets/dataset.py |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index 1942371..b0e1b8f 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -158,6 +158,7 @@
             dict,
             seg_dict,
             punc_dict,
+            bpe_tokenizer,
             conf,
             frontend_conf,
             mode="train",
@@ -173,7 +174,7 @@
     dataset = FilterIterDataPipe(dataset, fn=filter_fn)
 
     if "text" in data_names:
-        vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict}
+        vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict, 'bpe_tokenizer': bpe_tokenizer}
         tokenize_fn = partial(tokenize, **vocab)
         dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)
 

--
Gitblit v1.9.1