From 4bbc661aa58798dbb9df7b7b548704bc5c619590 Mon Sep 17 00:00:00 2001
From: shixian.shi <shixian.shi@alibaba-inc.com>
Date: Thu, 04 May 2023 17:24:15 +0800
Subject: [PATCH] Add pre_prob to hotword config; drop hw_tag after hotword sampling

---
 funasr/datasets/large_datasets/dataset.py        |    3 ++-
 funasr/datasets/large_datasets/utils/tokenize.py |    1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index db770f5..53994cb 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -181,7 +181,8 @@
     hw_config = {"sample_rate": conf.get("sample_rate", 0.6),
                  "double_rate": conf.get("double_rate", 0.1),
                  "hotword_min_length": conf.get("hotword_min_length", 2),
-                 "hotword_max_length": conf.get("hotword_max_length", 8)}
+                 "hotword_max_length": conf.get("hotword_max_length", 8),
+                 "pre_prob": conf.get("pre_prob", 0.0)}
 
     if pre_hwfile is not None:
         pre_hwlist = []
diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index 09ece76..f8833b1 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -58,6 +58,7 @@
     if 'hw_tag' in data:
         hotword_indxs = sample_hotword(length, **hw_config)
         data[hotword_indxs] = hotword_indxs
+        del data['hw_tag']
     for i in range(length):
         x = text[i]
         if i == length-1 and "punc" in data and x.startswith("vad:"):

--
Gitblit v1.9.1