From 8a08405b668e06c4670b4c13f6793e193f21a21d Mon Sep 17 00:00:00 2001
From: Yabin Li <wucong.lyb@alibaba-inc.com>
Date: Mon, 08 May 2023 11:43:08 +0800
Subject: [PATCH] Merge branch 'main' into dev_apis

---
 funasr/datasets/large_datasets/utils/tokenize.py |    8 +++++++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index 0d2fd84..f0f0c66 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import re
 import numpy as np
+from funasr.datasets.large_datasets.utils.hotword_utils import sample_hotword
 
 def forward_segment(text, seg_dict):
     word_list = []
@@ -38,7 +39,8 @@
              vocab=None,
              seg_dict=None,
              punc_dict=None,
-             bpe_tokenizer=None):
+             bpe_tokenizer=None,
+             hw_config=None):
     assert "text" in data
     assert isinstance(vocab, dict)
     text = data["text"]
@@ -53,6 +55,10 @@
         text = seg_tokenize(text, seg_dict)
 
     length = len(text)
+    if 'hw_tag' in data:
+        hotword_indxs = sample_hotword(length, **hw_config)
+        data['hotword_indxs'] = hotword_indxs
+        del data['hw_tag']
     for i in range(length):
         x = text[i]
         if i == length-1 and "punc" in data and x.startswith("vad:"):

--
Gitblit v1.9.1