# Based on the post-patch version of
# funasr/datasets/large_datasets/utils/hotword_utils.py from commit 1596f6f
# ("fixbug hotwords"), with the two-hotword sampling ranges made safe.
import random


def sample_hotword(
    length,
    hotword_min_length,
    hotword_max_length,
    sample_rate,
    double_rate,
    pre_prob,
    pre_index=None,
    pre_hwlist=None,
):
    """Randomly pick one or two hotword spans inside a sequence.

    All returned indices are inclusive positions in ``range(length)``.

    Args:
        length: Number of items in the sequence to sample from.
        hotword_min_length: Minimum number of items in a sampled span.
        hotword_max_length: Maximum number of items in a sampled span.
        sample_rate: Probability of sampling any hotword at all.
        double_rate: Probability of trying to sample two spans instead of
            one; two spans are only sampled when ``length`` is greater than
            ``hotword_max_length + hotword_min_length + 2``.
        pre_prob: Probability of returning ``pre_index`` unchanged (only
            when ``pre_index`` is not ``None``).
        pre_index: Previously chosen index list that may be reused as-is.
        pre_hwlist: Unused; kept so existing callers keep working.

    Returns:
        ``[-1]`` when nothing is sampled, ``pre_index`` when it is reused,
        ``[start, end]`` for a single span, or
        ``[start1, end1, start2, end2]`` for two non-overlapping spans with
        ``end1 < start2``.

    Raises:
        ValueError: May be raised by ``random.randint`` when
            ``hotword_max_length < hotword_min_length``.
    """
    if length < hotword_min_length:
        return [-1]
    if random.random() >= sample_rate:
        return [-1]

    if pre_prob > 0 and random.random() < pre_prob and pre_index is not None:
        return pre_index

    last = length - 1
    if length == hotword_min_length:
        # The only span that fits is the whole sequence.
        return [0, last]

    if random.random() < double_rate and length > hotword_max_length + hotword_min_length + 2:
        # Sample two hotwords in a sentence.
        _max_hw_length = min(hotword_max_length, length // 2)

        # First hotword. Both bounds are capped so that at least
        # ``hotword_min_length`` positions remain after ``end1``; without the
        # caps the range for ``start2`` below can be empty and
        # ``random.randint`` raises ValueError.
        start1 = random.randint(0, min(length // 3, length - 2 * hotword_min_length))
        end1 = random.randint(
            start1 + hotword_min_length - 1,
            min(start1 + _max_hw_length - 1, last - hotword_min_length),
        )

        # Second hotword, strictly after the first one.
        start2 = random.randint(end1 + 1, length - hotword_min_length)
        end2 = random.randint(
            min(last, start2 + hotword_min_length - 1),
            min(last, start2 + hotword_max_length - 1),
        )
        return [start1, end1, start2, end2]

    # Single hotword.
    start = random.randint(0, length - hotword_min_length)
    end = random.randint(
        min(last, start + hotword_min_length - 1),
        min(last, start + hotword_max_length - 1),
    )
    return [start, end]