#!/usr/bin/env python
# Reconstructed post-patch contents of
# funasr/datasets/large_datasets/utils/filter.py
# (commit 1596f6f414f6f41da66506debb1dff19fffeb3ec, subject "fixbug hotwords").


def _speech_length(data):
    """Return the length of ``data["speech"]`` used for filtering.

    The length is the size of the first axis of ``data["speech"]``. When the
    sample also carries ``"sampling_rate"``, that size is divided by the rate
    and scaled by 1000 (presumably milliseconds, assuming the rate is given
    in Hz -- confirm against the data loader).
    """
    num_samples = data["speech"].shape[0]
    if "sampling_rate" in data:
        return (num_samples / data["sampling_rate"]) * 1000.0
    return num_samples


def filter(
    data, speech_length_min=100, speech_length_max=15000, token_length_min=0, token_length_max=200
):
    """Decide whether a sample's speech and/or text length is within bounds.

    Args:
        data: Mapping holding ``"speech"`` (an object exposing ``.shape``),
            ``"text"`` (anything supporting ``len``), or both. An optional
            ``"sampling_rate"`` entry switches the speech length from a raw
            first-axis size to a duration (see ``_speech_length``).
        speech_length_min: Exclusive lower bound on the speech length.
        speech_length_max: Exclusive upper bound on the speech length.
        token_length_min: Exclusive lower bound on ``len(data["text"])``.
        token_length_max: Exclusive upper bound on ``len(data["text"])``.

    Returns:
        True when every modality present in ``data`` lies strictly inside its
        bounds, False otherwise.

    Raises:
        AssertionError: If ``data`` contains neither ``"speech"`` nor ``"text"``.
    """
    has_speech = "speech" in data
    has_text = "text" in data

    # Raised explicitly instead of via an ``assert`` statement so the check
    # still runs under ``python -O``; the exception type is kept for callers
    # that already expect AssertionError.
    if not (has_speech or has_text):
        raise AssertionError('data must contain "speech" or "text"')

    # Each modality that is present must pass its own bounds check; a missing
    # modality imposes no constraint.
    if has_speech and not (speech_length_min < _speech_length(data) < speech_length_max):
        return False
    if has_text and not (token_length_min < len(data["text"]) < token_length_max):
        return False
    return True