#!/usr/bin/env python


def _speech_length(data):
    """Return the length of ``data["speech"]`` used for range filtering.

    When ``data`` carries a ``"sampling_rate"``, the first dimension of the
    speech array is converted to milliseconds (``samples / rate * 1000``).
    Otherwise the raw size of the first dimension is returned unchanged.
    """
    num_samples = data["speech"].shape[0]
    if "sampling_rate" in data:
        return (num_samples / data["sampling_rate"]) * 1000.0
    return num_samples


def filter(
    data, speech_length_min=100, speech_length_max=15000, token_length_min=0, token_length_max=200
):
    """Decide whether a sample falls inside the allowed length ranges.

    NOTE: the name shadows the ``filter`` builtin inside this module; it is
    kept because it is this module's public entry point.

    Args:
        data: Mapping that contains ``"speech"`` (an object exposing
            ``.shape``), ``"text"`` (anything supporting ``len``), or both.
            An optional ``"sampling_rate"`` switches the speech length from
            the raw first-dimension size to milliseconds.
        speech_length_min: Exclusive lower bound on the speech length.
        speech_length_max: Exclusive upper bound on the speech length.
        token_length_min: Exclusive lower bound on ``len(data["text"])``.
            With the default of 0, an empty text is rejected.
        token_length_max: Exclusive upper bound on ``len(data["text"])``.

    Returns:
        True when every length that is present lies strictly between its
        bounds, False otherwise.

    Raises:
        AssertionError: If ``data`` has neither ``"speech"`` nor ``"text"``.
    """
    has_speech = "speech" in data
    has_text = "text" in data

    # Raised explicitly instead of using an ``assert`` statement so that the
    # check is not stripped under ``python -O``; the exception type is kept
    # for backward compatibility with existing callers.
    if not (has_speech or has_text):
        raise AssertionError('data must contain "speech" or "text"')

    if has_speech and has_text:
        speech_length = _speech_length(data)
        num_tokens = len(data["text"])
        return (
            speech_length_min < speech_length < speech_length_max
            and token_length_min < num_tokens < token_length_max
        )

    if has_speech:
        return speech_length_min < _speech_length(data) < speech_length_max

    return token_length_min < len(data["text"]) < token_length_max