#!/usr/bin/env python


def filter(
    data,
    speech_length_min=100,
    speech_length_max=15000,
    token_length_min=0,
    token_length_max=200,
):
    """Decide whether a sample's speech and/or text lengths are within bounds.

    Args:
        data: Mapping holding a "speech" entry, a "text" entry, or both, and
            optionally a "sampling_rate" entry. The "speech" value must expose
            ``.shape[0]``; the "text" value must support ``len()``.
        speech_length_min: Exclusive lower bound on the speech length.
        speech_length_max: Exclusive upper bound on the speech length.
        token_length_min: Exclusive lower bound on ``len(data["text"])``.
        token_length_max: Exclusive upper bound on ``len(data["text"])``.

    Returns:
        True when every field that is present lies strictly inside its
        bounds, False otherwise.

    Raises:
        AssertionError: If ``data`` has neither "speech" nor "text".
    """
    has_speech = "speech" in data
    has_text = "text" in data
    # Raised explicitly rather than via ``assert`` so the check still runs
    # under ``python -O`` while callers keep seeing the same exception type.
    if not (has_speech or has_text):
        raise AssertionError('data must contain "speech" or "text"')

    if has_speech:
        speech_length = data["speech"].shape[0]
        if "sampling_rate" in data:
            # Size of the first dimension divided by the sampling rate,
            # times 1000 (milliseconds if sampling_rate is in Hz -- confirm
            # against the caller). Without a rate the raw size is compared.
            speech_length = (speech_length / data["sampling_rate"]) * 1000.0
        if not speech_length_min < speech_length < speech_length_max:
            return False

    if has_text:
        num_tokens = len(data["text"])
        if not token_length_min < num_tokens < token_length_max:
            return False

    return True