| | |
# Standard library
import io
import logging
from typing import Any, Dict, List, Union

# Third-party audio / tensor stack.
# NOTE(review): the original had `import librosa as sf`, which silently
# shadowed the soundfile alias `sf` bound on the previous line; every
# later `sf.read(...)` would then resolve to librosa (which has no
# `read`). librosa is kept importable under its own name instead.
import numpy as np
import soundfile as sf
import librosa
import torch
import torchaudio
| | |
| | | assert seg[0] >= audio[ |
| | | i - 1][1], 'modelscope error: Wrong time stamps.' |
| | | audio_dur += seg[1] - seg[0] |
| | | assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.' |
| | | return audio_dur |
| | | # assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.' |
| | | |
| | | |
def sv_preprocess(inputs: Union[np.ndarray, list]):
    """Decode string entries of *inputs* into mono float32 torch tensors.

    For each element that is a path/URL string, the raw bytes are read via
    ``File.read`` and decoded with soundfile; stereo audio is reduced to its
    first channel and the waveform is wrapped as a ``(1, num_samples)``
    torch tensor.

    NOTE(review): only the head of this function is visible here — the loop
    body continues below this span (presumably handling ndarray inputs and
    collecting results); confirm against the full file.
    """
    for i in range(len(inputs)):
        if isinstance(inputs[i], str):
            file_bytes = File.read(inputs[i])
            # Decode the in-memory bytes as float32 PCM. The original had a
            # second, conflicting `sf.load(...)` call duplicated on the next
            # line; soundfile has no `load` and librosa.load accepts no
            # `dtype` kwarg, so that duplicate raised at runtime and is
            # removed here.
            data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32')
            # Stereo (n, 2) -> keep the first channel only.
            if len(data.shape) == 2:
                data = data[:, 0]
            # Shape (num_samples,) -> (1, num_samples) batch-of-one tensor.
            data = torch.from_numpy(data).unsqueeze(0)