Update asr_spk inference for short utterances
| | |
| | | ed = int(vadsegment[1]) / 1000 |
| | | vad_segments.append( |
| | | [st, ed, audio[int(st * 16000):int(ed * 16000)]]) |
| | | check_audio_list(vad_segments) |
| | | audio_dur = check_audio_list(vad_segments) |
| | | if audio_dur > 5: |
| | | # sv pipeline |
| | | segments = sv_chunk(vad_segments) |
| | | embeddings = [] |
| | |
| | | embeddings = np.concatenate(embeddings) |
| | | labels = cb_model(embeddings) |
| | | sv_output = postprocess(segments, vad_segments, labels, embeddings) |
| | | else: |
| | | # fake speaker result for too short utterance |
| | | sv_output = [[0.0, vadsegments[-1][-1]/1000.0, 0]] |
| | | logging.warning("Too short utterence found: {}, return default speaker results.".format(keys)) |
| | | |
| | | speech, speech_lengths = batch["speech"], batch["speech_lengths"] |
| | | |
| | |
| | | assert seg[0] >= audio[ |
| | | i - 1][1], 'modelscope error: Wrong time stamps.' |
| | | audio_dur += seg[1] - seg[0] |
| | | assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.' |
| | | return audio_dur |
| | | # assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.' |
| | | |
| | | |
| | | def sv_preprocess(inputs: Union[np.ndarray, list]): |