shixian.shi
2023-11-24 72fecc8e038070affbf223f8965871e8a1c8c001
funasr/bin/asr_inference_launch.py
@@ -956,7 +956,8 @@
                ed = int(vadsegment[1]) / 1000
                vad_segments.append(
                    [st, ed, audio[int(st * 16000):int(ed * 16000)]])
            check_audio_list(vad_segments)
            audio_dur = check_audio_list(vad_segments)
            if audio_dur > 5:
            # sv pipeline
            segments = sv_chunk(vad_segments)
            embeddings = []
@@ -974,6 +975,10 @@
            embeddings = np.concatenate(embeddings)
            labels = cb_model(embeddings)
            sv_output = postprocess(segments, vad_segments, labels, embeddings)
            else:
                # fake speaker result for a too-short utterance
                sv_output = [[0.0, vadsegments[-1][-1]/1000.0, 0]]
                logging.warning("Too short utterence found: {}, return default speaker results.".format(keys))
            speech, speech_lengths = batch["speech"], batch["speech_lengths"]