shixian.shi
2023-11-24 72fecc8e038070affbf223f8965871e8a1c8c001
update asr_spk inference for short utt
2个文件已修改
44 ■■■■■ 已修改文件
funasr/bin/asr_inference_launch.py 41 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/utils/speaker_utils.py 3 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_launch.py
@@ -956,24 +956,29 @@
                ed = int(vadsegment[1]) / 1000
                vad_segments.append(
                    [st, ed, audio[int(st * 16000):int(ed * 16000)]])
            check_audio_list(vad_segments)
            # sv pipeline
            segments = sv_chunk(vad_segments)
            embeddings = []
            for s in segments:
                #_, embs = self.sv_pipeline([s[2]], output_emb=True)
                # embeddings.append(embs)
                wavs = sv_preprocess([s[2]])
                # embs = self.forward(wavs)
                embs = []
                for x in wavs:
                    x = extract_feature([x])
                    embs.append(sv_model(x))
                embs = torch.cat(embs)
                embeddings.append(embs.detach().numpy())
            embeddings = np.concatenate(embeddings)
            labels = cb_model(embeddings)
            sv_output = postprocess(segments, vad_segments, labels, embeddings)
            audio_dur = check_audio_list(vad_segments)
            if audio_dur > 5:
                # sv pipeline
                segments = sv_chunk(vad_segments)
                embeddings = []
                for s in segments:
                    #_, embs = self.sv_pipeline([s[2]], output_emb=True)
                    # embeddings.append(embs)
                    wavs = sv_preprocess([s[2]])
                    # embs = self.forward(wavs)
                    embs = []
                    for x in wavs:
                        x = extract_feature([x])
                        embs.append(sv_model(x))
                    embs = torch.cat(embs)
                    embeddings.append(embs.detach().numpy())
                embeddings = np.concatenate(embeddings)
                labels = cb_model(embeddings)
                sv_output = postprocess(segments, vad_segments, labels, embeddings)
            else:
                # fake speaker result for too short utterance
                sv_output = [[0.0, vadsegments[-1][-1]/1000.0, 0]]
                logging.warning("Too short utterence found: {}, return default speaker results.".format(keys))
            speech, speech_lengths = batch["speech"], batch["speech_lengths"]
funasr/utils/speaker_utils.py
@@ -35,7 +35,8 @@
            assert seg[0] >= audio[
                i - 1][1], 'modelscope error: Wrong time stamps.'
        audio_dur += seg[1] - seg[0]
    assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.'
    return audio_dur
    # assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.'
def sv_preprocess(inputs: Union[np.ndarray, list]):