From 72fecc8e038070affbf223f8965871e8a1c8c001 Mon Sep 17 00:00:00 2001
From: shixian.shi <shixian.shi@alibaba-inc.com>
Date: Fri, 24 Nov 2023 14:29:33 +0800
Subject: [PATCH] update asr_spk inference for short utt
---
funasr/bin/asr_inference_launch.py | 41 +++++++++++++++++++++++------------------
1 files changed, 23 insertions(+), 18 deletions(-)
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 7dd27fc..f61c085 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -956,24 +956,29 @@
ed = int(vadsegment[1]) / 1000
vad_segments.append(
[st, ed, audio[int(st * 16000):int(ed * 16000)]])
- check_audio_list(vad_segments)
- # sv pipeline
- segments = sv_chunk(vad_segments)
- embeddings = []
- for s in segments:
- #_, embs = self.sv_pipeline([s[2]], output_emb=True)
- # embeddings.append(embs)
- wavs = sv_preprocess([s[2]])
- # embs = self.forward(wavs)
- embs = []
- for x in wavs:
- x = extract_feature([x])
- embs.append(sv_model(x))
- embs = torch.cat(embs)
- embeddings.append(embs.detach().numpy())
- embeddings = np.concatenate(embeddings)
- labels = cb_model(embeddings)
- sv_output = postprocess(segments, vad_segments, labels, embeddings)
+ audio_dur = check_audio_list(vad_segments)
+ if audio_dur > 5:
+ # sv pipeline
+ segments = sv_chunk(vad_segments)
+ embeddings = []
+ for s in segments:
+ #_, embs = self.sv_pipeline([s[2]], output_emb=True)
+ # embeddings.append(embs)
+ wavs = sv_preprocess([s[2]])
+ # embs = self.forward(wavs)
+ embs = []
+ for x in wavs:
+ x = extract_feature([x])
+ embs.append(sv_model(x))
+ embs = torch.cat(embs)
+ embeddings.append(embs.detach().numpy())
+ embeddings = np.concatenate(embeddings)
+ labels = cb_model(embeddings)
+ sv_output = postprocess(segments, vad_segments, labels, embeddings)
+ else:
+ # fake speaker result for a too-short utterance
+ sv_output = [[0.0, vadsegments[-1][-1]/1000.0, 0]]
+ logging.warning("Too short utterence found: {}, return default speaker results.".format(keys))
speech, speech_lengths = batch["speech"], batch["speech_lengths"]
--
Gitblit v1.9.1