From 72fecc8e038070affbf223f8965871e8a1c8c001 Mon Sep 17 00:00:00 2001
From: shixian.shi <shixian.shi@alibaba-inc.com>
Date: Fri, 24 Nov 2023 14:29:33 +0800
Subject: [PATCH] update asr_spk inference for short utt

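check_audio_list() previously asserted that the effective audio duration exceeds 5 seconds,
which made asr_spk inference fail on short utterances. It now returns the effective duration
instead, and asr_inference_launch.py runs the speaker-verification pipeline only when that
duration is above 5 seconds; otherwise a default single-speaker result spanning the whole
utterance is returned and a warning is logged.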
---
 funasr/utils/speaker_utils.py      |    3 ++-
 funasr/bin/asr_inference_launch.py |   41 +++++++++++++++++++++++------------------
 2 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 7dd27fc..f61c085 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -956,24 +956,29 @@
                 ed = int(vadsegment[1]) / 1000
                 vad_segments.append(
                     [st, ed, audio[int(st * 16000):int(ed * 16000)]])
-            check_audio_list(vad_segments)
-            # sv pipeline
-            segments = sv_chunk(vad_segments)
-            embeddings = []
-            for s in segments:
-                #_, embs = self.sv_pipeline([s[2]], output_emb=True)
-                # embeddings.append(embs)
-                wavs = sv_preprocess([s[2]])
-                # embs = self.forward(wavs)
-                embs = []
-                for x in wavs:
-                    x = extract_feature([x])
-                    embs.append(sv_model(x))
-                embs = torch.cat(embs)
-                embeddings.append(embs.detach().numpy())
-            embeddings = np.concatenate(embeddings)
-            labels = cb_model(embeddings)
-            sv_output = postprocess(segments, vad_segments, labels, embeddings)
+            audio_dur = check_audio_list(vad_segments)
+            if audio_dur > 5:
+                # sv pipeline
+                segments = sv_chunk(vad_segments)
+                embeddings = []
+                for s in segments:
+                    #_, embs = self.sv_pipeline([s[2]], output_emb=True)
+                    # embeddings.append(embs)
+                    wavs = sv_preprocess([s[2]])
+                    # embs = self.forward(wavs)
+                    embs = []
+                    for x in wavs:
+                        x = extract_feature([x])
+                        embs.append(sv_model(x))
+                    embs = torch.cat(embs)
+                    embeddings.append(embs.detach().numpy())
+                embeddings = np.concatenate(embeddings)
+                labels = cb_model(embeddings)
+                sv_output = postprocess(segments, vad_segments, labels, embeddings)
+            else:
+                # fake speaker result for a too-short utterance
+                sv_output = [[0.0, vadsegments[-1][-1] / 1000.0, 0]]
+                logging.warning("Too short utterance found: {}, returning default speaker results.".format(keys))
 
             speech, speech_lengths = batch["speech"], batch["speech_lengths"]
 
diff --git a/funasr/utils/speaker_utils.py b/funasr/utils/speaker_utils.py
index 38ef11c..edaf58b 100644
--- a/funasr/utils/speaker_utils.py
+++ b/funasr/utils/speaker_utils.py
@@ -35,7 +35,8 @@
             assert seg[0] >= audio[
                 i - 1][1], 'modelscope error: Wrong time stamps.'
         audio_dur += seg[1] - seg[0]
-    assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.'
+    # assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.'
+    return audio_dur
 
 
 def sv_preprocess(inputs: Union[np.ndarray, list]):

--
Gitblit v1.9.1