From 73613cefc97bd43699d10b8d162c69b2c4544ad5 Mon Sep 17 00:00:00 2001
From: 夜雨飘零 <yeyupiaoling@foxmail.com>
Date: 星期一, 04 十二月 2023 21:41:07 +0800
Subject: [PATCH] 增加分角色语音识别对ERes2Net模型的支持。

---
 funasr/bin/asr_inference_launch.py |   75 +++++++++++++++++++++++--------------
 1 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index e1a32c5..402a911 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -20,7 +20,8 @@
 import numpy as np
 import torch
 import torchaudio
-import soundfile
+# import librosa
+import librosa
 import yaml
 
 from funasr.bin.asr_infer import Speech2Text
@@ -47,13 +48,13 @@
 from funasr.utils.types import str2triple_str
 from funasr.utils.types import str_or_none
 from funasr.utils.vad_utils import slice_padding_fbank
-from funasr.utils.speaker_utils import (check_audio_list, 
-                                        sv_preprocess, 
-                                        sv_chunk, 
-                                        CAMPPlus, 
-                                        extract_feature, 
+from funasr.utils.speaker_utils import (check_audio_list,
+                                        sv_preprocess,
+                                        sv_chunk,
+                                        CAMPPlus,
+                                        extract_feature,
                                         postprocess,
-                                        distribute_spk)
+                                        distribute_spk, ERes2Net)
 from funasr.build_utils.build_model_from_file import build_model_from_file
 from funasr.utils.cluster_backend import ClusterBackend
 from funasr.utils.modelscope_utils import get_cache_dir
@@ -818,6 +819,10 @@
     )
 
     sv_model_file = asr_model_file.replace("model.pb", "campplus_cn_common.bin")
+    if not os.path.exists(sv_model_file):
+        sv_model_file = asr_model_file.replace("model.pb", "pretrained_eres2net_aug.ckpt")
+        if not os.path.exists(sv_model_file):
+            raise FileNotFoundError("sv_model_file not found: {}".format(sv_model_file))
 
     if param_dict is not None:
         hotword_list_or_file = param_dict.get('hotword')
@@ -943,8 +948,14 @@
             #####  speaker_verification  #####
             ##################################
             # load sv model
-            sv_model_dict = torch.load(sv_model_file, map_location=torch.device('cpu'))
-            sv_model = CAMPPlus()
+            sv_model_dict = torch.load(sv_model_file)
+            print(f'load sv model params: {sv_model_file}')
+            if os.path.basename(sv_model_file) == "campplus_cn_common.bin":
+                sv_model = CAMPPlus()
+            else:
+                sv_model = ERes2Net()
+            if ngpu > 0:
+                sv_model.cuda()
             sv_model.load_state_dict(sv_model_dict)
             sv_model.eval()
             cb_model = ClusterBackend()
@@ -955,24 +966,31 @@
                 ed = int(vadsegment[1]) / 1000
                 vad_segments.append(
                     [st, ed, audio[int(st * 16000):int(ed * 16000)]])
-            check_audio_list(vad_segments)
-            # sv pipeline
-            segments = sv_chunk(vad_segments)
-            embeddings = []
-            for s in segments:
-                #_, embs = self.sv_pipeline([s[2]], output_emb=True)
-                # embeddings.append(embs)
-                wavs = sv_preprocess([s[2]])
-                # embs = self.forward(wavs)
-                embs = []
-                for x in wavs:
-                    x = extract_feature([x])
-                    embs.append(sv_model(x))
-                embs = torch.cat(embs)
-                embeddings.append(embs.detach().numpy())
-            embeddings = np.concatenate(embeddings)
-            labels = cb_model(embeddings)
-            sv_output = postprocess(segments, vad_segments, labels, embeddings)
+            audio_dur = check_audio_list(vad_segments)
+            if audio_dur > 5:
+                # sv pipeline
+                segments = sv_chunk(vad_segments)
+                embeddings = []
+                for s in segments:
+                    #_, embs = self.sv_pipeline([s[2]], output_emb=True)
+                    # embeddings.append(embs)
+                    wavs = sv_preprocess([s[2]])
+                    # embs = self.forward(wavs)
+                    embs = []
+                    for x in wavs:
+                        x = extract_feature([x])
+                        if ngpu > 0:
+                            x = x.cuda()
+                        embs.append(sv_model(x))
+                    embs = torch.cat(embs)
+                    embeddings.append(embs.cpu().detach().numpy())
+                embeddings = np.concatenate(embeddings)
+                labels = cb_model(embeddings)
+                sv_output = postprocess(segments, vad_segments, labels, embeddings)
+            else:
+                # fake speaker result for an utterance that is too short
+                sv_output = [[0.0, vadsegments[-1][-1]/1000.0, 0]]
+                logging.warning("Too short utterence found: {}, return default speaker results.".format(keys))
 
             speech, speech_lengths = batch["speech"], batch["speech_lengths"]
 
@@ -1281,7 +1299,8 @@
             try:
                 raw_inputs = torchaudio.load(data_path_and_name_and_type[0])[0][0]
             except:
-                raw_inputs = soundfile.read(data_path_and_name_and_type[0], dtype='float32')[0]
+                # raw_inputs = librosa.load(data_path_and_name_and_type[0], dtype='float32')[0]
+                raw_inputs, sr = librosa.load(data_path_and_name_and_type[0], dtype='float32')
                 if raw_inputs.ndim == 2:
                     raw_inputs = raw_inputs[:, 0]
                 raw_inputs = torch.tensor(raw_inputs)

--
Gitblit v1.9.1