From 73613cefc97bd43699d10b8d162c69b2c4544ad5 Mon Sep 17 00:00:00 2001
From: 夜雨飘零 <yeyupiaoling@foxmail.com>
Date: Mon, 04 Dec 2023 21:41:07 +0800
Subject: [PATCH] 增加分角色语音识别对ERes2Net模型的支持。
---
funasr/bin/asr_inference_launch.py | 30 +++++++++++++++++++++---------
1 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index f61c085..402a911 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -48,13 +48,13 @@
from funasr.utils.types import str2triple_str
from funasr.utils.types import str_or_none
from funasr.utils.vad_utils import slice_padding_fbank
-from funasr.utils.speaker_utils import (check_audio_list,
- sv_preprocess,
- sv_chunk,
- CAMPPlus,
- extract_feature,
+from funasr.utils.speaker_utils import (check_audio_list,
+ sv_preprocess,
+ sv_chunk,
+ CAMPPlus,
+ extract_feature,
postprocess,
- distribute_spk)
+ distribute_spk, ERes2Net)
from funasr.build_utils.build_model_from_file import build_model_from_file
from funasr.utils.cluster_backend import ClusterBackend
from funasr.utils.modelscope_utils import get_cache_dir
@@ -819,6 +819,10 @@
)
sv_model_file = asr_model_file.replace("model.pb", "campplus_cn_common.bin")
+ if not os.path.exists(sv_model_file):
+ sv_model_file = asr_model_file.replace("model.pb", "pretrained_eres2net_aug.ckpt")
+ if not os.path.exists(sv_model_file):
+ raise FileNotFoundError("sv_model_file not found: {}".format(sv_model_file))
if param_dict is not None:
hotword_list_or_file = param_dict.get('hotword')
@@ -944,8 +948,14 @@
##### speaker_verification #####
##################################
# load sv model
- sv_model_dict = torch.load(sv_model_file, map_location=torch.device('cpu'))
- sv_model = CAMPPlus()
+ sv_model_dict = torch.load(sv_model_file)
+ print(f'load sv model params: {sv_model_file}')
+ if os.path.basename(sv_model_file) == "campplus_cn_common.bin":
+ sv_model = CAMPPlus()
+ else:
+ sv_model = ERes2Net()
+ if ngpu > 0:
+ sv_model.cuda()
sv_model.load_state_dict(sv_model_dict)
sv_model.eval()
cb_model = ClusterBackend()
@@ -969,9 +979,11 @@
embs = []
for x in wavs:
x = extract_feature([x])
+ if ngpu > 0:
+ x = x.cuda()
embs.append(sv_model(x))
embs = torch.cat(embs)
- embeddings.append(embs.detach().numpy())
+ embeddings.append(embs.cpu().detach().numpy())
embeddings = np.concatenate(embeddings)
labels = cb_model(embeddings)
sv_output = postprocess(segments, vad_segments, labels, embeddings)
--
Gitblit v1.9.1