From fcaf910ceb4e07a13bf2d133f46df684b069b3f0 Mon Sep 17 00:00:00 2001
From: 雾聪 <wucong.lyb@alibaba-inc.com>
Date: Wed, 3 Jan 2024 11:45:22 +0800
Subject: [PATCH] update online docs
---
funasr/bin/asr_inference_launch.py | 73 +++++++++++++++++++++++-------------
1 file changed, 47 insertions(+), 26 deletions(-)
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 7dd27fc..f34bfb2 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -48,13 +48,13 @@
from funasr.utils.types import str2triple_str
from funasr.utils.types import str_or_none
from funasr.utils.vad_utils import slice_padding_fbank
-from funasr.utils.speaker_utils import (check_audio_list,
- sv_preprocess,
- sv_chunk,
- CAMPPlus,
- extract_feature,
+from funasr.utils.speaker_utils import (check_audio_list,
+ sv_preprocess,
+ sv_chunk,
+ extract_feature,
postprocess,
distribute_spk)
+import funasr.modules.cnn as sv_module
from funasr.build_utils.build_model_from_file import build_model_from_file
from funasr.utils.cluster_backend import ClusterBackend
from funasr.utils.modelscope_utils import get_cache_dir
@@ -818,7 +818,15 @@
format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
)
- sv_model_file = asr_model_file.replace("model.pb", "campplus_cn_common.bin")
+ sv_model_config_path = asr_model_file.replace("model.pb", "sv_model_config.yaml")
+ if not os.path.exists(sv_model_config_path):
+ sv_model_config = {'sv_model_class': 'CAMPPlus','sv_model_file': 'campplus_cn_common.bin', 'models_config': {}}
+ else:
+ with open(sv_model_config_path, 'r') as f:
+ sv_model_config = yaml.load(f, Loader=yaml.FullLoader)
+ if sv_model_config['models_config'] is None:
+ sv_model_config['models_config'] = {}
+ sv_model_file = asr_model_file.replace("model.pb", sv_model_config['sv_model_file'])
if param_dict is not None:
hotword_list_or_file = param_dict.get('hotword')
@@ -944,9 +952,15 @@
##### speaker_verification #####
##################################
# load sv model
- sv_model_dict = torch.load(sv_model_file, map_location=torch.device('cpu'))
- sv_model = CAMPPlus()
+ if ngpu > 0:
+ sv_model_dict = torch.load(sv_model_file)
+ sv_model = getattr(sv_module, sv_model_config['sv_model_class'])(**sv_model_config['models_config'])
+ sv_model.cuda()
+ else:
+ sv_model_dict = torch.load(sv_model_file, map_location=torch.device('cpu'))
+ sv_model = getattr(sv_module, sv_model_config['sv_model_class'])(**sv_model_config['models_config'])
sv_model.load_state_dict(sv_model_dict)
+ print(f'load sv model params: {sv_model_file}')
sv_model.eval()
cb_model = ClusterBackend()
vad_segments = []
@@ -956,24 +970,31 @@
ed = int(vadsegment[1]) / 1000
vad_segments.append(
[st, ed, audio[int(st * 16000):int(ed * 16000)]])
- check_audio_list(vad_segments)
- # sv pipeline
- segments = sv_chunk(vad_segments)
- embeddings = []
- for s in segments:
- #_, embs = self.sv_pipeline([s[2]], output_emb=True)
- # embeddings.append(embs)
- wavs = sv_preprocess([s[2]])
- # embs = self.forward(wavs)
- embs = []
- for x in wavs:
- x = extract_feature([x])
- embs.append(sv_model(x))
- embs = torch.cat(embs)
- embeddings.append(embs.detach().numpy())
- embeddings = np.concatenate(embeddings)
- labels = cb_model(embeddings)
- sv_output = postprocess(segments, vad_segments, labels, embeddings)
+ audio_dur = check_audio_list(vad_segments)
+ if audio_dur > 5:
+ # sv pipeline
+ segments = sv_chunk(vad_segments)
+ embeddings = []
+ for s in segments:
+ #_, embs = self.sv_pipeline([s[2]], output_emb=True)
+ # embeddings.append(embs)
+ wavs = sv_preprocess([s[2]])
+ # embs = self.forward(wavs)
+ embs = []
+ for x in wavs:
+ x = extract_feature([x])
+ if ngpu > 0:
+ x = x.cuda()
+ embs.append(sv_model(x))
+ embs = torch.cat(embs)
+ embeddings.append(embs.cpu().detach().numpy())
+ embeddings = np.concatenate(embeddings)
+ labels = cb_model(embeddings)
+ sv_output = postprocess(segments, vad_segments, labels, embeddings)
+ else:
+                # fake speaker result for a too-short utterance
+ sv_output = [[0.0, vadsegments[-1][-1]/1000.0, 0]]
+ logging.warning("Too short utterence found: {}, return default speaker results.".format(keys))
speech, speech_lengths = batch["speech"], batch["speech_lengths"]
--
Gitblit v1.9.1