"""Build per-speaker x-vector embeddings and per-session speaker profiles.

Usage:
    python <this_script> <data_path>

Inputs read from ``<data_path>``:
    segments_nooverlap   lines of ``uttid sessionid stime etime``
    utt2spk_nooverlap    lines of ``uttid spkid``
    wav.scp              lines of ``sessionid wav_path``

Outputs written to ``<data_path>``:
    cluster_embedding/<spkid>.npy and cluster_embedding.scp
        mean speaker-verification embedding over each speaker's segments
    cluster_profile_zeropadding16/<sessionid>.npy and
    cluster_profile_zeropadding16.scp
        the session's de-duplicated speaker embeddings stacked row-wise and
        zero-padded up to 16 rows
"""
import codecs
import os
import sys

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import numpy as np
import soundfile

# Segment times are multiplied by this value to obtain sample offsets.
SAMPLE_RATE = 16000
# A segment is only used if it starts more than this long before the end of
# the waveform.
MIN_REMAINING_SEC = 0.5
# An embedding is dropped when a later embedding of the same session has a
# cosine similarity above this value.
DUPLICATE_COS_THRESHOLD = 0.99
# Profiles with fewer rows than this are zero-padded up to this many rows.
PROFILE_SLOTS = 16


def _read_utt2spk(path):
    """Return a dict mapping utterance id to speaker id read from *path*."""
    utt2spk = {}
    with codecs.open(path, "r", "utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            uttid, spkid = line.split()
            utt2spk[uttid] = spkid
    return utt2spk


def _read_segments(path, utt2spk):
    """Return a dict mapping speaker id to a list of (start, length) tuples.

    Both values are sample counts. Raises KeyError when a segment refers to
    an utterance that is missing from *utt2spk*.
    """
    spk2seg = {}
    with codecs.open(path, "r", "utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            uttid, _sessionid, stime, etime = line.split()
            start = int(float(stime) * SAMPLE_RATE)
            length = int(float(etime) * SAMPLE_RATE) - start
            spk2seg.setdefault(utt2spk[uttid], []).append((start, length))
    return spk2seg


def _read_wav_scp(path):
    """Return a dict mapping session id to wav path read from *path*."""
    wav_dict = {}
    with codecs.open(path, "r", "utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Split once so that a path containing spaces stays intact.
            sessionid, wav_path = line.split(None, 1)
            wav_dict[sessionid] = wav_path
    return wav_dict


def _load_first_channel(wav_path):
    """Read *wav_path* and return its samples, keeping only channel 0."""
    wav = soundfile.read(wav_path)[0]
    if wav.ndim == 2:
        wav = wav[:, 0]
    return wav


def _speaker_embedding(sv_pipeline, wav, segs):
    """Return the mean embedding over the usable segments, or None.

    Segments that start within the last MIN_REMAINING_SEC of *wav* are
    ignored. None is returned when no segment is usable.
    """
    latest_start = wav.shape[0] - MIN_REMAINING_SEC * SAMPLE_RATE
    # Slicing clamps at the end of the array, so a segment that runs past
    # the end of the recording is simply truncated.
    embeddings = [
        sv_pipeline(audio_in=wav[start:start + length])["spk_embedding"]
        for start, length in segs
        if start < latest_start
    ]
    if not embeddings:
        return None
    return np.mean(np.vstack(embeddings), axis=0)


def _extract_cluster_embeddings(data_path, spk2seg, sv_pipeline,
                                cluster_emb_dir):
    """Save one embedding per speaker and list them in cluster_embedding.scp.

    The session id of a speaker is the part of the speaker id before the
    first "-"; it selects the recording through wav.scp.
    """
    wav_dict = _read_wav_scp(data_path + "/wav.scp")
    cached_session = None
    wav = None
    with codecs.open(data_path + "/cluster_embedding.scp", "w", "utf-8") as fo:
        for spkid, segs in spk2seg.items():
            sessionid = spkid.split("-")[0]
            # Re-read the audio only when the session changes.
            if sessionid != cached_session:
                wav = _load_first_channel(wav_dict[sessionid])
                cached_session = sessionid
            spk_embedding = _speaker_embedding(sv_pipeline, wav, segs)
            if spk_embedding is None:
                sys.stderr.write(
                    "Warning: no usable segment for speaker %s, skipping\n"
                    % spkid
                )
                continue
            emb_path = cluster_emb_dir + spkid + ".npy"
            np.save(emb_path, spk_embedding)
            fo.write(spkid + " " + emb_path + "\n")


def _cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b*."""
    return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))


def _drop_near_duplicates(embs):
    """Return *embs* without entries that nearly duplicate a later entry.

    An embedding is kept only if no embedding after it in the list has a
    cosine similarity above DUPLICATE_COS_THRESHOLD, so the last embedding
    is always kept.
    """
    kept = []
    for i, emb in enumerate(embs):
        later = embs[i + 1:]
        if not any(
            _cosine_similarity(emb, other) > DUPLICATE_COS_THRESHOLD
            for other in later
        ):
            kept.append(emb)
    return kept


def _build_session_profiles(data_path, cluster_profile_dir):
    """Stack each session's embeddings into a zero-padded profile matrix.

    Reads cluster_embedding.scp, groups the embeddings by session id, drops
    near-duplicates and pads with zero rows up to PROFILE_SLOTS rows. A
    session with more than PROFILE_SLOTS embeddings is saved unpadded with
    all of its rows.
    """
    session2embs = {}
    with codecs.open(data_path + "/cluster_embedding.scp", "r", "utf-8") as fi:
        for line in fi:
            line = line.strip()
            if not line:
                continue
            # The path is everything after the first space.
            spkid, emb_path = line.split(" ", 1)
            session2embs.setdefault(spkid.split("-")[0], []).append(emb_path)

    scp_path = data_path + "/cluster_profile_zeropadding16.scp"
    with codecs.open(scp_path, "w", "utf-8") as fo:
        for sessionid, emb_paths in session2embs.items():
            kept = _drop_near_duplicates([np.load(p) for p in emb_paths])
            rows = [emb[np.newaxis, :] for emb in kept]
            # Pad with rows as wide as the embeddings themselves.
            emb_dim = rows[0].shape[-1]
            rows.extend(
                np.zeros((1, emb_dim))
                for _ in range(PROFILE_SLOTS - len(rows))
            )
            save_path = cluster_profile_dir + sessionid + ".npy"
            np.save(save_path, np.concatenate(rows, axis=0))
            fo.write("%s %s\n" % (sessionid, save_path))


def main(data_path):
    """Run embedding extraction and profile building for *data_path*."""
    cluster_emb_dir = data_path + "/cluster_embedding/"
    cluster_profile_dir = data_path + "/cluster_profile_zeropadding16/"
    os.makedirs(cluster_emb_dir, exist_ok=True)
    os.makedirs(cluster_profile_dir, exist_ok=True)

    utt2spk = _read_utt2spk(data_path + "/utt2spk_nooverlap")
    spk2seg = _read_segments(data_path + "/segments_nooverlap", utt2spk)

    sv_pipeline = pipeline(
        task=Tasks.speaker_verification,
        model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch',
        device='gpu'
    )

    _extract_cluster_embeddings(
        data_path, spk2seg, sv_pipeline, cluster_emb_dir
    )
    _build_session_profiles(data_path, cluster_profile_dir)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("Usage: python %s <data_path>" % sys.argv[0])
    main(sys.argv[1])