| | |
| | | import numpy as np |
| | | import torch |
| | | import torchaudio |
| | | import soundfile |
| | | # import librosa |
| | | import librosa |
| | | import yaml |
| | | |
| | | from funasr.bin.asr_infer import Speech2Text |
| | |
| | | ed = int(vadsegment[1]) / 1000 |
| | | vad_segments.append( |
| | | [st, ed, audio[int(st * 16000):int(ed * 16000)]]) |
| | | check_audio_list(vad_segments) |
| | | # sv pipeline |
| | | segments = sv_chunk(vad_segments) |
| | | embeddings = [] |
| | | for s in segments: |
| | | #_, embs = self.sv_pipeline([s[2]], output_emb=True) |
| | | # embeddings.append(embs) |
| | | wavs = sv_preprocess([s[2]]) |
| | | # embs = self.forward(wavs) |
| | | embs = [] |
| | | for x in wavs: |
| | | x = extract_feature([x]) |
| | | embs.append(sv_model(x)) |
| | | embs = torch.cat(embs) |
| | | embeddings.append(embs.detach().numpy()) |
| | | embeddings = np.concatenate(embeddings) |
| | | labels = cb_model(embeddings) |
| | | sv_output = postprocess(segments, vad_segments, labels, embeddings) |
| | | audio_dur = check_audio_list(vad_segments) |
| | | if audio_dur > 5: |
| | | # sv pipeline |
| | | segments = sv_chunk(vad_segments) |
| | | embeddings = [] |
| | | for s in segments: |
| | | #_, embs = self.sv_pipeline([s[2]], output_emb=True) |
| | | # embeddings.append(embs) |
| | | wavs = sv_preprocess([s[2]]) |
| | | # embs = self.forward(wavs) |
| | | embs = [] |
| | | for x in wavs: |
| | | x = extract_feature([x]) |
| | | embs.append(sv_model(x)) |
| | | embs = torch.cat(embs) |
| | | embeddings.append(embs.detach().numpy()) |
| | | embeddings = np.concatenate(embeddings) |
| | | labels = cb_model(embeddings) |
| | | sv_output = postprocess(segments, vad_segments, labels, embeddings) |
| | | else: |
| | | # fake speaker result for a too-short utterance |
| | | sv_output = [[0.0, vadsegments[-1][-1]/1000.0, 0]] |
| | | logging.warning("Too short utterence found: {}, return default speaker results.".format(keys)) |
| | | |
| | | speech, speech_lengths = batch["speech"], batch["speech_lengths"] |
| | | |
| | |
| | | try: |
| | | raw_inputs = torchaudio.load(data_path_and_name_and_type[0])[0][0] |
| | | except: |
| | | raw_inputs = soundfile.read(data_path_and_name_and_type[0], dtype='float32')[0] |
| | | # raw_inputs = librosa.load(data_path_and_name_and_type[0], dtype='float32')[0] |
| | | raw_inputs, sr = librosa.load(data_path_and_name_and_type[0], dtype='float32') |
| | | if raw_inputs.ndim == 2: |
| | | raw_inputs = raw_inputs[:, 0] |
| | | raw_inputs = torch.tensor(raw_inputs) |