From 2868fe3df4e92a6ae3e327faf6e57ea492e04124 Mon Sep 17 00:00:00 2001
From: 志浩 <neo.dzh@alibaba-inc.com>
Date: Thu, 16 Mar 2023 19:24:21 +0800
Subject: [PATCH] Merge branch 'main' into dev_dzh
---
funasr/modules/eend_ola/utils/losses.py | 67 +++++++++++++++++++++++++++++++++
1 files changed, 67 insertions(+), 0 deletions(-)
diff --git a/funasr/modules/eend_ola/utils/losses.py b/funasr/modules/eend_ola/utils/losses.py
new file mode 100644
index 0000000..af0181d
--- /dev/null
+++ b/funasr/modules/eend_ola/utils/losses.py
@@ -0,0 +1,67 @@
+import numpy as np
+import torch
+import torch.nn.functional as F
+from itertools import permutations
+from torch import nn
+
+
+def standard_loss(ys, ts, label_delay=0):
+    """Return the BCE of sigmoid(ys[i]) vs. ts[i], pooled over the batch per first-dim row; label_delay is unused."""
+    # Each sequence's mean BCE is weighted by its own length so long and short sequences pool fairly.
+    weighted = [F.binary_cross_entropy(torch.sigmoid(y), t) * len(y) for y, t in zip(ys, ts)]
+    n_frames = torch.tensor(sum(t.shape[0] for t in ts), dtype=torch.float32, device=ys[0].device)
+    return torch.sum(torch.stack(weighted)) / n_frames
+
+
+def batch_pit_n_speaker_loss(ys, ts, n_speakers_list):
+    """Permutation-invariant BCE loss for a batch whose samples may have different speaker counts.
+
+    Args:
+        ys: list of logit tensors, presumably (n_frames_i, max_n_speakers) each -- TODO confirm.
+        ts: list of float label tensors, presumably shaped like the matching ``ys`` entry.
+        n_speakers_list: ``n_speakers_list[i]`` is the number of label columns kept for sample i.
+
+    Returns:
+        ``(min_loss, labels_perm)``: the sum of each sample's smallest permutation loss divided
+        by the total first-dimension length of ``ts``, and every ``ts[i]`` with its columns
+        reordered by the winning permutation and cut to ``n_speakers_list[i]`` columns.
+    """
+    max_n_speakers = ts[0].shape[1]
+    all_perms = list(permutations(range(max_n_speakers)))
+    padded_ys = nn.utils.rnn.pad_sequence(ys, batch_first=True, padding_value=-1)
+    # 1 on real rows, 0 on padding, so padded positions add nothing to the summed loss.
+    row_mask = nn.utils.rnn.pad_sequence(
+        [torch.ones(y.shape[0], device=padded_ys.device) for y in ys],
+        batch_first=True, padding_value=0).unsqueeze(-1)
+    probs = torch.sigmoid(padded_ys)  # hoisted: the same for every shift
+    # losses[b, j, s] = BCE of output column j against label column (j + s) % max_n_speakers.
+    losses = []
+    for shift in range(max_n_speakers):
+        rolled = nn.utils.rnn.pad_sequence(
+            [torch.roll(t, -shift, dims=1) for t in ts], batch_first=True, padding_value=-1)
+        bce = F.binary_cross_entropy(probs, rolled, reduction='none')
+        losses.append(torch.sum(bce * row_mask, dim=1))
+    losses = torch.stack(losses, dim=2)
+
+    perms = torch.tensor(all_perms, dtype=torch.long, device=losses.device)
+    y_ind = torch.arange(max_n_speakers, device=losses.device)
+    # remainder (unlike fmod) keeps every shift index inside [0, max_n_speakers).
+    t_inds = torch.remainder(perms - y_ind, max_n_speakers)
+    losses_perm = torch.stack([torch.mean(losses[:, y_ind, t_ind], dim=1) for t_ind in t_inds], dim=1)
+
+    # A sample with n speakers may only use, for each permutation of range(n), the first
+    # full permutation that starts with it; every other entry of its row stays at +inf.
+    masks = torch.full_like(losses_perm, float('inf'))
+    allowed = {}  # n_speakers -> permitted permutation indices, built once per distinct count
+    for i in range(len(ts)):
+        n = n_speakers_list[i]
+        if n not in allowed:
+            prefixes = [p[:n] for p in all_perms]
+            allowed[n] = [prefixes.index(sub) for sub in permutations(range(n))]
+        masks[i, allowed[n]] = 0
+    losses_perm = losses_perm + masks
+    min_loss = torch.sum(torch.min(losses_perm, dim=1)[0])
+    n_frames = torch.tensor(sum(t.shape[0] for t in ts), device=losses.device)
+    min_indices = torch.argmin(losses_perm, dim=1)
+    labels_perm = [t[:, perms[idx]][:, :n] for t, idx, n in zip(ts, min_indices, n_speakers_list)]
+    return min_loss / n_frames, labels_perm
--
Gitblit v1.9.1