From 91425c670b21fa244f739885d34b88742272747c Mon Sep 17 00:00:00 2001
From: 嘉渊 <wangjiaming.wjm@alibaba-inc.com>
Date: Thu, 06 Jul 2023 17:54:29 +0800
Subject: [PATCH] update eend_ola
---
egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml | 2 +-
egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml | 2 +-
egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml | 2 +-
egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml | 2 +-
funasr/models/e2e_diar_eend_ola.py | 14 ++++----------
5 files changed, 8 insertions(+), 14 deletions(-)
diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml
index 71ea9f0..cd143f7 100644
--- a/egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml
+++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_callhome_chunk2000.yaml
@@ -12,7 +12,7 @@
n_units: 256
# model related
-model: eend_ola_similar_eend
+model: eend_ola
model_conf:
attractor_loss_weight: 0.01
max_n_speaker: 8
diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml
index baf4342..47316fe 100644
--- a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml
+++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_2spkr.yaml
@@ -12,7 +12,7 @@
n_units: 256
# model related
-model: eend_ola_similar_eend
+model: eend_ola
model_conf:
max_n_speaker: 8
diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml
index 83a6eee..f55e148 100644
--- a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml
+++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr.yaml
@@ -12,7 +12,7 @@
n_units: 256
# model related
-model: eend_ola_similar_eend
+model: eend_ola
model_conf:
max_n_speaker: 8
diff --git a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml
index f478504..d21d467 100644
--- a/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml
+++ b/egs/callhome/eend_ola/conf/train_diar_eend_ola_simu_allspkr_chunk2000.yaml
@@ -12,7 +12,7 @@
n_units: 256
# model related
-model: eend_ola_similar_eend
+model: eend_ola
model_conf:
max_n_speaker: 8
diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py
index af0fd62..fda24e2 100644
--- a/funasr/models/e2e_diar_eend_ola.py
+++ b/funasr/models/e2e_diar_eend_ola.py
@@ -12,7 +12,7 @@
from funasr.models.frontend.wav_frontend import WavFrontendMel23
from funasr.modules.eend_ola.encoder import EENDOLATransformerEncoder
from funasr.modules.eend_ola.encoder_decoder_attractor import EncoderDecoderAttractor
-from funasr.modules.eend_ola.utils.losses import fast_batch_pit_n_speaker_loss, standard_loss, cal_power_loss
+from funasr.modules.eend_ola.utils.losses import standard_loss, cal_power_loss, fast_batch_pit_n_speaker_loss
from funasr.modules.eend_ola.utils.power import create_powerlabel
from funasr.modules.eend_ola.utils.power import generate_mapping_dict
from funasr.torch_utils.device_funcs import force_gatherable
@@ -109,23 +109,17 @@
def forward(
self,
speech: List[torch.Tensor],
- speech_lengths: torch.Tensor, # num_frames of each sample
speaker_labels: List[torch.Tensor],
- speaker_labels_lengths: torch.Tensor, # num_speakers of each sample
orders: torch.Tensor,
) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
# Check that batch_size is unified
- assert (
- len(speech)
- == len(speech_lengths)
- == len(speaker_labels)
- == len(speaker_labels_lengths)
- ), (len(speech), len(speech_lengths), len(speaker_labels), len(speaker_labels_lengths))
+ assert (len(speech) == len(speaker_labels)), (len(speech), len(speaker_labels))
+ speech_lengths = torch.tensor([len(sph) for sph in speech]).to(torch.int64)
+ speaker_labels_lengths = torch.tensor([spk.shape[-1] for spk in speaker_labels]).to(torch.int64)
batch_size = len(speech)
# Encoder
- speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)]
encoder_out = self.forward_encoder(speech, speech_lengths)
# Encoder-decoder attractor
--
Gitblit v1.9.1