from typing import Dict, List, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from typeguard import check_argument_types

from funasr.models.base_model import FunASRModel
from funasr.models.frontend.wav_frontend import WavFrontendMel23
from funasr.modules.eend_ola.encoder import EENDOLATransformerEncoder
from funasr.modules.eend_ola.encoder_decoder_attractor import EncoderDecoderAttractor
from funasr.modules.eend_ola.utils.losses import cal_power_loss, fast_batch_pit_n_speaker_loss, standard_loss
from funasr.modules.eend_ola.utils.power import create_powerlabel, generate_mapping_dict
from funasr.torch_utils.device_funcs import force_gatherable

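
class DiarEENDOLAModel(FunASRModel):
    # NOTE: the class name and constructor signature here are reconstructed from
    # this excerpt (FunASR's EEND-OLA diarization model); the full constructor
    # likely accepts additional components (e.g., an EncoderDecoderAttractor).
    def __init__(
        self,
        frontend: WavFrontendMel23,
        encoder: EENDOLATransformerEncoder,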
        mapping_dict=None,
        **kwargs,
    ):
        assert check_argument_types()

        super().__init__()
        self.frontend = frontend
        self.enc = encoder

    def forward(
        self,
        speech: List[torch.Tensor],
        speech_lengths: torch.Tensor,  # num_frames of each sample
        speaker_labels: List[torch.Tensor],
        speaker_labels_lengths: torch.Tensor,  # num_speakers of each sample
        orders: torch.Tensor,
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
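        """Compute the diarization training loss for one mini-batch.

        Args:
            speech: list of per-utterance feature tensors.
            speech_lengths: number of frames of each sample.
            speaker_labels: list of per-utterance speaker activity labels.
            speaker_labels_lengths: number of speakers of each sample.
            orders: frame-order permutations, presumably applied to the encoder
                outputs before attractor estimation (EEND-OLA style).

        Returns:
            A ``(loss, stats, weight)`` tuple, presumably gathered with
            ``force_gatherable``.
        """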

        # Check that batch_size is unified
        assert (
            len(speech)
            == len(speech_lengths)
            == len(speaker_labels)
            == len(speaker_labels_lengths)
        ), (len(speech), len(speech_lengths), len(speaker_labels), len(speaker_labels_lengths))
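        # Re-derive frame and speaker counts directly from the input tensors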
        speech_lengths = torch.tensor([len(sph) for sph in speech]).to(torch.int64)
        speaker_labels_lengths = torch.tensor([spk.shape[-1] for spk in speaker_labels]).to(torch.int64)
        batch_size = len(speech)

        # Encoder
        speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)]
        encoder_out = self.forward_encoder(speech, speech_lengths)
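        # encoder_out: frame-level embeddings, one tensor per utterance
        # (assumed shape [T_i, D])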

        # Encoder-decoder attractor
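        # Presumably followed by attractor estimation with the EncoderDecoderAttractor,
        # permutation resolution via fast_batch_pit_n_speaker_loss, and the
        # standard_loss / cal_power_loss terms of the training objective
        # (cf. the imports above).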
    def estimate_sequential(
        self,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        n_speakers: int = None,
        shuffle: bool = True,
        threshold: float = 0.5,
        **kwargs,
    ):
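        """Estimate per-frame speaker activities for inference.

        Encodes each utterance and then decodes attractors from the (optionally
        shuffled) frame order; ``threshold`` presumably gates the attractor
        existence probabilities and ``n_speakers`` caps their number, as in
        EEND-EDA-style decoding.
        """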
        speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)]
        speech_lengths = torch.tensor([len(sph) for sph in speech]).to(torch.int64)
        emb = self.forward_encoder(speech, speech_lengths)
        if shuffle:
            orders = [np.arange(e.shape[0]) for e in emb]
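            # identity frame order per utterance; a random shuffle of these
            # indices presumably follows before attractor estimation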