| New file |
| | |
| | | import numpy as np |
| | | from funasr.utils.job_runner import MultiProcessRunnerV3 |
| | | from funasr.utils.misc import load_scp_as_list, load_scp_as_dict |
| | | import os |
| | | import argparse |
| | | |
| | | |
class MyRunner(MultiProcessRunnerV3):
    """Runner that turns the RTTM files listed in an scp into one segments file.

    ``prepare`` parses the command line and builds the task list; ``post``
    writes the lines produced by the workers to ``--seg_file``.
    """

    def prepare(self, parser):
        """Parse CLI options, make sure the output directory exists, load tasks.

        Args:
            parser: argument parser supplied by the runner; ``--rttm_scp`` and
                ``--seg_file`` are registered on it before parsing.

        Returns:
            ``(task_list, None, args)`` where ``task_list`` is the result of
            ``load_scp_as_list(args.rttm_scp)`` and ``args`` the parsed
            namespace.
        """
        parser.add_argument("--rttm_scp", type=str)
        parser.add_argument("--seg_file", type=str)
        args = parser.parse_args()

        # dirname is "" when seg_file is a bare file name; os.makedirs("")
        # would raise, so only create a directory when there is one to create.
        # exist_ok avoids failing when the directory appears concurrently.
        out_dir = os.path.dirname(args.seg_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        task_list = load_scp_as_list(args.rttm_scp)
        return task_list, None, args

    def post(self, results_list, args):
        """Write every worker's output lines to ``args.seg_file``.

        Args:
            results_list: iterable of per-worker results, each an iterable of
                newline-terminated strings.
            args: parsed namespace returned by ``prepare``.
        """
        with open(args.seg_file, "wt", encoding="utf-8") as fd:
            for results in results_list:
                fd.writelines(results)
| | | |
| | | |
def process(task_args):
    """Convert RTTM speaker turns into merged speech segments.

    For every ``(mid, rttm_path)`` task, all speaker turns are projected onto
    a 10 ms frame grid, overlapping or adjacent turns are merged, and one line
    per contiguous speech region is produced.

    Args:
        task_args: 4-tuple whose second item is the list of
            ``(mid, rttm_path)`` pairs; the other items are not used here.

    Returns:
        List of lines formatted as
        ``"<mid>-<start:07d>-<end:07d> <mid> <start_sec> <end_sec>\\n"``,
        with frame indices in 10 ms units.
    """
    _, task_list, _, _ = task_args
    outputs = []
    for mid, rttm_path in task_list:
        spk_turns = []
        length = 0
        # Context manager so the handle is closed even if parsing fails.
        with open(rttm_path, "rt", encoding="utf-8") as rttm_file:
            for one_line in rttm_file:
                # split() tolerates tabs / repeated blanks; skip empty lines.
                parts = one_line.split()
                if not parts:
                    continue
                st_sec, dur_sec = float(parts[3]), float(parts[4])
                # Round instead of truncating: 0.29 * 100 is 28.999... in
                # binary floating point and int() would yield frame 28.
                st = int(round(st_sec * 100))
                ed = int(round((st_sec + dur_sec) * 100))
                length = max(length, ed)
                spk_turns.append((st, ed))

        # One extra trailing frame stays False so the last region is closed
        # by the scan below.
        is_sph = np.zeros((length + 1,), dtype=bool)
        for st, ed in spk_turns:
            is_sph[st:ed] = True

        st, in_speech = 0, False
        for i in range(length + 1):
            if not in_speech and is_sph[i]:
                st, in_speech = i, True
            if in_speech and not is_sph[i]:
                in_speech = False
                outputs.append("{}-{:07d}-{:07d} {} {:.2f} {:.2f}\n".format(
                    mid, st, i, mid, float(st) / 100, float(i) / 100
                ))
    return outputs
| | | |
| | | |
if __name__ == '__main__':
    # Hand the per-task worker function to the runner and start it.
    MyRunner(process).run()
| New file |
| | |
| | | #!/usr/bin/env python3 |
| | | |
| | | import os |
| | | |
| | | from funasr.tasks.diar import DiarTask |
| | | |
| | | |
# Training entry point built on DiarTask
def parse_args():
    """Parse the command line for this training script.

    Starts from the parser returned by ``DiarTask.get_parser()`` and adds the
    integer option ``--gpu_id`` (default 0).

    Returns:
        The parsed argument namespace.
    """
    parser = DiarTask.get_parser()
    parser.add_argument("--gpu_id", type=int, default=0, help="local gpu id.")
    return parser.parse_args()
| | | |
| | | |
def main(args=None, cmd=None):
    """Forward ``args`` and ``cmd`` unchanged to ``DiarTask.main``."""
    DiarTask.main(args=args, cmd=cmd)
| | | |
| | | |
if __name__ == '__main__':
    args = parse_args()

    # Restrict this process to the selected local GPU.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)

    # DDP settings: distributed mode is on only when more than one GPU is used.
    args.distributed = args.ngpu > 1
    # NOTE(review): the source this was recovered from had lost its
    # indentation; the assert is kept unconditional here — confirm it was not
    # meant to live inside the single-GPU branch.
    assert args.num_worker_count == 1

    # For the "small" dataset type the batch settings are scaled by the
    # number of GPUs; unset (None) values are left untouched.
    if args.dataset_type == "small":
        n_gpu = args.ngpu
        if args.batch_size is not None:
            args.batch_size = args.batch_size * n_gpu
        if args.batch_bins is not None:
            args.batch_bins = args.batch_bins * args.ngpu

    main(args=args)
| | |
| | | ) |
| | | self.criterion_bce = SequenceBinaryCrossEntropy(normalize_length=length_normalized_loss) |
| | | self.pse_embedding = self.generate_pse_embedding() |
| | | self.power_weight = torch.from_numpy(2 ** np.arange(max_spk_num)[np.newaxis, np.newaxis, :]) |
| | | self.int_token_arr = torch.from_numpy(np.array(self.token_list).astype(int)[np.newaxis, np.newaxis, :]) |
| | | self.speaker_discrimination_loss_weight = speaker_discrimination_loss_weight |
| | | self.inter_score_loss_weight = inter_score_loss_weight |
| | | |
| | |
| | | speech_lengths: torch.Tensor = None, |
| | | profile: torch.Tensor = None, |
| | | profile_lengths: torch.Tensor = None, |
| | | spk_labels: torch.Tensor = None, |
| | | spk_labels_lengths: torch.Tensor = None, |
| | | binary_labels: torch.Tensor = None, |
| | | binary_labels_lengths: torch.Tensor = None, |
| | | ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]: |
| | | """Frontend + Encoder + Speaker Encoder + CI Scorer + CD Scorer + Decoder + Calc loss |
| | | |
| | |
| | | espnet2/iterators/chunk_iter_factory.py |
| | | profile: (Batch, N_spk, dim) |
| | | profile_lengths: (Batch,) |
| | | spk_labels: (Batch, frames, input_size) |
| | | spk_labels_lengths: (Batch,) |
| | | binary_labels: (Batch, frames, max_spk_num) |
| | | binary_labels_lengths: (Batch,) |
| | | """ |
| | | assert speech.shape[0] == spk_labels.shape[0], (speech.shape, spk_labels.shape) |
| | | assert speech.shape[0] == binary_labels.shape[0], (speech.shape, binary_labels.shape) |
| | | batch_size = speech.shape[0] |
| | | |
| | | # 1. Network forward |
| | |
| | | |
| | | # 2. Aggregate time-domain labels to match forward outputs |
| | | if self.label_aggregator is not None: |
| | | spk_labels, spk_labels_lengths = self.label_aggregator( |
| | | spk_labels.unsqueeze(2), spk_labels_lengths |
| | | binary_labels, binary_labels_lengths = self.label_aggregator( |
| | | binary_labels, binary_labels_lengths |
| | | ) |
| | | spk_labels = spk_labels.squeeze(2) |
| | | # 2. Calculate power-set encoding (PSE) labels |
| | | raw_pse_labels = torch.sum(binary_labels * self.power_weight, dim=2, keepdim=True) |
| | | pse_labels = torch.argmax(raw_pse_labels == self.int_token_arr, dim=2) |
| | | |
| | | # If encoder uses conv* as input_layer (i.e., subsampling), |
| | | # the sequence length of 'pred' might be slightly less than the |
| | | # length of 'spk_labels'. Here we force them to be equal. |
| | | length_diff_tolerance = 2 |
| | | length_diff = spk_labels.shape[1] - pred.shape[1] |
| | | length_diff = pse_labels.shape[1] - pred.shape[1] |
| | | if 0 < length_diff <= length_diff_tolerance: |
| | | spk_labels = spk_labels[:, 0: pred.shape[1], :] |
| | | pse_labels = pse_labels[:, 0: pred.shape[1]] |
| | | |
| | | loss_diar = self.classification_loss(pred, spk_labels, spk_labels_lengths) |
| | | loss_diar = self.classification_loss(pred, pse_labels, binary_labels_lengths) |
| | | loss_spk_dis = self.speaker_discrimination_loss(profile, profile_lengths) |
| | | loss_inter_ci, loss_inter_cd = self.internal_score_loss(cd_score, ci_score, spk_labels, spk_labels_lengths) |
| | | label_mask = make_pad_mask(spk_labels_lengths, maxlen=spk_labels.shape[1]) |
| | | loss_inter_ci, loss_inter_cd = self.internal_score_loss(cd_score, ci_score, pse_labels, binary_labels_lengths) |
| | | label_mask = make_pad_mask(binary_labels_lengths, maxlen=pse_labels.shape[1]) |
| | | loss = (loss_diar + self.speaker_discrimination_loss_weight * loss_spk_dis |
| | | + self.inter_score_loss_weight * (loss_inter_ci + loss_inter_cd)) |
| | | |
| | |
| | | speaker_error, |
| | | ) = self.calc_diarization_error( |
| | | pred=F.embedding(pred.argmax(dim=2) * label_mask, self.pse_embedding), |
| | | label=F.embedding(spk_labels * label_mask, self.pse_embedding), |
| | | length=spk_labels_lengths |
| | | label=F.embedding(pse_labels * label_mask, self.pse_embedding), |
| | | length=binary_labels_lengths |
| | | ) |
| | | |
| | | if speech_scored > 0 and num_frames > 0: |
| New file |
| | |
| | | import math |
| | | import torch |
| | | import torch.nn as nn |
| | | import torch.nn.functional as F |
| | | |
| | | |
| | | class _BatchNorm1d(nn.Module): |
| | | def __init__( |
| | | self, |
| | | input_shape=None, |
| | | input_size=None, |
| | | eps=1e-05, |
| | | momentum=0.1, |
| | | affine=True, |
| | | track_running_stats=True, |
| | | combine_batch_time=False, |
| | | skip_transpose=False, |
| | | ): |
| | | super().__init__() |
| | | self.combine_batch_time = combine_batch_time |
| | | self.skip_transpose = skip_transpose |
| | | |
| | | if input_size is None and skip_transpose: |
| | | input_size = input_shape[1] |
| | | elif input_size is None: |
| | | input_size = input_shape[-1] |
| | | |
| | | self.norm = nn.BatchNorm1d( |
| | | input_size, |
| | | eps=eps, |
| | | momentum=momentum, |
| | | affine=affine, |
| | | track_running_stats=track_running_stats, |
| | | ) |
| | | |
| | | def forward(self, x): |
| | | shape_or = x.shape |
| | | if self.combine_batch_time: |
| | | if x.ndim == 3: |
| | | x = x.reshape(shape_or[0] * shape_or[1], shape_or[2]) |
| | | else: |
| | | x = x.reshape( |
| | | shape_or[0] * shape_or[1], shape_or[3], shape_or[2] |
| | | ) |
| | | |
| | | elif not self.skip_transpose: |
| | | x = x.transpose(-1, 1) |
| | | |
| | | x_n = self.norm(x) |
| | | |
| | | if self.combine_batch_time: |
| | | x_n = x_n.reshape(shape_or) |
| | | elif not self.skip_transpose: |
| | | x_n = x_n.transpose(1, -1) |
| | | |
| | | return x_n |
| | | |
| | | |
| | | class _Conv1d(nn.Module): |
| | | def __init__( |
| | | self, |
| | | out_channels, |
| | | kernel_size, |
| | | input_shape=None, |
| | | in_channels=None, |
| | | stride=1, |
| | | dilation=1, |
| | | padding="same", |
| | | groups=1, |
| | | bias=True, |
| | | padding_mode="reflect", |
| | | skip_transpose=False, |
| | | ): |
| | | super().__init__() |
| | | self.kernel_size = kernel_size |
| | | self.stride = stride |
| | | self.dilation = dilation |
| | | self.padding = padding |
| | | self.padding_mode = padding_mode |
| | | self.unsqueeze = False |
| | | self.skip_transpose = skip_transpose |
| | | |
| | | if input_shape is None and in_channels is None: |
| | | raise ValueError("Must provide one of input_shape or in_channels") |
| | | |
| | | if in_channels is None: |
| | | in_channels = self._check_input_shape(input_shape) |
| | | |
| | | self.conv = nn.Conv1d( |
| | | in_channels, |
| | | out_channels, |
| | | self.kernel_size, |
| | | stride=self.stride, |
| | | dilation=self.dilation, |
| | | padding=0, |
| | | groups=groups, |
| | | bias=bias, |
| | | ) |
| | | |
| | | def forward(self, x): |
| | | if not self.skip_transpose: |
| | | x = x.transpose(1, -1) |
| | | |
| | | if self.unsqueeze: |
| | | x = x.unsqueeze(1) |
| | | |
| | | if self.padding == "same": |
| | | x = self._manage_padding( |
| | | x, self.kernel_size, self.dilation, self.stride |
| | | ) |
| | | |
| | | elif self.padding == "causal": |
| | | num_pad = (self.kernel_size - 1) * self.dilation |
| | | x = F.pad(x, (num_pad, 0)) |
| | | |
| | | elif self.padding == "valid": |
| | | pass |
| | | |
| | | else: |
| | | raise ValueError( |
| | | "Padding must be 'same', 'valid' or 'causal'. Got " |
| | | + self.padding |
| | | ) |
| | | |
| | | wx = self.conv(x) |
| | | |
| | | if self.unsqueeze: |
| | | wx = wx.squeeze(1) |
| | | |
| | | if not self.skip_transpose: |
| | | wx = wx.transpose(1, -1) |
| | | |
| | | return wx |
| | | |
| | | def _manage_padding( |
| | | self, x, kernel_size: int, dilation: int, stride: int, |
| | | ): |
| | | # Detecting input shape |
| | | L_in = x.shape[-1] |
| | | |
| | | # Time padding |
| | | padding = get_padding_elem(L_in, stride, kernel_size, dilation) |
| | | |
| | | # Applying padding |
| | | x = F.pad(x, padding, mode=self.padding_mode) |
| | | |
| | | return x |
| | | |
| | | def _check_input_shape(self, shape): |
| | | """Checks the input shape and returns the number of input channels. |
| | | """ |
| | | |
| | | if len(shape) == 2: |
| | | self.unsqueeze = True |
| | | in_channels = 1 |
| | | elif self.skip_transpose: |
| | | in_channels = shape[1] |
| | | elif len(shape) == 3: |
| | | in_channels = shape[2] |
| | | else: |
| | | raise ValueError( |
| | | "conv1d expects 2d, 3d inputs. Got " + str(len(shape)) |
| | | ) |
| | | |
| | | # Kernel size must be odd |
| | | if self.kernel_size % 2 == 0: |
| | | raise ValueError( |
| | | "The field kernel size must be an odd number. Got %s." |
| | | % (self.kernel_size) |
| | | ) |
| | | return in_channels |
| | | |
| | | |
def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
    """Return the ``[left, right]`` padding for a "same"-padded 1-D convolution.

    Arguments
    ---------
    L_in : int
        Length of the input along the padded dimension.
    stride : int
        Stride of the convolution.
    kernel_size : int
        Kernel size of the convolution.
    dilation : int
        Dilation of the convolution.

    Returns
    -------
    list of int
        Two equal padding amounts (left, right).
    """
    if stride > 1:
        # NOTE(review): dilation is not taken into account for strided
        # convolutions; this mirrors the original behavior — confirm intent.
        return [kernel_size // 2, kernel_size // 2]

    # Output length of the unpadded convolution; the shortfall is split
    # evenly between both sides.
    L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
    one_side = (L_in - L_out) // 2
    return [one_side, one_side]
| | | |
| | | |
| | | # Skip transpose as much as possible for efficiency |
class Conv1d(_Conv1d):
    """``_Conv1d`` that is always constructed with ``skip_transpose=True``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, skip_transpose=True, **kwargs)
| | | |
| | | |
class BatchNorm1d(_BatchNorm1d):
    """``_BatchNorm1d`` that is always constructed with ``skip_transpose=True``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, skip_transpose=True, **kwargs)
| | | |
| | | |
def length_to_mask(length, max_len=None, dtype=None, device=None):
    """Build a mask whose row ``i`` is non-zero at positions ``j < length[i]``.

    Arguments
    ---------
    length : torch.Tensor
        1-d tensor of lengths.
    max_len : int, optional
        Number of mask columns; defaults to the largest value in ``length``.
    dtype : torch.dtype, optional
        Type of the returned mask; defaults to ``length.dtype``.
    device : torch.device, optional
        Device of the returned mask; defaults to ``length.device``.

    Returns
    -------
    torch.Tensor
        Mask of shape ``(len(length), max_len)``.
    """
    assert length.dim() == 1

    if max_len is None:
        max_len = length.max().long().item()

    # Compare every position index against every length via broadcasting.
    positions = torch.arange(max_len, device=length.device, dtype=length.dtype)
    mask = positions.unsqueeze(0) < length.unsqueeze(1)

    out_dtype = length.dtype if dtype is None else dtype
    out_device = length.device if device is None else device
    return torch.as_tensor(mask, dtype=out_dtype, device=out_device)
| | | |
| | | |
class TDNNBlock(nn.Module):
    """Convolution followed by an activation and batch normalization.

    Arguments
    ---------
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.
    kernel_size : int
        Kernel size of the convolution.
    dilation : int
        Dilation of the convolution.
    activation : torch class
        Class used to construct the activation layer.
    groups : int
        Number of groups of the convolution.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        dilation,
        activation=nn.ReLU,
        groups=1,
    ):
        super().__init__()
        conv_kwargs = dict(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            dilation=dilation,
            groups=groups,
        )
        self.conv = Conv1d(**conv_kwargs)
        self.activation = activation()
        self.norm = BatchNorm1d(input_size=out_channels)

    def forward(self, x):
        """Apply conv -> activation -> norm to ``x``."""
        hidden = self.conv(x)
        hidden = self.activation(hidden)
        return self.norm(hidden)
| | | |
| | | |
class Res2NetBlock(torch.nn.Module):
    """Res2Net block with dilation.

    The input is split into ``scale`` channel groups. The first group is
    passed through unchanged; each following group goes through its own
    ``TDNNBlock``, with the previous group's output added to its input from
    the third group on.

    Arguments
    ---------
    in_channels : int
        The number of channels expected in the input (divisible by ``scale``).
    out_channels : int
        The number of output channels (divisible by ``scale``).
    scale : int
        The scale of the Res2Net block.
    kernel_size: int
        The kernel size of the Res2Net block.
    dilation : int
        The dilation of the Res2Net block.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
    """

    def __init__(
        self, in_channels, out_channels, scale=8, kernel_size=3, dilation=1
    ):
        super().__init__()
        assert in_channels % scale == 0
        assert out_channels % scale == 0

        group_in = in_channels // scale
        group_out = out_channels // scale

        # One TDNN block per channel group, except the pass-through first one.
        tdnn_blocks = []
        for _ in range(scale - 1):
            tdnn_blocks.append(
                TDNNBlock(
                    group_in,
                    group_out,
                    kernel_size=kernel_size,
                    dilation=dilation,
                )
            )
        self.blocks = nn.ModuleList(tdnn_blocks)
        self.scale = scale

    def forward(self, x):
        """Process ``x`` group by group and concatenate along the channel axis."""
        groups = torch.chunk(x, self.scale, dim=1)
        outputs = [groups[0]]
        previous = None
        for block, group in zip(self.blocks, groups[1:]):
            # The first processed group has no predecessor output to add.
            block_in = group if previous is None else group + previous
            previous = block(block_in)
            outputs.append(previous)
        return torch.cat(outputs, dim=1)
| | | |
| | | |
class SEBlock(nn.Module):
    """Squeeze-and-excitation block.

    Arguments
    ---------
    in_channels : int
        The number of input channels.
    se_channels : int
        The number of output channels after squeeze.
    out_channels : int
        The number of output channels.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> se_layer = SEBlock(64, 16, 64)
    >>> lengths = torch.rand((8,))
    >>> out_tensor = se_layer(inp_tensor, lengths).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
    """

    def __init__(self, in_channels, se_channels, out_channels):
        super().__init__()

        self.conv1 = Conv1d(
            in_channels=in_channels, out_channels=se_channels, kernel_size=1
        )
        self.relu = torch.nn.ReLU(inplace=True)
        self.conv2 = Conv1d(
            in_channels=se_channels, out_channels=out_channels, kernel_size=1
        )
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x, lengths=None):
        """Rescale ``x`` channel-wise by gates computed from its time average.

        ``lengths`` is multiplied by the number of frames before the mask is
        built; without it the plain mean over the last dimension is used.
        """
        num_frames = x.shape[-1]
        if lengths is None:
            squeezed = x.mean(dim=2, keepdim=True)
        else:
            # Average over the unmasked frames only.
            frame_mask = length_to_mask(
                lengths * num_frames, max_len=num_frames, device=x.device
            ).unsqueeze(1)
            frame_count = frame_mask.sum(dim=2, keepdim=True)
            squeezed = (x * frame_mask).sum(dim=2, keepdim=True) / frame_count

        excited = self.relu(self.conv1(squeezed))
        gates = self.sigmoid(self.conv2(excited))

        return gates * x
| | | |
| | | |
class AttentiveStatisticsPooling(nn.Module):
    """Attentive statistics pooling computed per channel.

    Returns the concatenation of the attention-weighted mean and standard
    deviation of the input over time.

    Arguments
    ---------
    channels: int
        The number of input channels.
    attention_channels: int
        The number of attention channels.
    global_context: bool
        If True, the utterance-level mean and std are appended to every frame
        before the attention weights are computed.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> asp_layer = AttentiveStatisticsPooling(64)
    >>> lengths = torch.rand((8,))
    >>> out_tensor = asp_layer(inp_tensor, lengths).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 1, 128])
    """

    def __init__(self, channels, attention_channels=128, global_context=True):
        super().__init__()

        self.eps = 1e-12
        self.global_context = global_context
        # With global context the attention input is [x, mean, std].
        tdnn_in = channels * 3 if global_context else channels
        self.tdnn = TDNNBlock(tdnn_in, attention_channels, 1, 1)
        self.tanh = nn.Tanh()
        self.conv = Conv1d(
            in_channels=attention_channels, out_channels=channels, kernel_size=1
        )

    def forward(self, x, lengths=None):
        """Calculates mean and std for a batch (input tensor).

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape [N, C, L].
        lengths : torch.Tensor, optional
            Per-item lengths; multiplied by L before the mask is built.
            Defaults to ones.

        Returns
        -------
        torch.Tensor
            Concatenated mean and std with a trailing singleton dimension.
        """
        num_frames = x.shape[-1]

        def weighted_stats(feats, weights, dim=2, eps=self.eps):
            # Weighted mean and std along `dim`; the variance is clamped
            # from below by `eps` before the square root.
            mu = (weights * feats).sum(dim)
            var = (weights * (feats - mu.unsqueeze(dim)).pow(2)).sum(dim)
            return mu, torch.sqrt(var.clamp(eps))

        if lengths is None:
            lengths = torch.ones(x.shape[0], device=x.device)

        # Make binary mask of shape [N, 1, L]
        mask = length_to_mask(
            lengths * num_frames, max_len=num_frames, device=x.device
        ).unsqueeze(1)

        if not self.global_context:
            attn = x
        else:
            # Expand the temporal context of the pooling layer by allowing
            # the self-attention to look at global properties of the
            # utterance.
            # torch.std is unstable for backward computation
            # https://github.com/pytorch/pytorch/issues/4320
            total = mask.sum(dim=2, keepdim=True).float()
            g_mean, g_std = weighted_stats(x, mask / total)
            g_mean = g_mean.unsqueeze(2).repeat(1, 1, num_frames)
            g_std = g_std.unsqueeze(2).repeat(1, 1, num_frames)
            attn = torch.cat([x, g_mean, g_std], dim=1)

        # Apply layers
        attn = self.conv(self.tanh(self.tdnn(attn)))

        # Filter out zero-paddings
        attn = attn.masked_fill(mask == 0, float("-inf"))
        attn = F.softmax(attn, dim=2)

        # Append mean and std of the batch
        mean, std = weighted_stats(x, attn)
        return torch.cat((mean, std), dim=1).unsqueeze(2)
| | | |
| | | |
class SERes2NetBlock(nn.Module):
    """Building block of ECAPA-TDNN: TDNN -> Res2Net -> TDNN -> SE block,
    with a residual connection around the whole stack.

    Arguments
    ----------
    in_channels: int
        The number of input channels.
    out_channels: int
        The number of output channels.
    res2net_scale: int
        The scale of the Res2Net block.
    se_channels: int
        The number of channels after squeeze in the SE block.
    kernel_size: int
        The kernel size passed to the Res2Net block (both TDNN blocks use a
        kernel size of 1).
    dilation: int
        The dilation of the Res2Net block.
    activation : torch class
        A class for constructing the activation layers.
    groups: int
        Number of blocked connections from input channels to output channels.

    Example
    -------
    >>> x = torch.rand(8, 120, 64).transpose(1, 2)
    >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
    >>> out = conv(x).transpose(1, 2)
    >>> out.shape
    torch.Size([8, 120, 64])
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        res2net_scale=8,
        se_channels=128,
        kernel_size=1,
        dilation=1,
        activation=torch.nn.ReLU,
        groups=1,
    ):
        super().__init__()
        self.out_channels = out_channels

        # Settings shared by the two point-wise TDNN blocks.
        pointwise = dict(
            kernel_size=1, dilation=1, activation=activation, groups=groups
        )
        self.tdnn1 = TDNNBlock(in_channels, out_channels, **pointwise)
        self.res2net_block = Res2NetBlock(
            out_channels, out_channels, res2net_scale, kernel_size, dilation
        )
        self.tdnn2 = TDNNBlock(out_channels, out_channels, **pointwise)
        self.se_block = SEBlock(out_channels, se_channels, out_channels)

        # A 1x1 convolution matches the residual to the output width when
        # the channel counts differ.
        self.shortcut = None
        if in_channels != out_channels:
            self.shortcut = Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )

    def forward(self, x, lengths=None):
        """Run the block on ``x``; ``lengths`` is forwarded to the SE block."""
        residual = x if self.shortcut is None else self.shortcut(x)

        out = self.tdnn1(x)
        out = self.res2net_block(out)
        out = self.tdnn2(out)
        out = self.se_block(out, lengths)

        return out + residual
| | | |
| | | |
class ECAPA_TDNN(torch.nn.Module):
    """An implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).

    Arguments
    ---------
    input_size : int
        Number of input feature channels.
    device : str
        Accepted for interface compatibility; not used by this class.
    lin_neurons : int
        Number of neurons in linear layers.
    activation : torch class
        A class for constructing the activation layers.
    channels : list of ints
        Output channels for TDNN/SERes2Net layer.
    kernel_sizes : list of ints
        List of kernel sizes for each layer.
    dilations : list of ints
        List of dilations for kernels in each layer.
    attention_channels : int
        Number of attention channels of the pooling layer.
    res2net_scale : int
        Scale of the Res2Net blocks.
    se_channels : int
        Number of squeeze channels of the SE blocks.
    global_context : bool
        Whether the pooling attention uses global mean/std context.
    groups : list of ints
        List of groups for kernels in each layer.
    window_size : int or None
        Window length (frames) for windowed pooling; ``None`` pools over the
        whole sequence and yields one embedding per item.
    window_shift : int
        Hop (frames) between consecutive pooling windows.

    Example
    -------
    >>> input_feats = torch.rand([5, 120, 80])
    >>> compute_embedding = ECAPA_TDNN(80, lin_neurons=192)
    >>> outputs = compute_embedding(input_feats)
    >>> outputs.shape
    torch.Size([5, 120, 192])
    >>> utt_embedding = ECAPA_TDNN(80, lin_neurons=192, window_size=None)
    >>> utt_embedding(input_feats).shape
    torch.Size([5, 192])
    """

    def __init__(
        self,
        input_size,
        device="cpu",
        lin_neurons=192,
        activation=torch.nn.ReLU,
        channels=[512, 512, 512, 512, 1536],
        kernel_sizes=[5, 3, 3, 3, 1],
        dilations=[1, 2, 3, 4, 1],
        attention_channels=128,
        res2net_scale=8,
        se_channels=128,
        global_context=True,
        groups=[1, 1, 1, 1, 1],
        window_size=20,
        window_shift=1,
    ):

        super().__init__()
        assert len(channels) == len(kernel_sizes)
        assert len(channels) == len(dilations)
        # Store a copy so the shared default list is never aliased.
        self.channels = list(channels)
        self.blocks = nn.ModuleList()
        self.window_size = window_size
        self.window_shift = window_shift

        # The initial TDNN layer
        self.blocks.append(
            TDNNBlock(
                input_size,
                channels[0],
                kernel_sizes[0],
                dilations[0],
                activation,
                groups[0],
            )
        )

        # SE-Res2Net layers
        for i in range(1, len(channels) - 1):
            self.blocks.append(
                SERes2NetBlock(
                    channels[i - 1],
                    channels[i],
                    res2net_scale=res2net_scale,
                    se_channels=se_channels,
                    kernel_size=kernel_sizes[i],
                    dilation=dilations[i],
                    activation=activation,
                    groups=groups[i],
                )
            )

        # Multi-layer feature aggregation. Its input is the concatenation of
        # all SE-Res2Net outputs, i.e. sum(channels[1:-1]) channels; with the
        # default configuration this equals channels[-1] (3 * 512 == 1536).
        self.mfa = TDNNBlock(
            sum(channels[1:-1]),
            channels[-1],
            kernel_sizes[-1],
            dilations[-1],
            activation,
            groups=groups[-1],
        )

        # Attentive Statistical Pooling
        self.asp = AttentiveStatisticsPooling(
            channels[-1],
            attention_channels=attention_channels,
            global_context=global_context,
        )
        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)

        # Final linear transformation
        self.fc = Conv1d(
            in_channels=channels[-1] * 2,
            out_channels=lin_neurons,
            kernel_size=1,
        )

    def windowed_pooling(self, x, lengths=None):
        """Pool ``x`` over sliding windows and embed each window.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, channel, time).
        lengths : torch.Tensor, optional
            Tensor of shape (batch, ).

        Returns
        -------
        torch.Tensor
            Tensor of shape (batch, lin_neurons, num_windows).
        """
        # x: Batch, Channel, Time
        tt = x.shape[2]
        num_chunk = int(math.ceil(tt / self.window_shift))
        pad = self.window_size // 2
        # Keep the padded features under their own name: every window must be
        # sliced from them, not from the previous window's embedding.
        padded = F.pad(x, (pad, pad, 0, 0), "reflect")
        stat_list = []

        for i in range(num_chunk):
            st = i * self.window_shift
            ed = st + self.window_size
            # NOTE(review): the pooling layer multiplies `lengths` by the
            # window length before masking, while this expression looks like
            # an absolute frame count shifted by the window index — confirm
            # which convention callers use when passing `lengths`.
            win_lengths = (
                torch.clamp(lengths - i, 0, self.window_size)
                if lengths is not None else None
            )
            # B x C
            stat = self.asp(padded[:, :, st: ed], lengths=win_lengths)
            stat = self.asp_bn(stat)
            stat = self.fc(stat)
            stat_list.append(stat)

        return torch.cat(stat_list, dim=2)

    def forward(self, x, lengths=None):
        """Returns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        lengths: torch.Tensor
            Tensor of shape (batch, )

        Returns
        -------
        torch.Tensor
            (batch, lin_neurons) when ``window_size`` is None, otherwise
            (batch, num_windows, lin_neurons).
        """
        # Minimize transpose for efficiency
        x = x.transpose(1, 2)

        xl = []
        for layer in self.blocks:
            # Only the SE-Res2Net blocks take `lengths`; dispatching on the
            # type avoids masking genuine TypeErrors raised inside a layer.
            if isinstance(layer, SERes2NetBlock):
                x = layer(x, lengths=lengths)
            else:
                x = layer(x)
            xl.append(x)

        # Multi-layer feature aggregation
        x = torch.cat(xl[1:], dim=1)
        x = self.mfa(x)

        if self.window_size is None:
            # Attentive Statistical Pooling
            x = self.asp(x, lengths=lengths)
            x = self.asp_bn(x)
            # Final linear transformation
            x = self.fc(x)
            x = x.squeeze(2)  # -> B, C
        else:
            x = self.windowed_pooling(x, lengths)
            x = x.transpose(1, 2)  # -> B, T, C
        return x
| | |
| | | cls, train: bool = True, inference: bool = False |
| | | ) -> Tuple[str, ...]: |
| | | if not inference: |
| | | retval = ("speech", "profile", "label") |
| | | retval = ("speech", "profile", "binary_labels") |
| | | else: |
| | | # Recognition mode |
| | | retval = ("speech", "profile") |