python/FunASR-XL.git

parent: 731bfcbc | 补丁 | 提交 | ignore whitespace

志浩

2023-02-23 04a7ce3205ca478fbc3b1415c2dc31a0769d051c

sond pipeline

2个文件已修改

3个文件已添加

	egs/mars/sd/scripts/convert_rttm_to_seg_file.py	57 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/diar_train.py	46 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/e2e_diar_sond.py	34 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/encoder/ecapa_tdnn_encoder.py	689 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/tasks/diar.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 egs/mars/sd/scripts/convert_rttm_to_seg_file.py

New file
@@ -0,0 +1,57 @@
import numpy as np
from funasr.utils.job_runner import MultiProcessRunnerV3
from funasr.utils.misc import load_scp_as_list, load_scp_as_dict
import os
import argparse


class MyRunner(MultiProcessRunnerV3):
    """Multi-process job that converts RTTM files into one segment file."""

    def prepare(self, parser):
        """Parse the command line and build the list of tasks.

        Args:
            parser: Argument parser; ``--rttm_scp`` and ``--seg_file`` are
                registered on it before parsing.

        Returns:
            A ``(task_list, None, args)`` tuple where ``task_list`` is the
            result of ``load_scp_as_list`` on ``--rttm_scp``.
        """
        parser.add_argument("--rttm_scp", type=str)
        parser.add_argument("--seg_file", type=str)
        args = parser.parse_args()

        # A bare file name has an empty dirname and os.makedirs("") raises,
        # so only create the directory when there is one. exist_ok avoids a
        # race between the existence check and the creation.
        out_dir = os.path.dirname(args.seg_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        task_list = load_scp_as_list(args.rttm_scp)
        return task_list, None, args

    def post(self, results_list, args):
        """Write every worker's output lines to ``args.seg_file``.

        Args:
            results_list: Iterable of per-worker line collections.
            args: Parsed arguments returned by ``prepare``.
        """
        with open(args.seg_file, "wt", encoding="utf-8") as fd:
            for results in results_list:
                fd.writelines(results)


def process(task_args):
    """Merge the speaker turns of each RTTM file into speech segments.

    Time is quantised to 10 ms frames. Overlapping or touching turns of any
    speaker are merged into one segment.

    Args:
        task_args: 4-tuple whose second item is an iterable of
            ``(mid, rttm_path)`` pairs; the other items are ignored here.

    Returns:
        List of lines ``"<mid>-<st>-<ed> <mid> <st_sec> <ed_sec>\\n"`` where
        ``st``/``ed`` are zero-padded frame indices.
    """
    _, task_list, _, _ = task_args
    outputs = []
    for mid, rttm_path in task_list:
        turns = []
        length = 0
        # Context manager so the file handle is released deterministically.
        with open(rttm_path, "rt", encoding="utf-8") as rttm_file:
            for one_line in rttm_file:
                # split() tolerates tabs and repeated spaces between fields.
                parts = one_line.split()
                if not parts:
                    continue  # skip blank lines
                st, dur = float(parts[3]), float(parts[4])
                # round() instead of truncation: 0.29 * 100 is
                # 28.999999999999996 and would otherwise become frame 28.
                st_frame = int(round(st * 100))
                ed_frame = int(round((st + dur) * 100))
                length = max(length, ed_frame)
                turns.append((st_frame, ed_frame))

        # One extra trailing frame stays False so the last segment is closed.
        is_sph = np.zeros((length + 1,), dtype=bool)
        for st_frame, ed_frame in turns:
            is_sph[st_frame:ed_frame] = True

        seg_start, in_speech = 0, False
        for i in range(length + 1):
            if not in_speech and is_sph[i]:
                seg_start, in_speech = i, True
            if in_speech and not is_sph[i]:
                in_speech = False
                outputs.append("{}-{:07d}-{:07d} {} {:.2f} {:.2f}\n".format(
                    mid, seg_start, i, mid, float(seg_start) / 100, float(i) / 100
                ))
    return outputs


if __name__ == '__main__':
    # Wrap the per-job worker in the runner and launch it.
    MyRunner(process).run()

 funasr/bin/diar_train.py

New file
@@ -0,0 +1,46 @@
#!/usr/bin/env python3

import os

from funasr.tasks.diar import DiarTask


# for diarization training
def parse_args():
    """Parse the command line for diarization training.

    Starts from the parser returned by ``DiarTask.get_parser()`` and adds the
    integer option ``--gpu_id`` (default 0).

    Returns:
        The parsed argument namespace.
    """
    arg_parser = DiarTask.get_parser()
    arg_parser.add_argument("--gpu_id", type=int, default=0, help="local gpu id.")
    return arg_parser.parse_args()


def main(args=None, cmd=None):
    """Run training by delegating to ``DiarTask.main``.

    Args:
        args: Forwarded unchanged as the ``args`` keyword.
        cmd: Forwarded unchanged as the ``cmd`` keyword.
    """
    DiarTask.main(args=args, cmd=cmd)


if __name__ == '__main__':
    args = parse_args()

    # Expose only the selected local GPU to this process.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)

    # DDP settings: distributed mode is enabled only with more than one GPU.
    args.distributed = args.ngpu > 1
    assert args.num_worker_count == 1

    # For the "small" dataset type, scale whichever batch settings are set
    # by the number of GPUs.
    if args.dataset_type == "small":
        for field in ("batch_size", "batch_bins"):
            current = getattr(args, field)
            if current is not None:
                setattr(args, field, current * args.ngpu)

    main(args=args)

 funasr/models/e2e_diar_sond.py

@@ -86,6 +86,8 @@
        )
        self.criterion_bce = SequenceBinaryCrossEntropy(normalize_length=length_normalized_loss)
        self.pse_embedding = self.generate_pse_embedding()
        self.power_weight = torch.from_numpy(2 ** np.arange(max_spk_num)[np.newaxis, np.newaxis, :])
        self.int_token_arr = torch.from_numpy(np.array(self.token_list).astype(int)[np.newaxis, np.newaxis, :])
        self.speaker_discrimination_loss_weight = speaker_discrimination_loss_weight
        self.inter_score_loss_weight = inter_score_loss_weight

@@ -102,8 +104,8 @@
        speech_lengths: torch.Tensor = None,
        profile: torch.Tensor = None,
        profile_lengths: torch.Tensor = None,
        spk_labels: torch.Tensor = None,
        spk_labels_lengths: torch.Tensor = None,
        binary_labels: torch.Tensor = None,
        binary_labels_lengths: torch.Tensor = None,
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
        """Frontend + Encoder + Speaker Encoder + CI Scorer + CD Scorer + Decoder + Calc loss

@@ -116,10 +118,10 @@
                                     espnet2/iterators/chunk_iter_factory.py
            profile: (Batch, N_spk, dim)
            profile_lengths: (Batch,)
            spk_labels: (Batch, frames, input_size)
            spk_labels_lengths: (Batch,)
            binary_labels: (Batch, frames, max_spk_num)
            binary_labels_lengths: (Batch,)
        """
        assert speech.shape[0] == spk_labels.shape[0], (speech.shape, spk_labels.shape)
        assert speech.shape[0] == binary_labels.shape[0], (speech.shape, binary_labels.shape)
        batch_size = speech.shape[0]

        # 1. Network forward
@@ -132,23 +134,25 @@

        # 2. Aggregate time-domain labels to match forward outputs
        if self.label_aggregator is not None:
            spk_labels, spk_labels_lengths = self.label_aggregator(
                spk_labels.unsqueeze(2), spk_labels_lengths
            binary_labels, binary_labels_lengths = self.label_aggregator(
                binary_labels, binary_labels_lengths
            )
            spk_labels = spk_labels.squeeze(2)
        # 2. Calculate power-set encoding (PSE) labels
        raw_pse_labels = torch.sum(binary_labels * self.power_weight, dim=2, keepdim=True)
        pse_labels = torch.argmax(raw_pse_labels == self.int_token_arr, dim=2)

        # If encoder uses conv* as input_layer (i.e., subsampling),
        # the sequence length of 'pred' might be slightly less than the
        # length of 'spk_labels'. Here we force them to be equal.
        length_diff_tolerance = 2
        length_diff = spk_labels.shape[1] - pred.shape[1]
        length_diff = pse_labels.shape[1] - pred.shape[1]
        if 0 < length_diff <= length_diff_tolerance:
            spk_labels = spk_labels[:, 0: pred.shape[1], :]
            pse_labels = pse_labels[:, 0: pred.shape[1]]

        loss_diar = self.classification_loss(pred, spk_labels, spk_labels_lengths)
        loss_diar = self.classification_loss(pred, pse_labels, binary_labels_lengths)
        loss_spk_dis = self.speaker_discrimination_loss(profile, profile_lengths)
        loss_inter_ci, loss_inter_cd = self.internal_score_loss(cd_score, ci_score, spk_labels, spk_labels_lengths)
        label_mask = make_pad_mask(spk_labels_lengths, maxlen=spk_labels.shape[1])
        loss_inter_ci, loss_inter_cd = self.internal_score_loss(cd_score, ci_score, pse_labels, binary_labels_lengths)
        label_mask = make_pad_mask(binary_labels_lengths, maxlen=pse_labels.shape[1])
        loss = (loss_diar + self.speaker_discrimination_loss_weight * loss_spk_dis
                + self.inter_score_loss_weight * (loss_inter_ci + loss_inter_cd))

@@ -164,8 +168,8 @@
            speaker_error,
        ) = self.calc_diarization_error(
            pred=F.embedding(pred.argmax(dim=2) * label_mask, self.pse_embedding),
            label=F.embedding(spk_labels * label_mask, self.pse_embedding),
            length=spk_labels_lengths
            label=F.embedding(pse_labels * label_mask, self.pse_embedding),
            length=binary_labels_lengths
        )

        if speech_scored > 0 and num_frames > 0:

 funasr/models/encoder/ecapa_tdnn_encoder.py

New file
@@ -0,0 +1,689 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class _BatchNorm1d(nn.Module):
    def __init__(
        self,
        input_shape=None,
        input_size=None,
        eps=1e-05,
        momentum=0.1,
        affine=True,
        track_running_stats=True,
        combine_batch_time=False,
        skip_transpose=False,
    ):
        super().__init__()
        self.combine_batch_time = combine_batch_time
        self.skip_transpose = skip_transpose

        if input_size is None and skip_transpose:
            input_size = input_shape[1]
        elif input_size is None:
            input_size = input_shape[-1]

        self.norm = nn.BatchNorm1d(
            input_size,
            eps=eps,
            momentum=momentum,
            affine=affine,
            track_running_stats=track_running_stats,
        )

    def forward(self, x):
        shape_or = x.shape
        if self.combine_batch_time:
            if x.ndim == 3:
                x = x.reshape(shape_or[0] * shape_or[1], shape_or[2])
            else:
                x = x.reshape(
                    shape_or[0] * shape_or[1], shape_or[3], shape_or[2]
                )

        elif not self.skip_transpose:
            x = x.transpose(-1, 1)

        x_n = self.norm(x)

        if self.combine_batch_time:
            x_n = x_n.reshape(shape_or)
        elif not self.skip_transpose:
            x_n = x_n.transpose(1, -1)

        return x_n


class _Conv1d(nn.Module):
    def __init__(
        self,
        out_channels,
        kernel_size,
        input_shape=None,
        in_channels=None,
        stride=1,
        dilation=1,
        padding="same",
        groups=1,
        bias=True,
        padding_mode="reflect",
        skip_transpose=False,
    ):
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.dilation = dilation
        self.padding = padding
        self.padding_mode = padding_mode
        self.unsqueeze = False
        self.skip_transpose = skip_transpose

        if input_shape is None and in_channels is None:
            raise ValueError("Must provide one of input_shape or in_channels")

        if in_channels is None:
            in_channels = self._check_input_shape(input_shape)

        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            self.kernel_size,
            stride=self.stride,
            dilation=self.dilation,
            padding=0,
            groups=groups,
            bias=bias,
        )

    def forward(self, x):
        if not self.skip_transpose:
            x = x.transpose(1, -1)

        if self.unsqueeze:
            x = x.unsqueeze(1)

        if self.padding == "same":
            x = self._manage_padding(
                x, self.kernel_size, self.dilation, self.stride
            )

        elif self.padding == "causal":
            num_pad = (self.kernel_size - 1) * self.dilation
            x = F.pad(x, (num_pad, 0))

        elif self.padding == "valid":
            pass

        else:
            raise ValueError(
                "Padding must be 'same', 'valid' or 'causal'. Got "
                + self.padding
            )

        wx = self.conv(x)

        if self.unsqueeze:
            wx = wx.squeeze(1)

        if not self.skip_transpose:
            wx = wx.transpose(1, -1)

        return wx

    def _manage_padding(
        self, x, kernel_size: int, dilation: int, stride: int,
    ):
        # Detecting input shape
        L_in = x.shape[-1]

        # Time padding
        padding = get_padding_elem(L_in, stride, kernel_size, dilation)

        # Applying padding
        x = F.pad(x, padding, mode=self.padding_mode)

        return x

    def _check_input_shape(self, shape):
        """Checks the input shape and returns the number of input channels.
        """

        if len(shape) == 2:
            self.unsqueeze = True
            in_channels = 1
        elif self.skip_transpose:
            in_channels = shape[1]
        elif len(shape) == 3:
            in_channels = shape[2]
        else:
            raise ValueError(
                "conv1d expects 2d, 3d inputs. Got " + str(len(shape))
            )

        # Kernel size must be odd
        if self.kernel_size % 2 == 0:
            raise ValueError(
                "The field kernel size must be an odd number. Got %s."
                % (self.kernel_size)
            )
        return in_channels


def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
    """Return the ``[left, right]`` padding used for "same" convolutions.

    Args:
        L_in: Length of the axis being padded.
        stride: Convolution stride.
        kernel_size: Convolution kernel size.
        dilation: Convolution dilation.

    Returns:
        A two-element list with equal left and right padding. For
        ``stride > 1`` this is ``kernel_size // 2`` on each side (dilation is
        not taken into account); otherwise it is half the difference between
        ``L_in`` and the unpadded output length.
    """
    if stride > 1:
        # The previous n_steps / L_out computations in this branch were never
        # used, so they have been dropped.
        half_kernel = kernel_size // 2
        return [half_kernel, half_kernel]

    L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
    side = (L_in - L_out) // 2
    return [side, side]


# Skip transpose as much as possible for efficiency
class Conv1d(_Conv1d):
    """``_Conv1d`` that is always constructed with ``skip_transpose=True``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, skip_transpose=True, **kwargs)


class BatchNorm1d(_BatchNorm1d):
    """``_BatchNorm1d`` that is always constructed with ``skip_transpose=True``."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, skip_transpose=True, **kwargs)


def length_to_mask(length, max_len=None, dtype=None, device=None):
    """Build a ``(len(length), max_len)`` mask from per-row lengths.

    Entry ``[b, t]`` is non-zero exactly when ``t < length[b]``.

    Args:
        length: 1-D tensor of lengths.
        max_len: Mask width; defaults to the largest value in ``length``.
        dtype: Output dtype; defaults to ``length.dtype``.
        device: Output device; defaults to ``length.device``.

    Returns:
        The mask tensor.
    """
    assert len(length.shape) == 1

    if max_len is None:
        max_len = length.max().long().item()

    # Compare every position index against each row's length.
    positions = torch.arange(max_len, device=length.device, dtype=length.dtype)
    mask = positions.expand(len(length), max_len) < length.unsqueeze(1)

    out_dtype = length.dtype if dtype is None else dtype
    out_device = length.device if device is None else device
    return torch.as_tensor(mask, dtype=out_dtype, device=out_device)


class TDNNBlock(nn.Module):
    """Convolution followed by an activation and batch normalisation.

    Args:
        in_channels: Number of input channels of the convolution.
        out_channels: Number of output channels of the convolution.
        kernel_size: Kernel size of the convolution.
        dilation: Dilation of the convolution.
        activation: Class instantiated (without arguments) as the activation.
        groups: Number of groups of the convolution.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        dilation,
        activation=nn.ReLU,
        groups=1,
    ):
        super().__init__()
        self.conv = Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            dilation=dilation,
            groups=groups,
        )
        self.activation = activation()
        self.norm = BatchNorm1d(input_size=out_channels)

    def forward(self, x):
        """Apply conv, then activation, then normalisation."""
        convolved = self.conv(x)
        activated = self.activation(convolved)
        return self.norm(activated)


class Res2NetBlock(torch.nn.Module):
    """Res2Net block with dilation.

    The input is split into ``scale`` chunks along the channel axis. The
    first chunk is passed through unchanged; every later chunk goes through
    its own ``TDNNBlock``, and from the third chunk on the previous chunk's
    output is added to the input first.

    Arguments
    ---------
    in_channels : int
        The number of channels expected in the input.
    out_channels : int
        The number of output channels.
    scale : int
        The scale of the Res2Net block.
    kernel_size: int
        The kernel size of the Res2Net block.
    dilation : int
        The dilation of the Res2Net block.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
    """

    def __init__(
        self, in_channels, out_channels, scale=8, kernel_size=3, dilation=1
    ):
        super().__init__()
        assert in_channels % scale == 0
        assert out_channels % scale == 0

        chunk_in = in_channels // scale
        chunk_out = out_channels // scale

        # One TDNN block per chunk except the first, which is an identity.
        self.blocks = nn.ModuleList()
        for _ in range(scale - 1):
            self.blocks.append(
                TDNNBlock(
                    chunk_in,
                    chunk_out,
                    kernel_size=kernel_size,
                    dilation=dilation,
                )
            )
        self.scale = scale

    def forward(self, x):
        """Process ``x`` chunk by chunk and concatenate along channels."""
        pieces = []
        previous = None
        for idx, chunk in enumerate(torch.chunk(x, self.scale, dim=1)):
            if idx == 0:
                previous = chunk
            elif idx == 1:
                previous = self.blocks[0](chunk)
            else:
                previous = self.blocks[idx - 1](chunk + previous)
            pieces.append(previous)
        return torch.cat(pieces, dim=1)


class SEBlock(nn.Module):
    """Squeeze-and-excitation block.

    The input is averaged over the last axis, passed through two
    kernel-size-1 convolutions (ReLU, then sigmoid) and the result rescales
    the input channel-wise.

    Arguments
    ---------
    in_channels : int
        The number of input channels.
    se_channels : int
        The number of output channels after squeeze.
    out_channels : int
        The number of output channels.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> se_layer = SEBlock(64, 16, 64)
    >>> lengths = torch.rand((8,))
    >>> out_tensor = se_layer(inp_tensor, lengths).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 120, 64])
    """

    def __init__(self, in_channels, se_channels, out_channels):
        super().__init__()

        self.conv1 = Conv1d(
            in_channels=in_channels, out_channels=se_channels, kernel_size=1
        )
        self.relu = torch.nn.ReLU(inplace=True)
        self.conv2 = Conv1d(
            in_channels=se_channels, out_channels=out_channels, kernel_size=1
        )
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x, lengths=None):
        """Rescale ``x`` by its squeeze-and-excitation gate.

        Args:
            x: Input tensor; the mean is taken over axis 2.
            lengths: Optional per-item lengths. They are multiplied by the
                size of the last axis before the mask is built, so they act
                as a fraction of that size.
        """
        n_frames = x.shape[-1]
        if lengths is None:
            squeezed = x.mean(dim=2, keepdim=True)
        else:
            # Masked mean so padded frames do not contribute.
            mask = length_to_mask(
                lengths * n_frames, max_len=n_frames, device=x.device
            ).unsqueeze(1)
            valid = mask.sum(dim=2, keepdim=True)
            squeezed = (x * mask).sum(dim=2, keepdim=True) / valid

        hidden = self.relu(self.conv1(squeezed))
        gate = self.sigmoid(self.conv2(hidden))
        return gate * x


class AttentiveStatisticsPooling(nn.Module):
    """Attentive statistics pooling over the last axis.

    Returns the attention-weighted mean and standard deviation of the input,
    concatenated along the channel axis.

    Arguments
    ---------
    channels: int
        The number of input channels.
    attention_channels: int
        The number of attention channels.
    global_context: bool
        If true, the attention input is the features concatenated with their
        masked mean and standard deviation repeated over every frame.

    Example
    -------
    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
    >>> asp_layer = AttentiveStatisticsPooling(64)
    >>> lengths = torch.rand((8,))
    >>> out_tensor = asp_layer(inp_tensor, lengths).transpose(1, 2)
    >>> out_tensor.shape
    torch.Size([8, 1, 128])
    """

    def __init__(self, channels, attention_channels=128, global_context=True):
        super().__init__()

        self.eps = 1e-12
        self.global_context = global_context
        # With global context the attention sees x, mean and std stacked.
        tdnn_in = channels * 3 if global_context else channels
        self.tdnn = TDNNBlock(tdnn_in, attention_channels, 1, 1)
        self.tanh = nn.Tanh()
        self.conv = Conv1d(
            in_channels=attention_channels, out_channels=channels, kernel_size=1
        )

    def _weighted_stats(self, x, weights, dim=2):
        """Return the weighted mean and std of ``x`` along ``dim``."""
        mean = (weights * x).sum(dim)
        squared_dev = (x - mean.unsqueeze(dim)).pow(2)
        # Clamp before the square root to keep it away from zero.
        variance = (weights * squared_dev).sum(dim).clamp(self.eps)
        return mean, torch.sqrt(variance)

    def forward(self, x, lengths=None):
        """Calculates mean and std for a batch (input tensor).

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape [N, C, L].
        lengths : torch.Tensor
            Optional per-item lengths; multiplied by L before the mask is
            built. Defaults to all ones.
        """
        n_frames = x.shape[-1]

        if lengths is None:
            lengths = torch.ones(x.shape[0], device=x.device)

        # Mask of shape [N, 1, L]
        mask = length_to_mask(
            lengths * n_frames, max_len=n_frames, device=x.device
        ).unsqueeze(1)

        if self.global_context:
            # torch.std is unstable for backward computation
            # https://github.com/pytorch/pytorch/issues/4320
            valid = mask.sum(dim=2, keepdim=True).float()
            ctx_mean, ctx_std = self._weighted_stats(x, mask / valid)
            attn_input = torch.cat(
                [
                    x,
                    ctx_mean.unsqueeze(2).repeat(1, 1, n_frames),
                    ctx_std.unsqueeze(2).repeat(1, 1, n_frames),
                ],
                dim=1,
            )
        else:
            attn_input = x

        scores = self.conv(self.tanh(self.tdnn(attn_input)))

        # Masked-out frames get zero weight after the softmax.
        scores = scores.masked_fill(mask == 0, float("-inf"))
        weights = F.softmax(scores, dim=2)

        mean, std = self._weighted_stats(x, weights)
        return torch.cat((mean, std), dim=1).unsqueeze(2)


class SERes2NetBlock(nn.Module):
    """Building block of ECAPA-TDNN: TDNN, Res2Net, TDNN, SE block, plus a
    residual connection.

    Arguments
    ----------
    in_channels: int
        The number of input channels.
    out_channels: int
        The number of output channels.
    res2net_scale: int
        The scale of the Res2Net block.
    se_channels: int
        The number of channels inside the SE block.
    kernel_size: int
        The kernel size of the Res2Net block (both TDNN blocks use 1).
    dilation: int
        The dilation of the Res2Net block.
    activation : torch class
        A class for constructing the activation layers.
    groups: int
        Number of groups used by the two TDNN blocks.

    Example
    -------
    >>> x = torch.rand(8, 120, 64).transpose(1, 2)
    >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
    >>> out = conv(x).transpose(1, 2)
    >>> out.shape
    torch.Size([8, 120, 64])
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        res2net_scale=8,
        se_channels=128,
        kernel_size=1,
        dilation=1,
        activation=torch.nn.ReLU,
        groups=1,
    ):
        super().__init__()
        self.out_channels = out_channels

        pointwise = dict(
            kernel_size=1, dilation=1, activation=activation, groups=groups
        )
        self.tdnn1 = TDNNBlock(in_channels, out_channels, **pointwise)
        self.res2net_block = Res2NetBlock(
            out_channels, out_channels, res2net_scale, kernel_size, dilation
        )
        self.tdnn2 = TDNNBlock(out_channels, out_channels, **pointwise)
        self.se_block = SEBlock(out_channels, se_channels, out_channels)

        # A projection is needed only when the channel count changes.
        if in_channels == out_channels:
            self.shortcut = None
        else:
            self.shortcut = Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
            )

    def forward(self, x, lengths=None):
        """Run the block; ``lengths`` is forwarded to the SE block only."""
        skip = x if self.shortcut is None else self.shortcut(x)

        out = self.tdnn1(x)
        out = self.res2net_block(out)
        out = self.tdnn2(out)
        out = self.se_block(out, lengths)

        return out + skip


class ECAPA_TDNN(torch.nn.Module):
    """An implementation of the speaker embedding model in a paper.
    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).

    Arguments
    ---------
    input_size : int
        Number of input feature channels.
    device : str
        Accepted for interface compatibility; not used by this class.
    lin_neurons : int
        Number of output channels of the final projection.
    activation : torch class
        A class for constructing the activation layers.
    channels : list of ints
        Output channels for TDNN/SERes2Net layer.
    kernel_sizes : list of ints
        List of kernel sizes for each layer.
    dilations : list of ints
        List of dilations for kernels in each layer.
    attention_channels : int
        Number of attention channels of the pooling layer.
    res2net_scale : int
        Scale of the Res2Net blocks.
    se_channels : int
        Number of channels inside the SE blocks.
    global_context : bool
        Whether the pooling attention uses global mean/std context.
    groups : list of ints
        List of groups for kernels in each layer.
    window_size : int or None
        Width of the sliding pooling window. ``None`` pools over the whole
        sequence and yields one embedding per item.
    window_shift : int
        Hop between consecutive pooling windows.

    Example
    -------
    >>> input_feats = torch.rand([5, 120, 80])
    >>> compute_embedding = ECAPA_TDNN(80, lin_neurons=192)
    >>> outputs = compute_embedding(input_feats)
    >>> outputs.shape
    torch.Size([5, 120, 192])
    """

    def __init__(
        self,
        input_size,
        device="cpu",
        lin_neurons=192,
        activation=torch.nn.ReLU,
        channels=[512, 512, 512, 512, 1536],
        kernel_sizes=[5, 3, 3, 3, 1],
        dilations=[1, 2, 3, 4, 1],
        attention_channels=128,
        res2net_scale=8,
        se_channels=128,
        global_context=True,
        groups=[1, 1, 1, 1, 1],
        window_size=20,
        window_shift=1,
    ):

        super().__init__()
        assert len(channels) == len(kernel_sizes)
        assert len(channels) == len(dilations)
        self.channels = channels
        self.blocks = nn.ModuleList()
        self.window_size = window_size
        self.window_shift = window_shift

        # The initial TDNN layer
        self.blocks.append(
            TDNNBlock(
                input_size,
                channels[0],
                kernel_sizes[0],
                dilations[0],
                activation,
                groups[0],
            )
        )

        # SE-Res2Net layers
        for i in range(1, len(channels) - 1):
            self.blocks.append(
                SERes2NetBlock(
                    channels[i - 1],
                    channels[i],
                    res2net_scale=res2net_scale,
                    se_channels=se_channels,
                    kernel_size=kernel_sizes[i],
                    dilation=dilations[i],
                    activation=activation,
                    groups=groups[i],
                )
            )

        # Multi-layer feature aggregation
        self.mfa = TDNNBlock(
            channels[-1],
            channels[-1],
            kernel_sizes[-1],
            dilations[-1],
            activation,
            groups=groups[-1],
        )

        # Attentive Statistical Pooling
        self.asp = AttentiveStatisticsPooling(
            channels[-1],
            attention_channels=attention_channels,
            global_context=global_context,
        )
        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)

        # Final linear transformation
        self.fc = Conv1d(
            in_channels=channels[-1] * 2,
            out_channels=lin_neurons,
            kernel_size=1,
        )

    def windowed_pooling(self, x, lengths=None):
        """Pool ``x`` with a sliding window and project every window.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, channel, time).
        lengths : torch.Tensor
            Optional tensor of shape (batch, ).

        Returns
        -------
        torch.Tensor
            One projected embedding per window, concatenated on the last axis.
        """
        # x: Batch, Channel, Time
        tt = x.shape[2]
        num_chunk = int(math.ceil(tt / self.window_shift))
        pad = self.window_size // 2
        # A 2-element pad on a 3-D tensor reflects the time axis only; it is
        # equivalent to the former (pad, pad, 0, 0) but does not depend on
        # 2-D reflection padding accepting an unbatched input.
        padded = F.pad(x, (pad, pad), "reflect")
        stat_list = []

        for i in range(num_chunk):
            st = i * self.window_shift
            ed = st + self.window_size

            win_lengths = None
            if lengths is not None:
                # NOTE(review): `asp` multiplies the lengths it receives by
                # the window width, while this clamp looks like an absolute
                # frame count that ignores window_shift and the left padding
                # - confirm the intended convention with the callers.
                win_lengths = torch.clamp(lengths - i, 0, self.window_size)

            # Keep the per-window result in its own variable: reusing `x`
            # here made every window after the first slice the previous
            # window's output instead of the padded features.
            stat = self.asp(padded[:, :, st:ed], lengths=win_lengths)
            stat = self.asp_bn(stat)
            stat = self.fc(stat)
            stat_list.append(stat)

        return torch.cat(stat_list, dim=2)

    def forward(self, x, lengths=None):
        """Returns the embedding vector.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of shape (batch, time, channel).
        lengths: torch.Tensor
            Tensor of shape (batch, )

        Returns
        -------
        torch.Tensor
            (batch, lin_neurons) when ``window_size`` is None, otherwise
            (batch, windows, lin_neurons).
        """
        # Minimize transpose for efficiency
        x = x.transpose(1, 2)

        xl = []
        for layer in self.blocks:
            # Only the SE-Res2Net blocks take lengths. An explicit type check
            # replaces the former try/except TypeError, which also swallowed
            # genuine TypeErrors raised inside a layer.
            if isinstance(layer, SERes2NetBlock):
                x = layer(x, lengths=lengths)
            else:
                x = layer(x)
            xl.append(x)

        # Multi-layer feature aggregation: all outputs except the first layer
        x = torch.cat(xl[1:], dim=1)
        x = self.mfa(x)

        if self.window_size is None:
            # Attentive Statistical Pooling
            x = self.asp(x, lengths=lengths)
            x = self.asp_bn(x)
            # Final linear transformation
            x = self.fc(x)
            x = x.squeeze(2)  # -> B, C
        else:
            x = self.windowed_pooling(x, lengths)
            x = x.transpose(1, 2)  # -> B, T, C
        return x

 funasr/tasks/diar.py

@@ -368,7 +368,7 @@
            cls, train: bool = True, inference: bool = False
    ) -> Tuple[str, ...]:
        if not inference:
            retval = ("speech", "profile", "label")
            retval = ("speech", "profile", "binary_labels")
        else:
            # Recognition mode
            retval = ("speech", "profile")

New file
			@@ -0,0 +1,57 @@
			import numpy as np
			from funasr.utils.job_runner import MultiProcessRunnerV3
			from funasr.utils.misc import load_scp_as_list, load_scp_as_dict
			import os
			import argparse


			class MyRunner(MultiProcessRunnerV3):
			    """Runner that turns the RTTM files listed in an scp into one segment file."""

			    def prepare(self, parser):
			        """Register the CLI options, create the output directory and load tasks.

			        Returns the tuple ``(task_list, None, args)`` where ``task_list`` is
			        whatever ``load_scp_as_list`` yields for ``--rttm_scp``.
			        """
			        parser.add_argument("--rttm_scp", type=str)
			        parser.add_argument("--seg_file", type=str)
			        args = parser.parse_args()

			        out_dir = os.path.dirname(args.seg_file)
			        # dirname is "" for a bare file name and os.makedirs("") raises,
			        # so only create the directory when there is one to create.
			        if out_dir:
			            os.makedirs(out_dir, exist_ok=True)

			        task_list = load_scp_as_list(args.rttm_scp)
			        return task_list, None, args

			    def post(self, results_list, args):
			        """Write every collected list of lines to ``args.seg_file``."""
			        with open(args.seg_file, "wt", encoding="utf-8") as fd:
			            for results in results_list:
			                fd.writelines(results)


			def process(task_args):
			    """Convert RTTM files into merged speech-segment lines.

			    ``task_args`` is a 4-tuple whose second item is a list of
			    ``(mid, rttm_path)`` pairs; the other items are not used here.
			    For each RTTM file, fields 3 and 4 of every line (start and duration
			    in seconds) are turned into 10 ms frame indices, all turns are merged
			    into one speech/non-speech mask, and one line is emitted per
			    contiguous speech region.

			    Returns a list of strings formatted as
			    ``"{mid}-{start:07d}-{end:07d} {mid} {start_sec:.2f} {end_sec:.2f}\\n"``.
			    """
			    _, task_list, _, _ = task_args
			    outputs = []
			    for mid, rttm_path in task_list:
			        spk_turns = []
			        length = 0
			        # Use a context manager so the file handle is always released.
			        with open(rttm_path, "rt", encoding="utf-8") as rttm_file:
			            for one_line in rttm_file:
			                # split() tolerates repeated spaces/tabs; skip blank lines.
			                parts = one_line.split()
			                if not parts:
			                    continue
			                st, dur, spk_name = float(parts[3]), float(parts[4]), parts[7]
			                st, ed = int(st * 100), int((st + dur) * 100)
			                length = max(length, ed)
			                spk_turns.append([mid, st, ed, spk_name])

			        # One extra trailing non-speech frame guarantees the last region closes.
			        is_sph = np.zeros((length + 1,), dtype=bool)
			        for _, st, ed, _ in spk_turns:
			            is_sph[st:ed] = True

			        st, in_speech = 0, False
			        for i in range(length + 1):
			            if not in_speech and is_sph[i]:
			                st, in_speech = i, True
			            if in_speech and not is_sph[i]:
			                in_speech = False
			                outputs.append("{}-{:07d}-{:07d} {} {:.2f} {:.2f}\n".format(
			                    mid, st, i, mid, float(st) / 100, float(i) / 100
			                ))
			    return outputs


			if __name__ == '__main__':
			    # Hand ``process`` to the runner and start it.
			    my_runner = MyRunner(process)
			    my_runner.run()

New file
			@@ -0,0 +1,46 @@
			#!/usr/bin/env python3

			import os

			from funasr.tasks.diar import DiarTask


			# for diarization training
			def parse_args():
			    """Build the ``DiarTask`` parser, add ``--gpu_id`` and parse the command line."""
			    parser = DiarTask.get_parser()
			    parser.add_argument(
			        "--gpu_id",
			        type=int,
			        default=0,
			        help="local gpu id.",
			    )
			    args = parser.parse_args()
			    return args


			def main(args=None, cmd=None):
			    """Forward ``args`` and ``cmd`` unchanged to ``DiarTask.main``."""
			    DiarTask.main(args=args, cmd=cmd)


			if __name__ == '__main__':
			    args = parse_args()

			    # setup local gpu_id
			    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)

			    # DDP settings: distributed mode is enabled only with more than one GPU
			    args.distributed = args.ngpu > 1
			    assert args.num_worker_count == 1

			    # re-compute batch size: when dataset type is small,
			    # scale the configured batch size / bins by the number of GPUs
			    if args.dataset_type == "small":
			        if args.batch_size is not None:
			            args.batch_size = args.batch_size * args.ngpu
			        if args.batch_bins is not None:
			            args.batch_bins = args.batch_bins * args.ngpu

			    main(args=args)

			@@ -86,6 +86,8 @@
			)
			self.criterion_bce = SequenceBinaryCrossEntropy(normalize_length=length_normalized_loss)
			self.pse_embedding = self.generate_pse_embedding()
			self.power_weight = torch.from_numpy(2 ** np.arange(max_spk_num)[np.newaxis, np.newaxis, :])
			self.int_token_arr = torch.from_numpy(np.array(self.token_list).astype(int)[np.newaxis, np.newaxis, :])
			self.speaker_discrimination_loss_weight = speaker_discrimination_loss_weight
			self.inter_score_loss_weight = inter_score_loss_weight

			@@ -102,8 +104,8 @@
			speech_lengths: torch.Tensor = None,
			profile: torch.Tensor = None,
			profile_lengths: torch.Tensor = None,
			spk_labels: torch.Tensor = None,
			spk_labels_lengths: torch.Tensor = None,
			binary_labels: torch.Tensor = None,
			binary_labels_lengths: torch.Tensor = None,
			) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
			"""Frontend + Encoder + Speaker Encoder + CI Scorer + CD Scorer + Decoder + Calc loss

			@@ -116,10 +118,10 @@
			espnet2/iterators/chunk_iter_factory.py
			profile: (Batch, N_spk, dim)
			profile_lengths: (Batch,)
			spk_labels: (Batch, frames, input_size)
			spk_labels_lengths: (Batch,)
			binary_labels: (Batch, frames, max_spk_num)
			binary_labels_lengths: (Batch,)
			"""
			assert speech.shape[0] == spk_labels.shape[0], (speech.shape, spk_labels.shape)
			assert speech.shape[0] == binary_labels.shape[0], (speech.shape, binary_labels.shape)
			batch_size = speech.shape[0]

			# 1. Network forward
			@@ -132,23 +134,25 @@

			# 2. Aggregate time-domain labels to match forward outputs
			if self.label_aggregator is not None:
			spk_labels, spk_labels_lengths = self.label_aggregator(
			spk_labels.unsqueeze(2), spk_labels_lengths
			binary_labels, binary_labels_lengths = self.label_aggregator(
			binary_labels, binary_labels_lengths
			)
			spk_labels = spk_labels.squeeze(2)
			# 2. Calculate power-set encoding (PSE) labels
			raw_pse_labels = torch.sum(binary_labels * self.power_weight, dim=2, keepdim=True)
			pse_labels = torch.argmax(raw_pse_labels == self.int_token_arr, dim=2)

			# If encoder uses conv* as input_layer (i.e., subsampling),
			# the sequence length of 'pred' might be slightly less than the
			# length of 'spk_labels'. Here we force them to be equal.
			length_diff_tolerance = 2
			length_diff = spk_labels.shape[1] - pred.shape[1]
			length_diff = pse_labels.shape[1] - pred.shape[1]
			if 0 < length_diff <= length_diff_tolerance:
			spk_labels = spk_labels[:, 0: pred.shape[1], :]
			pse_labels = pse_labels[:, 0: pred.shape[1]]

			loss_diar = self.classification_loss(pred, spk_labels, spk_labels_lengths)
			loss_diar = self.classification_loss(pred, pse_labels, binary_labels_lengths)
			loss_spk_dis = self.speaker_discrimination_loss(profile, profile_lengths)
			loss_inter_ci, loss_inter_cd = self.internal_score_loss(cd_score, ci_score, spk_labels, spk_labels_lengths)
			label_mask = make_pad_mask(spk_labels_lengths, maxlen=spk_labels.shape[1])
			loss_inter_ci, loss_inter_cd = self.internal_score_loss(cd_score, ci_score, pse_labels, binary_labels_lengths)
			label_mask = make_pad_mask(binary_labels_lengths, maxlen=pse_labels.shape[1])
			loss = (loss_diar + self.speaker_discrimination_loss_weight * loss_spk_dis
			+ self.inter_score_loss_weight * (loss_inter_ci + loss_inter_cd))

			@@ -164,8 +168,8 @@
			speaker_error,
			) = self.calc_diarization_error(
			pred=F.embedding(pred.argmax(dim=2) * label_mask, self.pse_embedding),
			label=F.embedding(spk_labels * label_mask, self.pse_embedding),
			length=spk_labels_lengths
			label=F.embedding(pse_labels * label_mask, self.pse_embedding),
			length=binary_labels_lengths
			)

			if speech_scored > 0 and num_frames > 0:

New file
			@@ -0,0 +1,689 @@
			import math
			import torch
			import torch.nn as nn
			import torch.nn.functional as F


			class _BatchNorm1d(nn.Module):
			def __init__(
			self,
			input_shape=None,
			input_size=None,
			eps=1e-05,
			momentum=0.1,
			affine=True,
			track_running_stats=True,
			combine_batch_time=False,
			skip_transpose=False,
			):
			super().__init__()
			self.combine_batch_time = combine_batch_time
			self.skip_transpose = skip_transpose

			if input_size is None and skip_transpose:
			input_size = input_shape[1]
			elif input_size is None:
			input_size = input_shape[-1]

			self.norm = nn.BatchNorm1d(
			input_size,
			eps=eps,
			momentum=momentum,
			affine=affine,
			track_running_stats=track_running_stats,
			)

			def forward(self, x):
			shape_or = x.shape
			if self.combine_batch_time:
			if x.ndim == 3:
			x = x.reshape(shape_or[0] * shape_or[1], shape_or[2])
			else:
			x = x.reshape(
			shape_or[0] * shape_or[1], shape_or[3], shape_or[2]
			)

			elif not self.skip_transpose:
			x = x.transpose(-1, 1)

			x_n = self.norm(x)

			if self.combine_batch_time:
			x_n = x_n.reshape(shape_or)
			elif not self.skip_transpose:
			x_n = x_n.transpose(1, -1)

			return x_n


			class _Conv1d(nn.Module):
			def __init__(
			self,
			out_channels,
			kernel_size,
			input_shape=None,
			in_channels=None,
			stride=1,
			dilation=1,
			padding="same",
			groups=1,
			bias=True,
			padding_mode="reflect",
			skip_transpose=False,
			):
			super().__init__()
			self.kernel_size = kernel_size
			self.stride = stride
			self.dilation = dilation
			self.padding = padding
			self.padding_mode = padding_mode
			self.unsqueeze = False
			self.skip_transpose = skip_transpose

			if input_shape is None and in_channels is None:
			raise ValueError("Must provide one of input_shape or in_channels")

			if in_channels is None:
			in_channels = self._check_input_shape(input_shape)

			self.conv = nn.Conv1d(
			in_channels,
			out_channels,
			self.kernel_size,
			stride=self.stride,
			dilation=self.dilation,
			padding=0,
			groups=groups,
			bias=bias,
			)

			def forward(self, x):
			if not self.skip_transpose:
			x = x.transpose(1, -1)

			if self.unsqueeze:
			x = x.unsqueeze(1)

			if self.padding == "same":
			x = self._manage_padding(
			x, self.kernel_size, self.dilation, self.stride
			)

			elif self.padding == "causal":
			num_pad = (self.kernel_size - 1) * self.dilation
			x = F.pad(x, (num_pad, 0))

			elif self.padding == "valid":
			pass

			else:
			raise ValueError(
			"Padding must be 'same', 'valid' or 'causal'. Got "
			+ self.padding
			)

			wx = self.conv(x)

			if self.unsqueeze:
			wx = wx.squeeze(1)

			if not self.skip_transpose:
			wx = wx.transpose(1, -1)

			return wx

			def _manage_padding(
			self, x, kernel_size: int, dilation: int, stride: int,
			):
			# Detecting input shape
			L_in = x.shape[-1]

			# Time padding
			padding = get_padding_elem(L_in, stride, kernel_size, dilation)

			# Applying padding
			x = F.pad(x, padding, mode=self.padding_mode)

			return x

			def _check_input_shape(self, shape):
			"""Checks the input shape and returns the number of input channels.
			"""

			if len(shape) == 2:
			self.unsqueeze = True
			in_channels = 1
			elif self.skip_transpose:
			in_channels = shape[1]
			elif len(shape) == 3:
			in_channels = shape[2]
			else:
			raise ValueError(
			"conv1d expects 2d, 3d inputs. Got " + str(len(shape))
			)

			# Kernel size must be odd
			if self.kernel_size % 2 == 0:
			raise ValueError(
			"The field kernel size must be an odd number. Got %s."
			% (self.kernel_size)
			)
			return in_channels


			def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int):
			    """Return the ``[left, right]`` padding for a 1-D convolution.

			    Arguments
			    ---------
			    L_in : int
			        Length of the input along the padded axis.
			    stride : int
			        Convolution stride.
			    kernel_size : int
			        Convolution kernel size.
			    dilation : int
			        Convolution dilation.

			    Returns
			    -------
			    list of int
			        ``[kernel_size // 2] * 2`` when ``stride > 1``; otherwise half of
			        the length the un-padded convolution would lose, on each side.
			    """
			    if stride > 1:
			        return [kernel_size // 2, kernel_size // 2]

			    L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1
			    half = (L_in - L_out) // 2
			    return [half, half]


			# Skip transpose as much as possible for efficiency
			class Conv1d(_Conv1d):
			    """``_Conv1d`` with ``skip_transpose=True`` forced on."""

			    def __init__(self, *args, **kwargs):
			        super().__init__(*args, skip_transpose=True, **kwargs)


			class BatchNorm1d(_BatchNorm1d):
			    """``_BatchNorm1d`` with ``skip_transpose=True`` forced on."""

			    def __init__(self, *args, **kwargs):
			        super().__init__(*args, skip_transpose=True, **kwargs)


			def length_to_mask(length, max_len=None, dtype=None, device=None):
			    """Build a ``(len(length), max_len)`` mask from a 1-D tensor of lengths.

			    Entry ``[i, j]`` is set where ``j < length[i]``.

			    Arguments
			    ---------
			    length : torch.Tensor
			        1-D tensor of lengths.
			    max_len : int, optional
			        Width of the mask; defaults to the largest value in ``length``.
			    dtype : torch.dtype, optional
			        dtype of the returned mask; defaults to ``length.dtype``.
			    device : torch.device, optional
			        Device of the returned mask; defaults to ``length.device``.
			    """
			    assert len(length.shape) == 1

			    if max_len is None:
			        max_len = length.max().long().item()
			    # using arange to generate mask
			    positions = torch.arange(
			        max_len, device=length.device, dtype=length.dtype
			    ).expand(len(length), max_len)
			    mask = positions < length.unsqueeze(1)

			    if dtype is None:
			        dtype = length.dtype

			    if device is None:
			        device = length.device

			    mask = torch.as_tensor(mask, dtype=dtype, device=device)
			    return mask


			class TDNNBlock(nn.Module):
			    """Convolution followed by an activation and batch normalization.

			    Arguments
			    ---------
			    in_channels : int
			        Number of input channels.
			    out_channels : int
			        Number of output channels.
			    kernel_size : int
			        Kernel size of the convolution.
			    dilation : int
			        Dilation of the convolution.
			    activation : torch class
			        A class for constructing the activation layer.
			    groups : int
			        Number of groups of the convolution.
			    """

			    def __init__(
			        self,
			        in_channels,
			        out_channels,
			        kernel_size,
			        dilation,
			        activation=nn.ReLU,
			        groups=1,
			    ):
			        super(TDNNBlock, self).__init__()
			        self.conv = Conv1d(
			            in_channels=in_channels,
			            out_channels=out_channels,
			            kernel_size=kernel_size,
			            dilation=dilation,
			            groups=groups,
			        )
			        self.activation = activation()
			        self.norm = BatchNorm1d(input_size=out_channels)

			    def forward(self, x):
			        """Apply conv -> activation -> norm, in that order."""
			        return self.norm(self.activation(self.conv(x)))


			class Res2NetBlock(torch.nn.Module):
			    """An implementation of Res2NetBlock w/ dilation.

			    Arguments
			    ---------
			    in_channels : int
			        The number of channels expected in the input.
			    out_channels : int
			        The number of output channels.
			    scale : int
			        The scale of the Res2Net block.
			    kernel_size: int
			        The kernel size of the Res2Net block.
			    dilation : int
			        The dilation of the Res2Net block.

			    Example
			    -------
			    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
			    >>> layer = Res2NetBlock(64, 64, scale=4, dilation=3)
			    >>> out_tensor = layer(inp_tensor).transpose(1, 2)
			    >>> out_tensor.shape
			    torch.Size([8, 120, 64])
			    """

			    def __init__(
			        self, in_channels, out_channels, scale=8, kernel_size=3, dilation=1
			    ):
			        super(Res2NetBlock, self).__init__()
			        assert in_channels % scale == 0
			        assert out_channels % scale == 0

			        in_channel = in_channels // scale
			        hidden_channel = out_channels // scale

			        # One TDNN block per channel group except the first, which is
			        # passed through unchanged in ``forward``.
			        self.blocks = nn.ModuleList(
			            [
			                TDNNBlock(
			                    in_channel,
			                    hidden_channel,
			                    kernel_size=kernel_size,
			                    dilation=dilation,
			                )
			                for _ in range(scale - 1)
			            ]
			        )
			        self.scale = scale

			    def forward(self, x):
			        """Split ``x`` into ``scale`` channel groups and process them in a chain."""
			        y = []
			        y_i = None
			        for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)):
			            if i == 0:
			                y_i = x_i
			            elif i == 1:
			                y_i = self.blocks[i - 1](x_i)
			            else:
			                # Later groups also receive the previous group's output.
			                y_i = self.blocks[i - 1](x_i + y_i)
			            y.append(y_i)
			        return torch.cat(y, dim=1)


			class SEBlock(nn.Module):
			    """An implementation of squeeze-and-excitation block.

			    Arguments
			    ---------
			    in_channels : int
			        The number of input channels.
			    se_channels : int
			        The number of output channels after squeeze.
			    out_channels : int
			        The number of output channels.

			    Example
			    -------
			    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
			    >>> se_layer = SEBlock(64, 16, 64)
			    >>> lengths = torch.rand((8,))
			    >>> out_tensor = se_layer(inp_tensor, lengths).transpose(1, 2)
			    >>> out_tensor.shape
			    torch.Size([8, 120, 64])
			    """

			    def __init__(self, in_channels, se_channels, out_channels):
			        super(SEBlock, self).__init__()

			        self.conv1 = Conv1d(
			            in_channels=in_channels, out_channels=se_channels, kernel_size=1
			        )
			        self.relu = torch.nn.ReLU(inplace=True)
			        self.conv2 = Conv1d(
			            in_channels=se_channels, out_channels=out_channels, kernel_size=1
			        )
			        self.sigmoid = torch.nn.Sigmoid()

			    def forward(self, x, lengths=None):
			        """Scale ``x`` channel-wise by gates computed from its time average.

			        When ``lengths`` is given it is multiplied by the time dimension to
			        build a mask, so only the masked-in frames enter the average.
			        """
			        L = x.shape[-1]
			        if lengths is not None:
			            mask = length_to_mask(lengths * L, max_len=L, device=x.device)
			            mask = mask.unsqueeze(1)
			            total = mask.sum(dim=2, keepdim=True)
			            s = (x * mask).sum(dim=2, keepdim=True) / total
			        else:
			            s = x.mean(dim=2, keepdim=True)

			        s = self.relu(self.conv1(s))
			        s = self.sigmoid(self.conv2(s))

			        return s * x


			class AttentiveStatisticsPooling(nn.Module):
			    """This class implements an attentive statistic pooling layer for each channel.
			    It returns the concatenated mean and std of the input tensor.

			    Arguments
			    ---------
			    channels: int
			        The number of input channels.
			    attention_channels: int
			        The number of attention channels.
			    global_context: bool
			        When True, the attention input is extended with the masked mean and
			        std of the whole input, repeated along time.

			    Example
			    -------
			    >>> inp_tensor = torch.rand([8, 120, 64]).transpose(1, 2)
			    >>> asp_layer = AttentiveStatisticsPooling(64)
			    >>> lengths = torch.rand((8,))
			    >>> out_tensor = asp_layer(inp_tensor, lengths).transpose(1, 2)
			    >>> out_tensor.shape
			    torch.Size([8, 1, 128])
			    """

			    def __init__(self, channels, attention_channels=128, global_context=True):
			        super().__init__()

			        self.eps = 1e-12
			        self.global_context = global_context
			        # With global context the attention input is [x, mean, std].
			        tdnn_in = channels * 3 if global_context else channels
			        self.tdnn = TDNNBlock(tdnn_in, attention_channels, 1, 1)
			        self.tanh = nn.Tanh()
			        self.conv = Conv1d(
			            in_channels=attention_channels, out_channels=channels, kernel_size=1
			        )

			    def forward(self, x, lengths=None):
			        """Calculates mean and std for a batch (input tensor).

			        Arguments
			        ---------
			        x : torch.Tensor
			            Tensor of shape [N, C, L].
			        lengths : torch.Tensor, optional
			            Tensor of shape [N]; it is multiplied by ``L`` to build the
			            mask. Defaults to all ones (no masking).

			        Returns
			        -------
			        torch.Tensor
			            Attention-weighted mean and std concatenated on the channel
			            axis, with a trailing singleton axis: [N, 2 * C, 1].
			        """
			        L = x.shape[-1]

			        def _compute_statistics(x, m, dim=2, eps=self.eps):
			            mean = (m * x).sum(dim)
			            std = torch.sqrt(
			                (m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)
			            )
			            return mean, std

			        if lengths is None:
			            lengths = torch.ones(x.shape[0], device=x.device)

			        # Make binary mask of shape [N, 1, L]
			        mask = length_to_mask(lengths * L, max_len=L, device=x.device)
			        mask = mask.unsqueeze(1)

			        # Expand the temporal context of the pooling layer by allowing the
			        # self-attention to look at global properties of the utterance.
			        if self.global_context:
			            # torch.std is unstable for backward computation
			            # https://github.com/pytorch/pytorch/issues/4320
			            total = mask.sum(dim=2, keepdim=True).float()
			            mean, std = _compute_statistics(x, mask / total)
			            mean = mean.unsqueeze(2).repeat(1, 1, L)
			            std = std.unsqueeze(2).repeat(1, 1, L)
			            attn = torch.cat([x, mean, std], dim=1)
			        else:
			            attn = x

			        # Apply layers
			        attn = self.conv(self.tanh(self.tdnn(attn)))

			        # Filter out zero-paddings
			        attn = attn.masked_fill(mask == 0, float("-inf"))

			        attn = F.softmax(attn, dim=2)
			        mean, std = _compute_statistics(x, attn)
			        # Append mean and std of the batch
			        pooled_stats = torch.cat((mean, std), dim=1)
			        pooled_stats = pooled_stats.unsqueeze(2)

			        return pooled_stats


			class SERes2NetBlock(nn.Module):
			    """An implementation of building block in ECAPA-TDNN, i.e.,
			    TDNN-Res2Net-TDNN-SEBlock.

			    Arguments
			    ----------
			    in_channels: int
			        The number of input channels.
			    out_channels: int
			        The number of output channels.
			    res2net_scale: int
			        The scale of the Res2Net block.
			    se_channels: int
			        The number of channels after squeeze in the SE block.
			    kernel_size: int
			        The kernel size of the Res2Net block (both TDNN blocks use 1).
			    dilation: int
			        The dilation of the Res2Net block.
			    activation : torch class
			        A class for constructing the activation layers.
			    groups: int
			        Number of blocked connections from input channels to output channels.

			    Example
			    -------
			    >>> x = torch.rand(8, 120, 64).transpose(1, 2)
			    >>> conv = SERes2NetBlock(64, 64, res2net_scale=4)
			    >>> out = conv(x).transpose(1, 2)
			    >>> out.shape
			    torch.Size([8, 120, 64])
			    """

			    def __init__(
			        self,
			        in_channels,
			        out_channels,
			        res2net_scale=8,
			        se_channels=128,
			        kernel_size=1,
			        dilation=1,
			        activation=torch.nn.ReLU,
			        groups=1,
			    ):
			        super().__init__()
			        self.out_channels = out_channels
			        self.tdnn1 = TDNNBlock(
			            in_channels,
			            out_channels,
			            kernel_size=1,
			            dilation=1,
			            activation=activation,
			            groups=groups,
			        )
			        self.res2net_block = Res2NetBlock(
			            out_channels, out_channels, res2net_scale, kernel_size, dilation
			        )
			        self.tdnn2 = TDNNBlock(
			            out_channels,
			            out_channels,
			            kernel_size=1,
			            dilation=1,
			            activation=activation,
			            groups=groups,
			        )
			        self.se_block = SEBlock(out_channels, se_channels, out_channels)

			        # A 1x1 conv projects the residual only when channel counts differ.
			        self.shortcut = None
			        if in_channels != out_channels:
			            self.shortcut = Conv1d(
			                in_channels=in_channels,
			                out_channels=out_channels,
			                kernel_size=1,
			            )

			    def forward(self, x, lengths=None):
			        """Run TDNN -> Res2Net -> TDNN -> SE and add the residual branch."""
			        residual = x
			        # Explicit None check: do not rely on module truthiness.
			        if self.shortcut is not None:
			            residual = self.shortcut(x)

			        x = self.tdnn1(x)
			        x = self.res2net_block(x)
			        x = self.tdnn2(x)
			        x = self.se_block(x, lengths)

			        return x + residual


			class ECAPA_TDNN(torch.nn.Module):
			    """An implementation of the speaker embedding model in a paper.
			    "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in
			    TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143).

			    Arguments
			    ---------
			    input_size : int
			        Number of input feature channels.
			    device : str
			        Device used, e.g., "cpu" or "cuda" (not used by the constructor).
			    lin_neurons : int
			        Number of neurons in linear layers.
			    activation : torch class
			        A class for constructing the activation layers.
			    channels : list of ints
			        Output channels for TDNN/SERes2Net layer.
			    kernel_sizes : list of ints
			        List of kernel sizes for each layer.
			    dilations : list of ints
			        List of dilations for kernels in each layer.
			    attention_channels : int
			        Number of attention channels of the pooling layer.
			    res2net_scale : int
			        Scale of the Res2Net blocks.
			    se_channels : int
			        Number of squeeze channels of the SE blocks.
			    global_context : bool
			        Forwarded to the attentive statistics pooling layer.
			    groups : list of ints
			        List of groups for kernels in each layer.
			    window_size : int or None
			        Pooling window length in frames; ``None`` pools the whole input.
			    window_shift : int
			        Hop between consecutive pooling windows, in frames.

			    Example
			    -------
			    >>> input_feats = torch.rand([5, 120, 80])
			    >>> compute_embedding = ECAPA_TDNN(80, lin_neurons=192)
			    >>> outputs = compute_embedding(input_feats)
			    >>> outputs.shape
			    torch.Size([5, 120, 192])
			    """

			    def __init__(
			        self,
			        input_size,
			        device="cpu",
			        lin_neurons=192,
			        activation=torch.nn.ReLU,
			        channels=[512, 512, 512, 512, 1536],
			        kernel_sizes=[5, 3, 3, 3, 1],
			        dilations=[1, 2, 3, 4, 1],
			        attention_channels=128,
			        res2net_scale=8,
			        se_channels=128,
			        global_context=True,
			        groups=[1, 1, 1, 1, 1],
			        window_size=20,
			        window_shift=1,
			    ):

			        super().__init__()
			        assert len(channels) == len(kernel_sizes)
			        assert len(channels) == len(dilations)
			        self.channels = channels
			        self.blocks = nn.ModuleList()
			        self.window_size = window_size
			        self.window_shift = window_shift

			        # The initial TDNN layer
			        self.blocks.append(
			            TDNNBlock(
			                input_size,
			                channels[0],
			                kernel_sizes[0],
			                dilations[0],
			                activation,
			                groups[0],
			            )
			        )

			        # SE-Res2Net layers
			        for i in range(1, len(channels) - 1):
			            self.blocks.append(
			                SERes2NetBlock(
			                    channels[i - 1],
			                    channels[i],
			                    res2net_scale=res2net_scale,
			                    se_channels=se_channels,
			                    kernel_size=kernel_sizes[i],
			                    dilation=dilations[i],
			                    activation=activation,
			                    groups=groups[i],
			                )
			            )

			        # Multi-layer feature aggregation
			        self.mfa = TDNNBlock(
			            channels[-1],
			            channels[-1],
			            kernel_sizes[-1],
			            dilations[-1],
			            activation,
			            groups=groups[-1],
			        )

			        # Attentive Statistical Pooling
			        self.asp = AttentiveStatisticsPooling(
			            channels[-1],
			            attention_channels=attention_channels,
			            global_context=global_context,
			        )
			        self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2)

			        # Final linear transformation
			        self.fc = Conv1d(
			            in_channels=channels[-1] * 2,
			            out_channels=lin_neurons,
			            kernel_size=1,
			        )

			    def windowed_pooling(self, x, lengths=None):
			        """Apply pooling, batch norm and the final projection per window.

			        ``x`` has shape (batch, channel, time). Its time axis is
			        reflect-padded by ``window_size // 2`` on each side and one window
			        of ``window_size`` frames is taken every ``window_shift`` frames.
			        The per-window results are concatenated along dim 2.
			        """
			        num_frames = x.shape[2]
			        num_chunk = int(math.ceil(num_frames / self.window_shift))
			        pad = self.window_size // 2
			        # Reflect-pad the time axis only.
			        padded = F.pad(x, (pad, pad), mode="reflect")
			        stat_list = []

			        for i in range(num_chunk):
			            st = i * self.window_shift
			            ed = st + self.window_size
			            # NOTE(review): the pooling layer multiplies ``lengths`` by
			            # the window length (relative-length convention) while this
			            # expression yields a frame count -- confirm with callers.
			            win_lengths = (
			                torch.clamp(lengths - i, 0, self.window_size)
			                if lengths is not None
			                else None
			            )
			            # Always slice the padded input: reusing ``x`` for the pooled
			            # result would make later windows read the previous output.
			            stat = self.asp(padded[:, :, st:ed], lengths=win_lengths)
			            stat = self.asp_bn(stat)
			            stat = self.fc(stat)
			            stat_list.append(stat)

			        return torch.cat(stat_list, dim=2)

			    def forward(self, x, lengths=None):
			        """Returns the embedding vector(s).

			        Arguments
			        ---------
			        x : torch.Tensor
			            Tensor of shape (batch, time, channel).
			        lengths: torch.Tensor
			            Tensor of shape (batch, )

			        Returns
			        -------
			        torch.Tensor
			            Shape (batch, channel) when ``window_size`` is None; otherwise
			            the windowed pooling output as (batch, time, channel).
			        """
			        import inspect  # local import: only used for the layer dispatch

			        # Minimize transpose for efficiency
			        x = x.transpose(1, 2)

			        xl = []
			        for layer in self.blocks:
			            # Dispatch on the signature rather than catching TypeError,
			            # which would also hide genuine errors raised inside a layer.
			            params = inspect.signature(layer.forward).parameters
			            takes_lengths = "lengths" in params or any(
			                p.kind is inspect.Parameter.VAR_KEYWORD
			                for p in params.values()
			            )
			            x = layer(x, lengths=lengths) if takes_lengths else layer(x)
			            xl.append(x)

			        # Multi-layer feature aggregation (first block's output left out)
			        x = torch.cat(xl[1:], dim=1)
			        x = self.mfa(x)

			        if self.window_size is None:
			            # Attentive Statistical Pooling
			            x = self.asp(x, lengths=lengths)
			            x = self.asp_bn(x)
			            # Final linear transformation
			            x = self.fc(x)
			            x = x.squeeze(2)  # -> B, C
			        else:
			            x = self.windowed_pooling(x, lengths)
			            x = x.transpose(1, 2)  # -> B, T, C
			        return x

			@@ -368,7 +368,7 @@
			cls, train: bool = True, inference: bool = False
			) -> Tuple[str, ...]:
			if not inference:
			retval = ("speech", "profile", "label")
			retval = ("speech", "profile", "binary_labels")
			else:
			# Recognition mode
			retval = ("speech", "profile")