| | |
| | | |
| | | class AudioEncoder(ModalitySpecificEncoder): |
| | | |
| | | |
| | | def __init__( |
| | | self, |
| | | modality_cfg, |
| | |
| | | ) |
| | | |
| | | decoder = ( |
| | | Decoder1d(modality_cfg.decoder, embed_dim) |
| | | if modality_cfg.decoder is not None |
| | | else None |
| | | Decoder1d(modality_cfg.decoder, embed_dim) if modality_cfg.decoder is not None else None |
| | | ) |
| | | |
| | | alibi_bias_fn = partial(get_alibi_bias, alibi_biases=alibi_biases) |
| | |
| | | output_lengths - 1, |
| | | ) |
| | | ] = 1 |
| | | padding_mask = ( |
| | | 1 - padding_mask.flip([-1]).cumsum(-1).flip([-1]) |
| | | ).bool() |
| | | padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool() |
| | | else: |
| | | padding_mask = torch.zeros( |
| | | x.shape[:2], dtype=torch.bool, device=x.device |
| | | ) |
| | | padding_mask = torch.zeros(x.shape[:2], dtype=torch.bool, device=x.device) |
| | | |
| | | return padding_mask |
| | | |