            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            min_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask, min_value)
            # Re-zero masked positions after the softmax: rows that are fully
            # masked would otherwise receive uniform (meaningless) weights.
            attn = torch.softmax(scores, dim=-1).masked_fill(
                mask, 0.0
            )  # (batch, head, time1, time2)
        else:
            attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

        p_attn = self.dropout(attn)
        x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
        x = (
            x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
        )  # (batch, time1, d_model)

        return self.linear_out(x), attn  # (batch, time1, d_model)
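    # A minimal, hypothetical usage sketch. The constructor signature
    # `MultiHeadedAttention(n_head, d_model, dropout_rate)` and the boolean
    # mask layout are assumptions, not confirmed by this snippet; positions
    # where the mask equals 0 are excluded from attention (see `.eq(0)` above).
    #
    #   mha = MultiHeadedAttention(4, 256, 0.1)
    #   q = k = v = torch.randn(8, 50, 256)            # (batch, time, d_model)
    #   mask = torch.ones(8, 1, 50, dtype=torch.bool)  # (batch, 1, time2); 0 = masked
    #   out, attn = mha(q, k, v, mask)                 # out: (batch, time1, d_model)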
    def forward(self, query, key, value, mask):
        """Compute scaled dot product attention.