| | |
| | | from funasr.models.transformer.utils.nets_utils import make_pad_mask |
| | | import funasr.models.lora.layers as lora |
| | | |
| | | |
| | | class MultiHeadedAttention(nn.Module): |
| | | """Multi-Head Attention layer. |
| | | |
| | |
| | | n_batch = value.size(0) |
| | | if mask is not None: |
| | | mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2) |
| | | min_value = float( |
| | | numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min |
| | | ) |
| | | min_value = float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min) |
| | | scores = scores.masked_fill(mask, min_value) |
| | | self.attn = torch.softmax(scores, dim=-1).masked_fill( |
| | | mask, 0.0 |
| | |
| | | matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) |
| | | matrix_bd = self.rel_shift(matrix_bd) |
| | | |
| | | scores = (matrix_ac + matrix_bd) / math.sqrt( |
| | | self.d_k |
| | | ) # (batch, head, time1, time2) |
| | | scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) # (batch, head, time1, time2) |
| | | |
| | | return self.forward_attention(v, scores, mask) |
| | | |
| | |
| | | matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) |
| | | matrix_bd = self.rel_shift(matrix_bd) |
| | | |
| | | scores = (matrix_ac + matrix_bd) / math.sqrt( |
| | | self.d_k |
| | | ) # (batch, head, time1, time2) |
| | | scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) # (batch, head, time1, time2) |
| | | |
| | | return self.forward_attention(v, scores, mask) |
| | | |
| | |
| | | matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) |
| | | matrix_bd = self.rel_shift(matrix_bd) |
| | | |
| | | scores = (matrix_ac + matrix_bd) / math.sqrt( |
| | | self.d_k |
| | | ) # (batch, head, time1, time2) |
| | | scores = (matrix_ac + matrix_bd) / math.sqrt(self.d_k) # (batch, head, time1, time2) |
| | | |
| | | return self.forward_attention(v, scores, mask) |
| | | |
| | |
| | | """ |
| | | n_batch = query.size(0) |
| | | |
| | | q = ( |
| | | self.linear_q(query) |
| | | .view(n_batch, -1, self.num_heads, self.d_k) |
| | | .transpose(1, 2) |
| | | ) |
| | | k = ( |
| | | self.linear_k(key) |
| | | .view(n_batch, -1, self.num_heads, self.d_k) |
| | | .transpose(1, 2) |
| | | ) |
| | | v = ( |
| | | self.linear_v(value) |
| | | .view(n_batch, -1, self.num_heads, self.d_k) |
| | | .transpose(1, 2) |
| | | ) |
| | | q = self.linear_q(query).view(n_batch, -1, self.num_heads, self.d_k).transpose(1, 2) |
| | | k = self.linear_k(key).view(n_batch, -1, self.num_heads, self.d_k).transpose(1, 2) |
| | | v = self.linear_v(value).view(n_batch, -1, self.num_heads, self.d_k).transpose(1, 2) |
| | | |
| | | return q, k, v |
| | | |
| | |
| | | attn_output = torch.matmul(attn_output, value) |
| | | |
| | | attn_output = self.linear_out( |
| | | attn_output.transpose(1, 2) |
| | | .contiguous() |
| | | .view(batch_size, -1, self.num_heads * self.d_k) |
| | | attn_output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k) |
| | | ) |
| | | |
| | | return attn_output |
| | |
| | | q, k, v = self.forward_qkv(query, key, value) |
| | | scores = self.compute_att_score(q, k, pos_enc, left_context=left_context) |
| | | return self.forward_attention(v, scores, mask, chunk_mask=chunk_mask) |
| | | |