python/FunASR-XL.git

			@@ -11,6 +11,7 @@
			import torch
			from torch import nn


			class MultiHeadedAttentionReturnWeight(nn.Module):
			"""Multi-Head Attention layer.

			@@ -77,19 +78,19 @@
			mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
			min_value = torch.finfo(scores.dtype).min
			scores = scores.masked_fill(mask, min_value)
			self.attn = torch.softmax(scores, dim=-1).masked_fill(
			attn = torch.softmax(scores, dim=-1).masked_fill(
			mask, 0.0
			) # (batch, head, time1, time2)
			else:
			self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
			attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)

			p_attn = self.dropout(self.attn)
			p_attn = self.dropout(attn)
			x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
			x = (
			x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
			) # (batch, time1, d_model)

			return self.linear_out(x), self.attn # (batch, time1, d_model)
			return self.linear_out(x), attn # (batch, time1, d_model)

			def forward(self, query, key, value, mask):
			"""Compute scaled dot product attention.
			@@ -108,5 +109,3 @@
			q, k, v = self.forward_qkv(query, key, value)
			scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
			return self.forward_attention(v, scores, mask)