| | |
| | | x = self.dropout(x) |
| | | |
| | | for i, blk in enumerate(self.blocks): |
| | | if ( |
| | | not self.training |
| | | or self.layerdrop == 0 |
| | | or (np.random.random() > self.layerdrop) |
| | | ): |
| | | if not self.training or self.layerdrop == 0 or (np.random.random() > self.layerdrop): |
| | | ab = alibi_bias |
| | | if ab is not None and alibi_scale is not None: |
| | | scale = ( |
| | | alibi_scale[i] |
| | | if alibi_scale.size(0) > 1 |
| | | else alibi_scale.squeeze(0) |
| | | ) |
| | | scale = alibi_scale[i] if alibi_scale.size(0) > 1 else alibi_scale.squeeze(0) |
| | | ab = ab * scale.type_as(ab) |
| | | x, _ = blk(x, padding_mask, ab) |
| | | |
| | |
| | | super().__init__() |
| | | self.num_heads = num_heads |
| | | head_dim = dim // num_heads |
| | | self.scale = qk_scale or head_dim ** -0.5 |
| | | self.scale = qk_scale or head_dim**-0.5 |
| | | |
| | | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) |
| | | self.attn_drop = nn.Dropout(attn_drop) |