eigen_transformer.py
from __future__ import annotations

import torch
import torch.nn as nn

from eigen_attention import EigenAttention


class EigenTransformerBlock(nn.Module):
    """
    A minimal transformer block that uses EigenAttention instead of
    dot-product attention, followed by a standard MLP.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        dropout: float = 0.0,
        attn_kwargs: dict | None = None,
    ) -> None:
        """
        Args:
            dim: Model dimension.
            num_heads: Number of attention heads.
            mlp_ratio: Expansion factor for MLP hidden size.
            dropout: Dropout rate for residual paths and MLP.
            attn_kwargs: Extra keyword arguments for EigenAttention
                (e.g., {"causal": True}).
        """
        super().__init__()
        if attn_kwargs is None:
            attn_kwargs = {}

        self.dim = dim
        self.num_heads = num_heads

        self.norm1 = nn.LayerNorm(dim)
        self.attn = EigenAttention(dim=dim, num_heads=num_heads, **attn_kwargs)
        self.dropout1 = nn.Dropout(dropout)

        self.norm2 = nn.LayerNorm(dim)
        hidden_dim = int(dim * mlp_ratio)
        self.mlp = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, dim),
        )
        self.dropout2 = nn.Dropout(dropout)

    def forward(
        self,
        x: torch.Tensor,
        attn_mask: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Args:
            x: (B, L, D) input sequence.
            attn_mask: Optional additive mask compatible with EigenAttention.

        Returns:
            x: (B, L, D) output.
        """
        if x.ndim != 3:
            raise ValueError(f"EigenTransformerBlock expects (B, L, D), got {x.shape}")

        # Attention + residual
        h = self.norm1(x)
        attn_out, _ = self.attn(h, attn_mask=attn_mask)
        x = x + self.dropout1(attn_out)

        # MLP + residual
        h = self.norm2(x)
        x = x + self.dropout2(self.mlp(h))
        return x
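
A minimal usage sketch, not part of the module itself: it only exercises the constructor and forward signature defined above, and it assumes EigenAttention is importable and accepts the keyword arguments shown (the "causal" kwarg is the example given in the docstring, not a confirmed option).

if __name__ == "__main__":
    # Smoke test: push a random batch through one block and check shapes.
    torch.manual_seed(0)

    block = EigenTransformerBlock(
        dim=64,
        num_heads=4,
        mlp_ratio=4.0,
        dropout=0.1,
        attn_kwargs={"causal": True},  # example kwarg from the docstring; assumed supported
    )

    x = torch.randn(2, 16, 64)  # (B, L, D)
    out = block(x)

    assert out.shape == x.shape  # the block is shape-preserving
    print(out.shape)  # torch.Size([2, 16, 64])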