import torch
import torch.nn as nn
import torch.nn.functional as F


class TransformerBlock(nn.Module):
    """Pre-norm transformer encoder block: self-attention followed by a feed-forward sublayer."""

    def __init__(self, hidden_dim, num_heads, ffn_dim, dropout):
        super().__init__()
        self.attn_norm = nn.LayerNorm(hidden_dim)
        self.ffn_norm = nn.LayerNorm(hidden_dim)
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, ffn_dim),
            nn.GELU(),
            nn.Linear(ffn_dim, hidden_dim),
            nn.Dropout(dropout)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask):
        batch_size, seq_len, _ = x.size()
        # Pre-norm: normalize before attention; no transpose needed since batch_first=True
        x_norm = self.attn_norm(x)
        # attention_mask uses 1 for real tokens and 0 for padding;
        # key_padding_mask expects True at positions that should be ignored.
        attn_mask = (1 - attention_mask).bool()  # [batch_size, seq_len]
        assert attn_mask.shape == (batch_size, seq_len), \
            f"Expected {batch_size=} and {seq_len=}, got {attn_mask.shape}"
        # Self-attention over the normalized input
        attn_out, _ = self.attn(
            x_norm, x_norm, x_norm,
            key_padding_mask=attn_mask
        )
        # Residual connections around the attention and feed-forward sublayers
        x = x + self.dropout(attn_out)
        x_norm = self.ffn_norm(x)
        x = x + self.dropout(self.ffn(x_norm))
        return x


class RobertaForSentimentClassification(nn.Module):
    def __init__(self, vocab_size, max_len=128, num_classes=5):
        super().__init__()
        self.hidden_size = 512
        self.max_len = max_len
        self.num_heads = 8
        self.ffn_dim = 2048
        self.num_layers = 6
        self.dropout_rate = 0.1
        # Embeddings
        self.token_emb = nn.Embedding(vocab_size, self.hidden_size)
        self.position_emb = nn.Embedding(max_len, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_rate)
        # Transformer blocks
        self.layers = nn.ModuleList([
            TransformerBlock(self.hidden_size, self.num_heads, self.ffn_dim, self.dropout_rate)
            for _ in range(self.num_layers)
        ])
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.GELU(),
            nn.Dropout(self.dropout_rate),
            nn.Linear(self.hidden_size, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        batch_size, seq_len = input_ids.size()
        # Embeddings
        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.token_emb(input_ids) + self.position_emb(positions)
        x = self.dropout(x)
        # Transformer blocks
        for layer in self.layers:
            x = layer(x, attention_mask)
        # Use <s> token (first position) for classification
        cls_token = x[:, 0]  # shape: (batch_size, hidden_size)
        logits = self.classifier(cls_token)
        return logits
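

# Minimal usage sketch: shows the expected input format (token ids plus a 1/0
# attention mask) and the output shape. The vocab size, batch size, and sequence
# length below are placeholder values chosen only for illustration.
if __name__ == "__main__":
    model = RobertaForSentimentClassification(vocab_size=30522, max_len=128, num_classes=5)
    input_ids = torch.randint(0, 30522, (2, 16))           # (batch_size, seq_len)
    attention_mask = torch.ones(2, 16, dtype=torch.long)   # 1 = real token, 0 = padding
    attention_mask[:, 12:] = 0                             # treat the last positions as padding
    logits = model(input_ids, attention_mask)
    print(logits.shape)  # expected: torch.Size([2, 5])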