Upload modeling_roberta_sentiment.py
modeling_roberta_sentiment.py
ADDED
@@ -0,0 +1,86 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerBlock(nn.Module):
    def __init__(self, hidden_dim, num_heads, ffn_dim, dropout):
        super().__init__()
        self.attn_norm = nn.LayerNorm(hidden_dim)
        self.ffn_norm = nn.LayerNorm(hidden_dim)
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, ffn_dim),
            nn.GELU(),
            nn.Linear(ffn_dim, hidden_dim),
            nn.Dropout(dropout)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask):
        batch_size, seq_len, _ = x.size()

        # No transpose needed since batch_first=True
        x_norm = self.attn_norm(x)
        attn_mask = (1 - attention_mask).bool()  # [batch_size, seq_len]

        assert attn_mask.shape == (batch_size, seq_len), \
            f"Expected {batch_size=} and {seq_len=}, got {attn_mask.shape}"

        # Run self-attention (no transpose)
        attn_out, _ = self.attn(
            x_norm, x_norm, x_norm,
            key_padding_mask=attn_mask
        )

        # Residual + FF
        x = x + self.dropout(attn_out)
        x_norm = self.ffn_norm(x)
        x = x + self.dropout(self.ffn(x_norm))

        return x

class RobertaForSentimentClassification(nn.Module):
    def __init__(self, vocab_size, max_len=128, num_classes=5):
        super().__init__()
        self.hidden_size = 512
        self.max_len = max_len
        self.num_heads = 8
        self.ffn_dim = 2048
        self.num_layers = 6
        self.dropout_rate = 0.1

        # Embeddings
        self.token_emb = nn.Embedding(vocab_size, self.hidden_size)
        self.position_emb = nn.Embedding(max_len, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_rate)

        # Transformer blocks
        self.layers = nn.ModuleList([
            TransformerBlock(self.hidden_size, self.num_heads, self.ffn_dim, self.dropout_rate)
            for _ in range(self.num_layers)
        ])

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.GELU(),
            nn.Dropout(self.dropout_rate),
            nn.Linear(self.hidden_size, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        batch_size, seq_len = input_ids.size()

        # Embeddings
        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.token_emb(input_ids) + self.position_emb(positions)
        x = self.dropout(x)

        # Transformer blocks
        for layer in self.layers:
            x = layer(x, attention_mask)

        # Use <s> token (first position) for classification
        cls_token = x[:, 0]  # shape: (batch_size, hidden_size)
        logits = self.classifier(cls_token)
        return logits
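
Usage sketch (not part of the uploaded file): a minimal smoke test showing how the model could be instantiated and called, assuming this file is importable as modeling_roberta_sentiment. The vocabulary size, batch shape, and padding layout below are illustrative placeholders, not values taken from this repository.

import torch

from modeling_roberta_sentiment import RobertaForSentimentClassification

# Hypothetical shapes for a quick smoke test; adjust to your tokenizer/config.
vocab_size, batch_size, seq_len = 50265, 4, 32

model = RobertaForSentimentClassification(vocab_size=vocab_size, max_len=128, num_classes=5)
model.eval()

input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))
# Mask convention expected by forward(): 1 = real token, 0 = padding.
# Here the last 8 positions of every row are treated as padding.
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
attention_mask[:, -8:] = 0

with torch.no_grad():
    logits = model(input_ids, attention_mask)

print(logits.shape)  # torch.Size([4, 5]) — one score per sentiment class

Note that the mask is inverted inside TransformerBlock ((1 - attention_mask).bool()), so padded positions become True in the key_padding_mask that nn.MultiheadAttention expects.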