dilip025 committed
Commit 6a9cab5 · verified · 1 Parent(s): 9cc1754

Upload modeling_roberta_sentiment.py

Files changed (1)
modeling_roberta_sentiment.py +86 -0
modeling_roberta_sentiment.py ADDED
@@ -0,0 +1,86 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class TransformerBlock(nn.Module):
    def __init__(self, hidden_dim, num_heads, ffn_dim, dropout):
        super().__init__()
        self.attn_norm = nn.LayerNorm(hidden_dim)
        self.ffn_norm = nn.LayerNorm(hidden_dim)
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, ffn_dim),
            nn.GELU(),
            nn.Linear(ffn_dim, hidden_dim),
            nn.Dropout(dropout)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask):
        batch_size, seq_len, _ = x.size()

        # Pre-norm attention; no transpose needed since batch_first=True
        x_norm = self.attn_norm(x)
        # key_padding_mask expects True at padded positions, so invert the 0/1 attention_mask
        attn_mask = (1 - attention_mask).bool()  # [batch_size, seq_len]

        assert attn_mask.shape == (batch_size, seq_len), \
            f"Expected {batch_size=} and {seq_len=}, got {attn_mask.shape}"

        # Run self-attention (no transpose)
        attn_out, _ = self.attn(
            x_norm, x_norm, x_norm,
            key_padding_mask=attn_mask
        )

        # Residual + pre-norm feed-forward
        x = x + self.dropout(attn_out)
        x_norm = self.ffn_norm(x)
        x = x + self.dropout(self.ffn(x_norm))

        return x


class RobertaForSentimentClassification(nn.Module):
    def __init__(self, vocab_size, max_len=128, num_classes=5):
        super().__init__()
        self.hidden_size = 512
        self.max_len = max_len
        self.num_heads = 8
        self.ffn_dim = 2048
        self.num_layers = 6
        self.dropout_rate = 0.1

        # Embeddings (token + learned positional)
        self.token_emb = nn.Embedding(vocab_size, self.hidden_size)
        self.position_emb = nn.Embedding(max_len, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_rate)

        # Transformer blocks
        self.layers = nn.ModuleList([
            TransformerBlock(self.hidden_size, self.num_heads, self.ffn_dim, self.dropout_rate)
            for _ in range(self.num_layers)
        ])

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.GELU(),
            nn.Dropout(self.dropout_rate),
            nn.Linear(self.hidden_size, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        batch_size, seq_len = input_ids.size()

        # Embeddings
        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.token_emb(input_ids) + self.position_emb(positions)
        x = self.dropout(x)

        # Transformer blocks
        for layer in self.layers:
            x = layer(x, attention_mask)

        # Use <s> token (first position) for classification
        cls_token = x[:, 0]  # shape: (batch_size, hidden_size)
        logits = self.classifier(cls_token)
        return logits
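Quick sanity check (not part of the uploaded file): a minimal sketch of how this module could be exercised with dummy inputs. The vocabulary size, batch size, and padding pattern below are illustrative assumptions; in practice `input_ids` and `attention_mask` would come from whatever tokenizer the checkpoint was trained with.

    import torch
    from modeling_roberta_sentiment import RobertaForSentimentClassification

    # Illustrative sizes only; the real vocab size depends on the training tokenizer.
    vocab_size, batch_size, seq_len = 30000, 4, 128

    model = RobertaForSentimentClassification(vocab_size=vocab_size, max_len=128, num_classes=5)
    model.eval()

    # Dummy batch: random token ids, with the last 20 positions of each row treated as padding.
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))
    attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
    attention_mask[:, -20:] = 0

    with torch.no_grad():
        logits = model(input_ids, attention_mask)

    print(logits.shape)  # torch.Size([4, 5]) — one score per sentiment class
    probs = torch.softmax(logits, dim=-1)

For training, the returned logits could be passed to `torch.nn.functional.cross_entropy` against integer class labels in [0, num_classes).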