dilip025 committed
Commit ebad92f · verified · 1 Parent(s): 6a9cab5

Update modeling_roberta_sentiment.py

Files changed (1)
  1. modeling_roberta_sentiment.py +86 -86
modeling_roberta_sentiment.py CHANGED
@@ -1,86 +1,86 @@
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
-
- class TransformerBlock(nn.Module):
-     def __init__(self, hidden_dim, num_heads, ffn_dim, dropout):
-         super().__init__()
-         self.attn_norm = nn.LayerNorm(hidden_dim)
-         self.ffn_norm = nn.LayerNorm(hidden_dim)
-         self.attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
-         self.ffn = nn.Sequential(
-             nn.Linear(hidden_dim, ffn_dim),
-             nn.GELU(),
-             nn.Linear(ffn_dim, hidden_dim),
-             nn.Dropout(dropout)
-         )
-         self.dropout = nn.Dropout(dropout)
-
-     def forward(self, x, attention_mask):
-         batch_size, seq_len, _ = x.size()
-
-         # No transpose needed since batch_first=True
-         x_norm = self.attn_norm(x)
-         attn_mask = (1 - attention_mask).bool()  # [batch_size, seq_len]
-
-         assert attn_mask.shape == (batch_size, seq_len), \
-             f"Expected {batch_size=} and {seq_len=}, got {attn_mask.shape}"
-
-         # Run self-attention (no transpose)
-         attn_out, _ = self.attn(
-             x_norm, x_norm, x_norm,
-             key_padding_mask=attn_mask
-         )
-
-         # Residual + FF
-         x = x + self.dropout(attn_out)
-         x_norm = self.ffn_norm(x)
-         x = x + self.dropout(self.ffn(x_norm))
-
-         return x
-
- class RobertaForSentimentClassification(nn.Module):
-     def __init__(self, vocab_size, max_len=128, num_classes=5):
-         super().__init__()
-         self.hidden_size = 512
-         self.max_len = max_len
-         self.num_heads = 8
-         self.ffn_dim = 2048
-         self.num_layers = 6
-         self.dropout_rate = 0.1
-
-         # Embeddings
-         self.token_emb = nn.Embedding(vocab_size, self.hidden_size)
-         self.position_emb = nn.Embedding(max_len, self.hidden_size)
-         self.dropout = nn.Dropout(self.dropout_rate)
-
-         # Transformer blocks
-         self.layers = nn.ModuleList([
-             TransformerBlock(self.hidden_size, self.num_heads, self.ffn_dim, self.dropout_rate)
-             for _ in range(self.num_layers)
-         ])
-
-         # Classification head
-         self.classifier = nn.Sequential(
-             nn.Linear(self.hidden_size, self.hidden_size),
-             nn.GELU(),
-             nn.Dropout(self.dropout_rate),
-             nn.Linear(self.hidden_size, num_classes)
-         )
-
-     def forward(self, input_ids, attention_mask):
-         batch_size, seq_len = input_ids.size()
-
-         # Embeddings
-         positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
-         x = self.token_emb(input_ids) + self.position_emb(positions)
-         x = self.dropout(x)
-
-         # Transformer blocks
-         for layer in self.layers:
-             x = layer(x, attention_mask)
-
-         # Use <s> token (first position) for classification
-         cls_token = x[:, 0]  # shape: (batch_size, hidden_size)
-         logits = self.classifier(cls_token)
-         return logits
 
+ import torch
+ import torch.nn as nn
+ from transformers import PreTrainedModel, PretrainedConfig
+
+ class RobertaSentimentConfig(PretrainedConfig):
+     model_type = "roberta-sentiment"
+
+     def __init__(self,
+                  vocab_size=30000,
+                  hidden_size=512,
+                  num_attention_heads=8,
+                  num_hidden_layers=6,
+                  intermediate_size=2048,
+                  max_position_embeddings=128,
+                  num_labels=5,
+                  hidden_dropout_prob=0.1,
+                  **kwargs):
+         super().__init__(**kwargs)
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.num_attention_heads = num_attention_heads
+         self.num_hidden_layers = num_hidden_layers
+         self.intermediate_size = intermediate_size
+         self.max_position_embeddings = max_position_embeddings
+         self.num_labels = num_labels
+         self.hidden_dropout_prob = hidden_dropout_prob
+
+ class TransformerBlock(nn.Module):
+     def __init__(self, hidden_dim, num_heads, ffn_dim, dropout):
+         super().__init__()
+         self.attn_norm = nn.LayerNorm(hidden_dim)
+         self.ffn_norm = nn.LayerNorm(hidden_dim)
+         self.attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
+         self.ffn = nn.Sequential(
+             nn.Linear(hidden_dim, ffn_dim),
+             nn.GELU(),
+             nn.Linear(ffn_dim, hidden_dim),
+             nn.Dropout(dropout)
+         )
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x, attention_mask):
+         batch_size, seq_len, _ = x.size()
+         x_norm = self.attn_norm(x)
+         attn_mask = (1 - attention_mask).bool()
+         attn_out, _ = self.attn(x_norm, x_norm, x_norm, key_padding_mask=attn_mask)
+         x = x + self.dropout(attn_out)
+         x_norm = self.ffn_norm(x)
+         x = x + self.dropout(self.ffn(x_norm))
+         return x
+
+ class RobertaForSentimentClassification(PreTrainedModel):
+     config_class = RobertaSentimentConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.position_emb = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+         self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+         self.layers = nn.ModuleList([
+             TransformerBlock(config.hidden_size, config.num_attention_heads,
+                              config.intermediate_size, config.hidden_dropout_prob)
+             for _ in range(config.num_hidden_layers)
+         ])
+
+         self.classifier = nn.Sequential(
+             nn.Linear(config.hidden_size, config.hidden_size),
+             nn.GELU(),
+             nn.Dropout(config.hidden_dropout_prob),
+             nn.Linear(config.hidden_size, config.num_labels)
+         )
+
+         self.init_weights()
+
+     def forward(self, input_ids, attention_mask):
+         batch_size, seq_len = input_ids.size()
+         positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
+         x = self.token_emb(input_ids) + self.position_emb(positions)
+         x = self.dropout(x)
+         for layer in self.layers:
+             x = layer(x, attention_mask)
+         cls_token = x[:, 0]
+         logits = self.classifier(cls_token)
+         return {"logits": logits}