# RoBERTa-mini / modeling_roberta_sentiment.py
import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig

class RobertaSentimentConfig(PretrainedConfig):
    """Configuration for the compact RoBERTa-style sentiment classifier."""

    model_type = "roberta-sentiment"

    def __init__(self,
                 vocab_size=30000,
                 hidden_size=512,
                 num_attention_heads=8,
                 num_hidden_layers=6,
                 intermediate_size=2048,
                 max_position_embeddings=128,
                 num_labels=5,
                 hidden_dropout_prob=0.1,
                 **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_labels = num_labels
        self.hidden_dropout_prob = hidden_dropout_prob

class TransformerBlock(nn.Module):
    """Pre-norm Transformer encoder block: self-attention followed by a feed-forward network."""

    def __init__(self, hidden_dim, num_heads, ffn_dim, dropout):
        super().__init__()
        self.attn_norm = nn.LayerNorm(hidden_dim)
        self.ffn_norm = nn.LayerNorm(hidden_dim)
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, ffn_dim),
            nn.GELU(),
            nn.Linear(ffn_dim, hidden_dim),
            nn.Dropout(dropout)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask):
        # Self-attention sub-layer with a residual connection.
        x_norm = self.attn_norm(x)
        # key_padding_mask expects True at padded positions; `== 0` works for int or bool masks.
        key_padding_mask = attention_mask == 0
        attn_out, _ = self.attn(x_norm, x_norm, x_norm, key_padding_mask=key_padding_mask)
        x = x + self.dropout(attn_out)
        # Feed-forward sub-layer with a residual connection.
        x_norm = self.ffn_norm(x)
        x = x + self.dropout(self.ffn(x_norm))
        return x

class RobertaForSentimentClassification(PreTrainedModel):
    """Compact RoBERTa-style encoder with a classification head over the first ([CLS]) token."""

    config_class = RobertaSentimentConfig

    def __init__(self, config):
        super().__init__(config)
        self.token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_emb = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.layers = nn.ModuleList([
            TransformerBlock(config.hidden_size, config.num_attention_heads,
                             config.intermediate_size, config.hidden_dropout_prob)
            for _ in range(config.num_hidden_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, config.num_labels)
        )
        self.init_weights()

    def forward(self, input_ids, attention_mask=None):
        batch_size, seq_len = input_ids.size()
        if attention_mask is None:
            # Assume no padding when the caller does not provide a mask.
            attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long, device=input_ids.device)
        # Token embeddings plus learned absolute position embeddings.
        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.token_emb(input_ids) + self.position_emb(positions)
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x, attention_mask)
        # Classify from the hidden state of the first token.
        cls_token = x[:, 0]
        logits = self.classifier(cls_token)
        return {"logits": logits}