import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig


class RobertaSentimentConfig(PretrainedConfig):
    """Configuration for the compact RoBERTa-style sentiment classifier."""

    model_type = "roberta-sentiment"

    def __init__(self, vocab_size=30000, hidden_size=512, num_attention_heads=8,
                 num_hidden_layers=6, intermediate_size=2048,
                 max_position_embeddings=128, num_labels=5,
                 hidden_dropout_prob=0.1, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_labels = num_labels
        self.hidden_dropout_prob = hidden_dropout_prob


class TransformerBlock(nn.Module):
    """Pre-norm transformer encoder block: self-attention followed by a feed-forward sub-layer."""

    def __init__(self, hidden_dim, num_heads, ffn_dim, dropout):
        super().__init__()
        self.attn_norm = nn.LayerNorm(hidden_dim)
        self.ffn_norm = nn.LayerNorm(hidden_dim)
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout,
                                          batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, ffn_dim),
            nn.GELU(),
            nn.Linear(ffn_dim, hidden_dim),
            nn.Dropout(dropout),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask):
        # Self-attention sub-layer with residual connection.
        x_norm = self.attn_norm(x)
        # key_padding_mask expects True at padded positions (attention_mask == 0).
        key_padding_mask = attention_mask == 0
        attn_out, _ = self.attn(x_norm, x_norm, x_norm,
                                key_padding_mask=key_padding_mask)
        x = x + self.dropout(attn_out)
        # Feed-forward sub-layer with residual connection.
        x_norm = self.ffn_norm(x)
        x = x + self.dropout(self.ffn(x_norm))
        return x


class RobertaForSentimentClassification(PreTrainedModel):
    config_class = RobertaSentimentConfig

    def __init__(self, config):
        super().__init__(config)
        self.token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_emb = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.layers = nn.ModuleList([
            TransformerBlock(config.hidden_size, config.num_attention_heads,
                             config.intermediate_size, config.hidden_dropout_prob)
            for _ in range(config.num_hidden_layers)
        ])
        # Classification head applied to the hidden state of the first token.
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, config.num_labels),
        )
        self.init_weights()

    def forward(self, input_ids, attention_mask):
        batch_size, seq_len = input_ids.size()
        # Learned absolute position embeddings, indexed 0..seq_len-1.
        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.token_emb(input_ids) + self.position_emb(positions)
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x, attention_mask)
        # Pool by taking the first token's hidden state, then classify.
        cls_token = x[:, 0]
        logits = self.classifier(cls_token)
        return {"logits": logits}
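

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): instantiate the
    # model with the default config and run a forward pass on a random batch.
    # The batch size, sequence length, and padded positions below are arbitrary
    # example values chosen for illustration.
    config = RobertaSentimentConfig()
    model = RobertaForSentimentClassification(config)
    model.eval()

    batch_size, seq_len = 2, 16
    input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))
    attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
    attention_mask[:, -4:] = 0  # treat the last 4 positions as padding

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # One logit per sentiment class for each sequence in the batch.
    print(outputs["logits"].shape)  # expected: torch.Size([2, 5])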