import torch
import torch.nn as nn
import torch.nn.functional as F


class TransformerBlock(nn.Module):
    def __init__(self, hidden_dim, num_heads, ffn_dim, dropout):
        super().__init__()
        self.attn_norm = nn.LayerNorm(hidden_dim)
        self.ffn_norm = nn.LayerNorm(hidden_dim)
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, ffn_dim),
            nn.GELU(),
            nn.Linear(ffn_dim, hidden_dim),
            nn.Dropout(dropout)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask):
        batch_size, seq_len, _ = x.size()

        # Pre-norm self-attention; no transpose needed since batch_first=True
        x_norm = self.attn_norm(x)

        # key_padding_mask expects True at padded positions, so invert the
        # HuggingFace-style attention_mask (1 = real token, 0 = padding)
        attn_mask = (1 - attention_mask).bool()  # [batch_size, seq_len]
        assert attn_mask.shape == (batch_size, seq_len), \
            f"Expected {batch_size=} and {seq_len=}, got {attn_mask.shape}"

        # Run self-attention (no transpose)
        attn_out, _ = self.attn(
            x_norm, x_norm, x_norm,
            key_padding_mask=attn_mask
        )

        # Residual + feed-forward
        x = x + self.dropout(attn_out)
        x_norm = self.ffn_norm(x)
        x = x + self.dropout(self.ffn(x_norm))
        return x


class RobertaForSentimentClassification(nn.Module):
    def __init__(self, vocab_size, max_len=128, num_classes=5):
        super().__init__()
        self.hidden_size = 512
        self.max_len = max_len
        self.num_heads = 8
        self.ffn_dim = 2048
        self.num_layers = 6
        self.dropout_rate = 0.1

        # Embeddings
        self.token_emb = nn.Embedding(vocab_size, self.hidden_size)
        self.position_emb = nn.Embedding(max_len, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_rate)

        # Transformer blocks
        self.layers = nn.ModuleList([
            TransformerBlock(self.hidden_size, self.num_heads, self.ffn_dim, self.dropout_rate)
            for _ in range(self.num_layers)
        ])

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.GELU(),
            nn.Dropout(self.dropout_rate),
            nn.Linear(self.hidden_size, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        batch_size, seq_len = input_ids.size()

        # Embeddings: token + learned absolute position embeddings
        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.token_emb(input_ids) + self.position_emb(positions)
        x = self.dropout(x)

        # Transformer blocks
        for layer in self.layers:
            x = layer(x, attention_mask)

        # Use the first-position (<s> / CLS) token representation for classification
        cls_token = x[:, 0]  # shape: (batch_size, hidden_size)
        logits = self.classifier(cls_token)
        return logits
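

if __name__ == "__main__":
    # Minimal smoke test (a sketch, not part of the original model code):
    # run one forward pass on random token ids to check shapes end to end.
    # vocab_size=50265 (RoBERTa-base's vocabulary size) and the batch/sequence
    # sizes below are illustrative assumptions, not values from the code above.
    torch.manual_seed(0)

    model = RobertaForSentimentClassification(vocab_size=50265, max_len=128, num_classes=5)
    model.eval()

    batch_size, seq_len = 4, 16
    input_ids = torch.randint(0, 50265, (batch_size, seq_len))

    # HuggingFace-style mask: 1 = real token, 0 = padding.
    # Here the last 4 positions of every sequence are treated as padding.
    attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
    attention_mask[:, -4:] = 0

    with torch.no_grad():
        logits = model(input_ids, attention_mask)

    print(logits.shape)  # expected: torch.Size([4, 5]) -- one logit per sentiment class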