dilip025
/

RoBERTa-mini

Text Classification

roberta-sentiment

sentiment-classification

Model card Files Files and versions

RoBERTa-mini / modeling_roberta_sentiment.py

dilip025's picture

Upload modeling_roberta_sentiment.py

6a9cab5 verified 3 months ago

3.07 kB

	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	class TransformerBlock(nn.Module):
	    """Pre-LayerNorm Transformer encoder block: self-attention, then feed-forward.

	    Each sub-layer normalizes its input with its own ``LayerNorm`` and adds
	    its dropped-out output back onto the un-normalized residual stream.

	    Args:
	        hidden_dim: Size of the last dimension of the block's input and output.
	        num_heads: Number of heads given to ``nn.MultiheadAttention``.
	        ffn_dim: Width of the hidden layer of the feed-forward network.
	        dropout: Dropout probability shared by the attention module, the
	            feed-forward network and both residual branches.
	    """

	    def __init__(self, hidden_dim: int, num_heads: int, ffn_dim: int, dropout: float) -> None:
	        super().__init__()
	        # Attribute names and Sequential indices determine the state_dict
	        # keys, so they must stay as they are for saved weights to load.
	        self.attn_norm = nn.LayerNorm(hidden_dim)
	        self.ffn_norm = nn.LayerNorm(hidden_dim)
	        self.attn = nn.MultiheadAttention(
	            hidden_dim, num_heads, dropout=dropout, batch_first=True
	        )
	        self.ffn = nn.Sequential(
	            nn.Linear(hidden_dim, ffn_dim),
	            nn.GELU(),
	            nn.Linear(ffn_dim, hidden_dim),
	            nn.Dropout(dropout),
	        )
	        self.dropout = nn.Dropout(dropout)

	    def forward(
	        self,
	        x: torch.Tensor,
	        attention_mask: "torch.Tensor | None" = None,
	    ) -> torch.Tensor:
	        """Apply self-attention and the feed-forward network to ``x``.

	        Args:
	            x: Tensor of shape ``(batch_size, seq_len, hidden_dim)``.
	            attention_mask: Optional tensor of shape ``(batch_size, seq_len)``
	                in which entries equal to 1 (or ``True``) mark tokens that may
	                be attended to; every other entry is treated as padding.
	                ``None`` means that no position is masked.

	        Returns:
	            Tensor with the same shape as ``x``.

	        Raises:
	            ValueError: If ``attention_mask`` is not ``(batch_size, seq_len)``.
	        """
	        batch_size, seq_len, _ = x.size()

	        key_padding_mask = None
	        if attention_mask is not None:
	            # Explicit raise instead of ``assert`` so the check survives
	            # ``python -O``.
	            if attention_mask.shape != (batch_size, seq_len):
	                raise ValueError(
	                    f"Expected {batch_size=} and {seq_len=}, got {attention_mask.shape}"
	                )
	            # nn.MultiheadAttention expects True where a key must be ignored.
	            # ``!= 1`` matches ``(1 - mask).bool()`` for numeric masks and,
	            # unlike the subtraction, is also defined for boolean masks.
	            key_padding_mask = attention_mask != 1

	        # Self-attention sub-layer; no transpose needed since batch_first=True.
	        normed = self.attn_norm(x)
	        attn_out, _ = self.attn(
	            normed, normed, normed,
	            key_padding_mask=key_padding_mask,
	        )
	        x = x + self.dropout(attn_out)

	        # Feed-forward sub-layer.
	        # NOTE: in training mode the FFN output goes through dropout twice
	        # (the last layer of self.ffn and self.dropout); kept as-is so that
	        # training behaviour does not change.
	        x = x + self.dropout(self.ffn(self.ffn_norm(x)))

	        return x

	class RobertaForSentimentClassification(nn.Module):
	    """Transformer encoder with a classification head on the first token.

	    Token and learned absolute position embeddings are summed, passed through
	    a stack of ``TransformerBlock`` layers, and the hidden state at sequence
	    position 0 is fed to a two-layer MLP that produces the class logits.

	    Args:
	        vocab_size: Number of rows in the token embedding table.
	        max_len: Number of rows in the position embedding table, i.e. the
	            longest sequence ``forward`` accepts.
	        num_classes: Number of output logits.
	        hidden_size: Embedding / hidden dimension of the encoder.
	        num_heads: Attention heads per Transformer block.
	        ffn_dim: Hidden width of each block's feed-forward network.
	        num_layers: Number of Transformer blocks.
	        dropout_rate: Dropout probability used throughout the model.

	    The keyword-only arguments default to 512 / 8 / 2048 / 6 / 0.1, so
	    constructing the model with only the first three arguments yields the
	    same architecture and parameter shapes as before they were configurable.
	    """

	    def __init__(
	        self,
	        vocab_size: int,
	        max_len: int = 128,
	        num_classes: int = 5,
	        *,
	        hidden_size: int = 512,
	        num_heads: int = 8,
	        ffn_dim: int = 2048,
	        num_layers: int = 6,
	        dropout_rate: float = 0.1,
	    ) -> None:
	        super().__init__()
	        self.hidden_size = hidden_size
	        self.max_len = max_len
	        self.num_heads = num_heads
	        self.ffn_dim = ffn_dim
	        self.num_layers = num_layers
	        self.dropout_rate = dropout_rate

	        # Embeddings
	        self.token_emb = nn.Embedding(vocab_size, self.hidden_size)
	        self.position_emb = nn.Embedding(max_len, self.hidden_size)
	        self.dropout = nn.Dropout(self.dropout_rate)

	        # Transformer blocks
	        self.layers = nn.ModuleList([
	            TransformerBlock(self.hidden_size, self.num_heads, self.ffn_dim, self.dropout_rate)
	            for _ in range(self.num_layers)
	        ])

	        # Classification head
	        self.classifier = nn.Sequential(
	            nn.Linear(self.hidden_size, self.hidden_size),
	            nn.GELU(),
	            nn.Dropout(self.dropout_rate),
	            nn.Linear(self.hidden_size, num_classes),
	        )

	    def forward(
	        self,
	        input_ids: torch.Tensor,
	        attention_mask: "torch.Tensor | None" = None,
	    ) -> torch.Tensor:
	        """Compute class logits for a batch of token id sequences.

	        Args:
	            input_ids: Integer tensor of shape ``(batch_size, seq_len)``.
	            attention_mask: Optional tensor of shape ``(batch_size, seq_len)``
	                with 1 for real tokens and 0 for padding. When omitted, every
	                position is treated as a real token.

	        Returns:
	            Tensor of shape ``(batch_size, num_classes)`` with unnormalized
	            logits.

	        Raises:
	            ValueError: If ``seq_len`` exceeds ``max_len``.
	        """
	        _, seq_len = input_ids.size()

	        # Fail early with a readable message; otherwise the position lookup
	        # below would index past the end of the embedding table.
	        if seq_len > self.max_len:
	            raise ValueError(
	                f"Sequence length {seq_len} exceeds the maximum supported "
	                f"length {self.max_len}"
	            )

	        if attention_mask is None:
	            attention_mask = torch.ones_like(input_ids)

	        # Embeddings: the (seq_len, hidden) position embeddings broadcast
	        # over the batch dimension of the token embeddings.
	        positions = torch.arange(seq_len, device=input_ids.device)
	        x = self.token_emb(input_ids) + self.position_emb(positions)
	        x = self.dropout(x)

	        # Transformer blocks
	        for layer in self.layers:
	            x = layer(x, attention_mask)

	        # Classify from the first position (assumed to hold the <s> token;
	        # this depends on the tokenizer used to build input_ids).
	        cls_token = x[:, 0]  # shape: (batch_size, hidden_size)
	        return self.classifier(cls_token)