|
import torch
import torch.nn as nn

from transformers import PreTrainedModel, PretrainedConfig


class RobertaSentimentConfig(PretrainedConfig):
    """Configuration for the compact RoBERTa-style sentiment classifier."""

    model_type = "roberta-sentiment"

    def __init__(self,
                 vocab_size=30000,
                 hidden_size=512,
                 num_attention_heads=8,
                 num_hidden_layers=6,
                 intermediate_size=2048,
                 max_position_embeddings=128,
                 num_labels=5,
                 hidden_dropout_prob=0.1,
                 **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.max_position_embeddings = max_position_embeddings
        self.num_labels = num_labels
        self.hidden_dropout_prob = hidden_dropout_prob
|
|
|
class TransformerBlock(nn.Module):
    """A single pre-norm encoder block: self-attention followed by a feed-forward network."""

    def __init__(self, hidden_dim, num_heads, ffn_dim, dropout):
        super().__init__()
        self.attn_norm = nn.LayerNorm(hidden_dim)
        self.ffn_norm = nn.LayerNorm(hidden_dim)
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, dropout=dropout, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, ffn_dim),
            nn.GELU(),
            nn.Linear(ffn_dim, hidden_dim),
            nn.Dropout(dropout)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attention_mask):
        # Pre-norm: LayerNorm is applied before each sub-layer, and the
        # sub-layer output is added back through a residual connection.
        x_norm = self.attn_norm(x)
        # nn.MultiheadAttention expects key_padding_mask to be True at padded
        # positions, i.e. the inverse of the Hugging Face attention_mask.
        key_padding_mask = attention_mask == 0
        attn_out, _ = self.attn(x_norm, x_norm, x_norm, key_padding_mask=key_padding_mask)
        x = x + self.dropout(attn_out)
        x_norm = self.ffn_norm(x)
        x = x + self.dropout(self.ffn(x_norm))
        return x
|
|
|
class RobertaForSentimentClassification(PreTrainedModel):
    """Transformer encoder with a classification head on the first-token representation."""

    config_class = RobertaSentimentConfig

    def __init__(self, config):
        super().__init__(config)

        # Token embeddings plus learned absolute position embeddings.
        self.token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_emb = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Stack of identical encoder blocks.
        self.layers = nn.ModuleList([
            TransformerBlock(config.hidden_size, config.num_attention_heads,
                             config.intermediate_size, config.hidden_dropout_prob)
            for _ in range(config.num_hidden_layers)
        ])

        # Two-layer classification head producing one logit per label.
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(config.hidden_size, config.num_labels)
        )

        self.init_weights()

    def forward(self, input_ids, attention_mask=None):
        batch_size, seq_len = input_ids.size()
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # Sequences longer than max_position_embeddings (128 by default) must be
        # truncated upstream, since the position table has no entries beyond that.
        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
        x = self.token_emb(input_ids) + self.position_emb(positions)
        x = self.dropout(x)

        for layer in self.layers:
            x = layer(x, attention_mask)

        # Classify from the first token (the <s> / [CLS] position).
        cls_token = x[:, 0]
        logits = self.classifier(cls_token)
        return {"logits": logits}
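

# --- Usage sketch (added illustration, not part of the original listing) ---
# A minimal smoke test, assuming the classes above are used as written: build a
# config, instantiate the model, and run a forward pass on a dummy batch. The
# batch size, sequence length, padding pattern, and directory name below are
# arbitrary illustrative choices.
if __name__ == "__main__":
    config = RobertaSentimentConfig()
    model = RobertaForSentimentClassification(config)
    model.eval()

    batch_size, seq_len = 2, 16
    dummy_input_ids = torch.randint(0, config.vocab_size, (batch_size, seq_len))
    dummy_attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
    dummy_attention_mask[:, -4:] = 0  # pretend the last four positions are padding

    with torch.no_grad():
        outputs = model(input_ids=dummy_input_ids, attention_mask=dummy_attention_mask)

    # Expected shape: (batch_size, num_labels), i.e. torch.Size([2, 5]) with the defaults.
    print(outputs["logits"].shape)

    # Because the model subclasses PreTrainedModel, the usual serialization
    # round trip is available as well:
    # model.save_pretrained("roberta-sentiment-demo")
    # model = RobertaForSentimentClassification.from_pretrained("roberta-sentiment-demo")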
|
|