In [1]:
pip install transformers datasets torch scikit-learn accelerate


Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset
from transformers import RobertaTokenizer
import pandas as pd

# Load IMDB dataset
dataset = load_dataset("imdb")

# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [3]:
from transformers import DataCollatorWithPadding

# Convert labels to PyTorch format
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Data collator to handle padding dynamically
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [4]:
from transformers import RobertaForSequenceClassification

# Load RoBERTa with classification head
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
import transformers
import accelerate

print("Transformers version:", transformers.__version__)
print("Accelerate version:", accelerate.__version__)
import torch
print("Torch version:", torch.__version__)

Transformers version: 4.49.0
Accelerate version: 1.4.0
Torch version: 2.2.1+cu121


In [6]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./roberta_imdb",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=200,
    load_best_model_at_end=True
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Train the model
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3472,0.283987
2,0.2232,0.214965
3,0.1363,0.25088


TrainOutput(global_step=9375, training_loss=0.24904020670572916, metrics={'train_runtime': 2559.5896, 'train_samples_per_second': 29.302, 'train_steps_per_second': 3.663, 'total_flos': 1.9733329152e+16, 'train_loss': 0.24904020670572916, 'epoch': 3.0})

In [7]:
# Evaluate on test set
trainer.evaluate()

# Save model & tokenizer
model.save_pretrained("roberta_imdb_finetuned")
tokenizer.save_pretrained("roberta_imdb_finetuned")


('roberta_imdb_finetuned/tokenizer_config.json',
 'roberta_imdb_finetuned/special_tokens_map.json',
 'roberta_imdb_finetuned/vocab.json',
 'roberta_imdb_finetuned/merges.txt',
 'roberta_imdb_finetuned/added_tokens.json')

In [1]:
import shutil

# Path to the directory
folder_path = "roberta_imdb_finetuned"
zip_filename = "roberta_imdb_finetuned.zip"

# Create a zip archive
shutil.make_archive(zip_filename.replace(".zip", ""), 'zip', folder_path)

print(f"Zipped folder saved as {zip_filename}")

Zipped folder saved as roberta_imdb_finetuned.zip
