Model Loading and Testing Instructions

This document provides step-by-step instructions on how to load our model from the Hugging Face Hub and evaluate it on a test dataset. The following code loads and tests the model in a Colab notebook.

Step 1: Prerequisites

Import the required Python packages:

from huggingface_hub import login
import torch
import torch.nn as nn
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import re
from sklearn.metrics import accuracy_score
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub. Replace the placeholder
# string with your own access token; do not commit a real token to version control.
login("Replace with the key")

Step 2: Define the preprocessing function and dataset class

Run the following class and function, which are designed to preprocess the test data:

class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on demand.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``
            that returns a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples (one per entry in ``texts``)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            A dict with ``"input_ids"`` and ``"attention_mask"`` (the
            tokenizer output with its leading axis removed) and ``"labels"``
            (a ``torch.long`` scalar tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # The tokenizer is expected to return tensors with a leading batch
        # axis of size 1. Remove only that axis: a bare squeeze() would also
        # collapse the sequence axis whenever max_len == 1.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Dollar amounts such as "$5", "$3.2" or "$10 billion"; the optional magnitude
# word is captured so it can be kept in the replacement.
_MONEY_RE = re.compile(
    r'\$\d+\.?\d*\s*(million|billion|trillion)?', flags=re.IGNORECASE
)
# Any run of non-whitespace characters that starts with "http".
_URL_RE = re.compile(r'http\S+')
# Contraction suffixes and their expansions, applied in this order.
_CONTRACTIONS = {
    "n't": " not",
    "'s": " is",
    "'ll": " will",
    "'ve": " have",
}


def preprocess_text(text):
    """Clean and normalise a headline before tokenization.

    Steps, in order: coerce to ``str``; expand a few contraction suffixes;
    replace dollar amounts with ``"$ "`` plus the magnitude word, if any;
    remove URLs; turn hyphens into spaces; lowercase; collapse runs of
    whitespace into single spaces.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.

    Returns:
        The cleaned, lowercased string.
    """
    # NOTE(review): the money and URL patterns now use \d and \S. The previous
    # patterns matched a literal backslash and so never fired; confirm the
    # training pipeline applied the intended patterns as well.
    text = str(text)
    for contraction, expansion in _CONTRACTIONS.items():
        text = text.replace(contraction, expansion)
    text = _MONEY_RE.sub(r'$ \1', text)
    text = _URL_RE.sub('', text)
    text = text.replace('-', ' ')
    text = text.lower()
    return ' '.join(text.split())

Step 3: Load the model and tokenizer from Hugging Face Hub

This step loads the pre-trained model and tokenizer, which are hosted on the Hugging Face Hub.

# Fetch the fine-tuned classifier and its tokenizer from the Hub repository.
print("Loading model and tokenizer...")
REPO_NAME = "CIS5190GoGo/CustomModel"  # Hub repository the model was pushed to
tokenizer = RobertaTokenizer.from_pretrained(REPO_NAME)
model = RobertaForSequenceClassification.from_pretrained(REPO_NAME)

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
print("Model and tokenizer loaded successfully!")

Step 4: Load test dataset

# Read the labelled test set from a CSV file into a DataFrame.
print("Loading test data...")
# NOTE: replace the placeholder below with the path to your test set.
test_data_path = "Replace wit your test set path"
test_data = pd.read_csv(filepath_or_buffer=test_data_path)

Step 5: Preprocess test data

# Clean every headline in the "title" column and pull the "labels" column out
# as an array aligned with it.
X_test = test_data["title"].map(preprocess_text).to_numpy()
y_test = test_data["labels"].to_numpy()

Step 6: Prepare the dataset and dataloader

# Wrap the cleaned texts and labels in the tokenizing dataset, then batch it
# for evaluation (16 examples per batch, 2 loader worker processes).
test_dataset = NewsDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, num_workers=2)

Step 7: Evaluate the model and calculate accuracy

# Score the model on the test loader and report plain accuracy.
print("Evaluating the model...")
model.eval()  # switch the model to evaluation mode
all_preds, all_labels = [], []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch in test_loader:
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits
        # The predicted class is the index of the largest logit.
        batch_preds = logits.argmax(dim=-1)

        all_preds.extend(batch_preds.cpu().numpy())
        all_labels.extend(batch["labels"].to(device).cpu().numpy())

accuracy = accuracy_score(y_true=all_labels, y_pred=all_preds)
print(f"Test Accuracy: {accuracy:.4f}")

Expected output:

Loading model and tokenizer...
Model and tokenizer loaded successfully!
Loading test data...
Evaluating the model...
Test Accuracy: 0.8500