# Model Loading and Testing Instructions

This document provides step-by-step instructions on how to load our model from the Hugging Face Hub and evaluate it on a test dataset. The following code loads and tests the model in a Colab notebook.

---

# Step 1: Prerequisites

1. Import the required Python packages:

```python
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from huggingface_hub import login
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
```

2. Log in by using the account (see our Ed private post & email sent to TAs, thanks!):

```python
login("Replace with the key")
```

# Step 2: Define the preprocessing and dataset class

Run the following class and function, which are designed to preprocess the test data:

```python
class NewsDataset(Dataset):
    """Dataset that pairs each text with its label and tokenizes on access.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class labels, aligned with
            ``texts``.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(text, max_length=..., padding=..., truncation=...,
            return_tensors="pt")``.
        max_len: Length every example is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of tensors."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            # The tokenizer returns a leading batch axis of size 1; drop only
            # that axis so the sequence axis survives even when max_len == 1.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }


def preprocess_text(text):
    """Clean and preprocess text.

    Expands a few common contractions, replaces numeric dollar amounts with a
    bare "$" (keeping any magnitude word), removes URLs, turns hyphens into
    spaces, lowercases, and collapses runs of whitespace.

    Args:
        text: Value to clean; it is converted with ``str()`` first.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    text = str(text)
    contractions = {
        "n't": " not",
        "'s": " is",
        "'ll": " will",
        "'ve": " have",
    }
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)

    # NOTE(review): the two patterns below previously used doubled backslashes
    # (\\d, \\S) inside raw strings, so they only matched a literal backslash
    # and never fired. Confirm that the corrected patterns match the
    # preprocessing applied when the model was trained.
    # "$5 million" -> "$ million", "$3.50" -> "$ "
    text = re.sub(
        r'\$\d+\.?\d*\s*(million|billion|trillion)?',
        r'$ \1',
        text,
        flags=re.IGNORECASE,
    )
    # Remove URLs.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'-', ' ', text)
    text = text.lower()
    # Collapse any run of whitespace into a single space and trim the ends.
    text = ' '.join(text.split())
    return text
```

# Step 3: Load the model and tokenizer from Hugging Face Hub

This step loads the pre-trained model and tokenizer, which are hosted on the Hugging Face Hub.

```python
print("Loading model and tokenizer...")
REPO_NAME = "CIS5190GoGo/CustomModel"  # This is where we pushed the model to
model = RobertaForSequenceClassification.from_pretrained(REPO_NAME)
tokenizer = RobertaTokenizer.from_pretrained(REPO_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model and tokenizer loaded successfully!")
```

# Step 4: Load test dataset

```python
print("Loading test data...")
test_data_path = "Replace with your test set path"  # Note: Replace with your test set path
test_data = pd.read_csv(test_data_path)
```

# Step 5: Preprocess test data

```python
X_test = test_data['title'].apply(preprocess_text).values
y_test = test_data['labels'].values
```

# Step 6: Prepare the dataset and dataloader

```python
test_dataset = NewsDataset(X_test, y_test, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, num_workers=2)
```

# Step 7: Evaluate the model and calculate accuracy

```python
print("Evaluating the model...")
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
accuracy = accuracy_score(all_labels, all_preds)
print(f"Test Accuracy: {accuracy:.4f}")
```

# Expected output:

```text
Loading model and tokenizer...
Model and tokenizer loaded successfully!
Loading test data...
Evaluating the model...
Test Accuracy: 0.8500
```