In [4]:
#upload the fine_tuned_model.zip and narrative_texts.csv then run the code for evaluation

import zipfile
import os

#if the folder doesn't exist already, then extract the model
if not os.path.exists("fine_tuned_model"):
 with zipfile.ZipFile("fine_tuned_model.zip", 'r') as zip_ref:
 zip_ref.extractall("fine_tuned_model") #extract all model files into the target folder

print("Model extracted successfully.") #confirmation message

Model extracted successfully.


In [5]:
import torch #for deep learning
from transformers import BertTokenizer, BertForSequenceClassification #model training in bert
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score #evaulation metrics
import pandas as pd
import re #regex

#load fine-tuned model and tokenizer
model_path = "./fine_tuned_model"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval() #set model to evaluation mode

#load dataset and normalize the text
df = pd.read_csv("narrative_texts.csv")
df['text'] = df['text'].str.lower() #convert to lowercase
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-z\s]', '', x)) #remove non-alphabetic characters
df['text'] = df['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip()) #clean extra spaces

#function to swap gendered words in text
def gender_swap(text):
 swaps = {
 " he ": " TEMP ", " she ": " he ", " TEMP ": " she ",
 " his ": " TEMP2 ", " her ": " his ", " TEMP2 ": " her ",
 " him ": " TEMP3 ", " her ": " him ", " TEMP3 ": " her "
 }
 for key, value in swaps.items():
 text = text.replace(key, value)
 return text

#generate swapped gender versions of each sentence
df['text_swapped'] = df['text'].apply(lambda x: gender_swap(" " + x + " "))

#create a mixed dataset of original and swapped texts
df_mixed = pd.concat([df['text'], df['text_swapped']], ignore_index=True)
labels_mixed = [0] * len(df) + [1] * len(df) #label 0 for original, 1 for swapped

#function to evaluate model performance
def evaluate_model(texts, labels):
 inputs = tokenizer(texts.tolist(), truncation=True, padding=True, return_tensors="pt", max_length=128)

 with torch.no_grad():
 outputs = model(**inputs)
 logits = outputs.logits
 preds = torch.argmax(logits, dim=1).numpy()

 acc = accuracy_score(labels, preds)
 precision = precision_score(labels, preds)
 recall = recall_score(labels, preds)
 f1 = f1_score(labels, preds)

 return {
 "Accuracy": round(acc, 4),
 "Precision": round(precision, 4),
 "Recall": round(recall, 4),
 "F1 Score": round(f1, 4)
 }

In [7]:
#evaluating the model on both original and gender-swapped text
metrics = evaluate_model(df_mixed, labels_mixed)

#printing out the evaluation results
print("Model Evaluation Results:")
for metric, value in metrics.items():
 print(f"{metric}: {value}") #prints each metric and its value one by one

Model Evaluation Results:
Accuracy: 0.55
Precision: 0.5385
Recall: 0.7
F1 Score: 0.6087
