# Fake News Detection System (Model Training and Evaluation)

In [1]:
# Install required libraries
!pip install datasets transformers scikit-learn nltk pandas numpy torch matplotlib seaborn tqdm

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight
from datasets import load_dataset
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [20]:
# Note: verify the library versions before running the models
import sklearn
import transformers

# One "<label>: <version>" line per library, in a fixed order
for lib_label, lib_version in (
    ("NumPy version", np.__version__),
    ("scikit-learn version", sklearn.__version__),
    ("torch", torch.__version__),
    ("transformers", transformers.__version__),
    ("nltk", nltk.__version__),
):
    print(f"{lib_label}: {lib_version}")

NumPy version: 2.0.2
scikit-learn version: 1.6.1
torch: 2.6.0+cu124
transformers: 4.49.0
nltk: 3.9.1


In [3]:
# Download NLTK resources used by the text-cleaning step:
#   stopwords         -> English stop-word list
#   wordnet           -> data for WordNetLemmatizer
#   punkt / punkt_tab -> tokenizer data for nltk.word_tokenize
#                        (the cleaning cell has to fetch 'punkt_tab' separately,
#                        so it is listed here with the other resources)
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Pick the compute device: CUDA when torch reports it as available, otherwise the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


### 1. LOADING DATASETS

In [5]:
print("LOADING DATASETS...")

# Load LIAR dataset (the "chengxuphd/liar2" dataset via datasets.load_dataset)
print("Loading LIAR dataset...")
liar_dataset = load_dataset("chengxuphd/liar2")
print(f"LIAR dataset loaded. Available splits: {liar_dataset.keys()}")

# Load ISOT dataset (Fake.csv / True.csv read from the mounted Google Drive)
print("Loading ISOT dataset...")
from google.colab import drive  # only importable inside Google Colab
drive.mount('/content/drive')
fake_news_path = '/content/drive/MyDrive/Fake.csv'
true_news_path = '/content/drive/MyDrive/True.csv'
fake_df = pd.read_csv(fake_news_path)
true_df = pd.read_csv(true_news_path)
print("ISOT dataset loaded.")

LOASDING DATASETS...
Loading ISOT dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.87k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

valid.csv:   0%|          | 0.00/2.38M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/2.38M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18369 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2297 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2296 [00:00<?, ? examples/s]

LIAR dataset loaded. Available splits: dict_keys(['train', 'validation', 'test'])
Mounted at /content/drive
ISOT dataset loaded.


### 2. DATA PREPARATION

In [6]:
print("\nPREPARING DATA...")


def liar_split_to_frame(split, real_threshold=4):
    """Convert one LIAR split into a DataFrame with a binary label.

    Args:
        split: iterable of records exposing 'statement' and an integer 'label'.
        real_threshold: original labels >= this value become 1 (real);
            everything below becomes 0 (fake).

    Returns:
        DataFrame with columns statement, label_original, label, source.
    """
    return pd.DataFrame([
        {
            'statement': item['statement'],
            'label_original': item['label'],
            'label': 1 if item['label'] >= real_threshold else 0,
            'source': 'LIAR'
        }
        for item in split
    ])


def print_class_balance(frame):
    """Print the count and percentage of each binary class; return the counts Series."""
    counts = frame['label'].value_counts()
    for class_id, class_name in ((0, 'Fake'), (1, 'Real')):
        # .get() keeps the report working even if one class is absent
        n_rows = counts.get(class_id, 0)
        print(f"{class_name} news ({class_id}): {n_rows} ({n_rows / len(frame) * 100:.2f}%)")
    return counts


# Prepare ISOT dataset. assign() returns new frames, so re-running this cell
# never modifies the raw fake_df / true_df that were loaded earlier.
isot_df = pd.concat(
    [
        fake_df.assign(label=0),  # Fake news
        true_df.assign(label=1),  # Real news
    ],
    ignore_index=True,
)
isot_df['source'] = 'ISOT'  # Add source column
print(f"Combined ISOT dataset shape: {isot_df.shape}")

# Prepare LIAR dataset - Mapping (0-3 = fake, 4-5 = real)
liar_train_df = liar_split_to_frame(liar_dataset['train'])
liar_val_df = liar_split_to_frame(liar_dataset['validation'])
liar_test_df = liar_split_to_frame(liar_dataset['test'])

# Combine the liar dataframes
liar_df = pd.concat([liar_train_df, liar_val_df, liar_test_df], ignore_index=True)
print(f"Combined LIAR dataset shape: {liar_df.shape}")

# Check the class distribution
print("\nLIAR binary class distribution:")
liar_class_counts = print_class_balance(liar_df)

# Rename columns to match
isot_df = isot_df.rename(columns={'text': 'statement'})

# Merge datasets, then shuffle the rows with a fixed seed
isot_selected = isot_df[['statement', 'label', 'source']]
liar_selected = liar_df[['statement', 'label', 'source']]
combined_df = pd.concat([isot_selected, liar_selected], ignore_index=True)
combined_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(f"Combined dataset shape: {combined_df.shape}")

# Show class balance
combined_class_counts = print_class_balance(combined_df)


PREPARING DATA...
Combined ISOT dataset shape: (44898, 6)
Combined LIAR dataset shape: (22962, 4)

LIAR binary class distribution:
Fake news (0): 16948 (73.81%)
Real news (1): 6014 (26.19%)
Combined dataset shape: (67860, 3)
Fake news (0): 40429 (59.58%)
Real news (1): 27431 (40.42%)


### 3. TEXT CLEANING

In [7]:
print("\nCLEANING TEXT...")

# Cell-local re-import of nltk (it is already imported in the first cell), then
# fetch 'punkt_tab' -- presumably the tokenizer data nltk.word_tokenize needs below.
import nltk
nltk.download('punkt_tab')

# Patterns are compiled once and shared by every clean_text call.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCT_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')

# Filled on first use by _cleaning_resources(); avoids re-reading the stop-word
# list and re-creating the lemmatizer for every single document.
_CLEANING_CACHE = {}


def _cleaning_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _CLEANING_CACHE:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Store both entries together so the cache is never half-filled
        _CLEANING_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _CLEANING_CACHE['stop_words'], _CLEANING_CACHE['lemmatizer']


def clean_text(text):
    """Normalize one raw statement for the downstream vectorizer/tokenizer.

    Steps: lowercase; strip URLs, HTML tags, punctuation and digits; tokenize
    with nltk.word_tokenize; drop English stop words; lemmatize each token.

    Args:
        text: the raw statement. Any non-str value (e.g. NaN) is treated as missing.

    Returns:
        The cleaned tokens joined by single spaces, or "" for non-str input.
    """
    # Handle NaN values
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs, HTML tags, punctuation and numbers (in that order)
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCT_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and lemmatize
    stop_words, lemmatizer = _cleaning_resources()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back into text
    return ' '.join(tokens)

# Clean every statement and store the result in a new column
print("Cleaning text...")
cleaned_statements = combined_df['statement'].apply(clean_text)
combined_df['cleaned_statement'] = cleaned_statements

# Preview the first row before and after cleaning (first 100 characters each)
print("\nExample of cleaned text:")
first_row = combined_df.iloc[0]
print(f"Original: {first_row['statement'][:100]}...")
print(f"Cleaned: {first_row['cleaned_statement'][:100]}...")


CLEANING TEXT...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Cleaning text...

Example of cleaned text:
Original:  (Editors note: Attention to language in the second paragraph that some readers may find offensive.)...
Cleaned: editor note attention language second paragraph reader may find offensive reuters local north caroli...


### 4. FEATURE EXTRACTION AND DATA SPLITTING

In [8]:
print("\nFEATURE EXTRACTION AND SPLITTING DATA...")

# Features are the cleaned statements; targets are the binary labels
X = combined_df['cleaned_statement']
y = combined_df['label']

# Stratified 60/20/20 split, done in two steps.
# Step 1: hold out 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 2: hold out 25% of the remaining 80% (20% of all rows) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_temp,
)

for split_name, split_texts in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set: {len(split_texts)} samples")

# TF-IDF Vectorization: fit on the training split only, then transform the others
print("Creating TF-IDF features...")
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(split_texts) for split_texts in (X_val, X_test)
)

# DistilBERT tokenizer setup
print("Setting up DistilBERT tokenizer...")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Dataset class
class NewsDataset(Dataset):
    """Dataset that tokenizes one text per item, at access time.

    texts and labels are indexed positionally through .iloc, so both are
    expected to support it (pandas Series in this notebook).
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' for row idx."""
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        # Every item is padded/truncated to max_length
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # flatten() turns the tokenizer's per-item tensors into 1-D tensors
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(label, dtype=torch.long)
        return item

# Wrap each split in a NewsDataset (shared tokenizer, default max_length)
train_dataset, val_dataset, test_dataset = (
    NewsDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Batch the datasets; only the training loader shuffles
batch_size = 16
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader, test_dataloader = (
    DataLoader(split_dataset, batch_size=batch_size)
    for split_dataset in (val_dataset, test_dataset)
)


FEATURE EXTRACTION AND SPLITTING DATA...
Training set: 40716 samples
Validation set: 13572 samples
Test set: 13572 samples
Creating TF-IDF features...
Setting up DistilBERT tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

### 5. MODEL TRAINING
#### 5.1 Logistic Regression

In [9]:
print("\nTRAINING MODELS...")

# Logistic Regression
print("\n-- Training Logistic Regression --")
# 'balanced' class weights, keyed by the actual class labels rather than by
# array position, so the mapping stays correct even if labels are not 0..n-1.
train_classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train)
class_weight_dict = dict(zip(train_classes.tolist(), class_weights.tolist()))

# Baseline model with default regularization, scored on the validation split
lr_model = LogisticRegression(max_iter=1000, class_weight=class_weight_dict, random_state=42)
lr_model.fit(X_train_tfidf, y_train)
lr_val_preds = lr_model.predict(X_val_tfidf)
lr_val_acc = accuracy_score(y_val, lr_val_preds)
lr_val_f1 = f1_score(y_val, lr_val_preds)
print(f"Validation Accuracy: {lr_val_acc:.4f}")
print(f"Validation F1 Score: {lr_val_f1:.4f}")

# Hyperparameter tuning: 3-fold CV on the training split, selected by F1
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}
grid_search = GridSearchCV(
    LogisticRegression(max_iter=1000, class_weight=class_weight_dict, random_state=42),
    param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1
)

print("Running grid search...")
grid_search.fit(X_train_tfidf, y_train)

print(f"Best parameters: {grid_search.best_params_}")
lr_best_model = grid_search.best_estimator_

# Test set evaluation (precision/recall are reported later in the comparison table)
lr_test_preds = lr_best_model.predict(X_test_tfidf)
lr_test_acc = accuracy_score(y_test, lr_test_preds)
lr_test_precision = precision_score(y_test, lr_test_preds)
lr_test_recall = recall_score(y_test, lr_test_preds)
lr_test_f1 = f1_score(y_test, lr_test_preds)
print("Logistic Regression Test Results:")
print(f"Accuracy: {lr_test_acc:.4f}")
print(f"F1 Score: {lr_test_f1:.4f}")


TRAINING MODELS...

-- Training Logistic Regression --
Validation Accuracy: 0.8943
Validation F1 Score: 0.8650
Running grid search...
Best parameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Logistic Regression Test Results:
Accuracy: 0.9042
F1 Score: 0.8762


#### 5.2 Random Forest

In [10]:
# Random Forest
print("\n-- Training Random Forest --")
import time

# Baseline forest, timed from construction to the end of fit()
rf_start_time = time.time()
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight=class_weight_dict,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train_tfidf, y_train)
rf_train_time = time.time() - rf_start_time
print(f"Random Forest trained in {rf_train_time:.2f} seconds")

# Score the baseline on the validation split
rf_val_preds = rf_model.predict(X_val_tfidf)
rf_val_acc, rf_val_f1 = (
    metric(y_val, rf_val_preds) for metric in (accuracy_score, f1_score)
)
print(f"Validation Accuracy: {rf_val_acc:.4f}")
print(f"Validation F1 Score: {rf_val_f1:.4f}")

# Hyperparameter tuning: 5 random candidates, 3-fold CV, selected by F1
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
random_search = RandomizedSearchCV(
    RandomForestClassifier(class_weight=class_weight_dict, random_state=42),
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
)

print("Running random search...")
rf_tuning_start = time.time()
random_search.fit(X_train_tfidf, y_train)
rf_tuning_time = time.time() - rf_tuning_start
print(f"Random Forest hyperparameter tuning completed in {rf_tuning_time:.2f} seconds")
print(f"Best parameters: {random_search.best_params_}")
rf_best_model = random_search.best_estimator_

# Test set evaluation with the tuned forest
rf_test_preds = rf_best_model.predict(X_test_tfidf)
rf_test_acc = accuracy_score(y_test, rf_test_preds)
rf_test_precision, rf_test_recall, rf_test_f1 = (
    metric(y_test, rf_test_preds) for metric in (precision_score, recall_score, f1_score)
)
print("Random Forest Test Results:")
print(f"Accuracy: {rf_test_acc:.4f}")
print(f"F1 Score: {rf_test_f1:.4f}")


-- Training Random Forest --
Random Forest trained in 121.20 seconds
Validation Accuracy: 0.9077
Validation F1 Score: 0.8738
Running random search...
Random Forest hyperparameter tuning completed in 287.50 seconds
Best parameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
Random Forest Test Results:
Accuracy: 0.9083
F1 Score: 0.8752


#### 5.3 DistilBERT

In [11]:
# DistilBERT model
class DistilBERTClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification head.

    Args:
        dropout_rate: dropout probability applied to the pooled vector.
        model_name: checkpoint passed to DistilBertModel.from_pretrained.
        num_labels: number of output classes (logits per example).
    """

    def __init__(self, dropout_rate=0.2, model_name='distilbert-base-uncased', num_labels=2):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        # Size the head from the encoder's config instead of hard-coding 768,
        # so other DistilBERT checkpoints work without edits.
        self.classifier = nn.Linear(self.distilbert.config.dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of token ids and masks."""
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the pooled representation
        pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Train DistilBERT
print("\n-- Training DistilBERT --")
import time

print("""NOTE: DistilBERT training might take 30-45 minutes if Colab's T4 GPU is used.
      (Depends on no. of epochs+ CPU/GPU/TPU)""")
distilbert_start_time = time.time()
distilbert_model = DistilBERTClassifier().to(device)
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW;
# eps and weight_decay are set to that class's defaults (1e-6 and 0.0) so the
# hyperparameters match the previous setup.
optimizer = torch.optim.AdamW(distilbert_model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
criterion = nn.CrossEntropyLoss()

num_epochs = 2  #####
total_steps = len(train_dataloader) * num_epochs
# Linear decay of the learning rate over all optimizer steps, no warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Training loop
best_val_f1 = 0
best_model_state = None

for epoch in range(num_epochs):
    epoch_start = time.time()
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    distilbert_model.train()
    train_loss = 0

    for batch in tqdm(train_dataloader, desc="Training"):
        # Move batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = distilbert_model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        train_loss += loss.item()

        # Backward pass; gradients are clipped to a max norm of 1.0 before the step
        loss.backward()
        torch.nn.utils.clip_grad_norm_(distilbert_model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    epoch_time = time.time() - epoch_start
    avg_train_loss = train_loss / len(train_dataloader)
    print(f"Training loss: {avg_train_loss:.4f}")
    print(f"Epoch completed in {epoch_time:.2f} seconds")

    # Validation part
    distilbert_model.eval()
    val_loss = 0
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = distilbert_model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            val_preds.extend(preds.cpu().tolist())
            val_labels.extend(labels.cpu().tolist())

    avg_val_loss = val_loss / len(val_dataloader)
    val_acc = accuracy_score(val_labels, val_preds)
    val_f1 = f1_score(val_labels, val_preds)
    print(f"Validation loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {val_acc:.4f}")
    print(f"Validation F1 Score: {val_f1:.4f}")

    # Save best model
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        # state_dict() hands back tensors that share storage with the live
        # parameters, and dict.copy() is shallow, so each tensor is cloned.
        # Without the clone, later epochs would overwrite the saved "best" weights.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in distilbert_model.state_dict().items()
        }
        print(f"New best model saved! F1: {best_val_f1:.4f}")


-- Training DistilBERT --
NOTE: DistilBERT training might take 30-45 minutes if Colab's T4 GPU is used.
      (Depends on no. of epochs+ CPU/GPU/TPU)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]




Epoch 1/2


Training: 100%|██████████| 2545/2545 [07:26<00:00,  5.70it/s]


Training loss: 0.2021
Epoch completed in 446.79 seconds


Validation: 100%|██████████| 849/849 [01:25<00:00,  9.89it/s]


Validation loss: 0.1998
Validation Accuracy: 0.9110
Validation F1 Score: 0.8821
New best model saved! F1: 0.8821

Epoch 2/2


Training: 100%|██████████| 2545/2545 [07:34<00:00,  5.60it/s]


Training loss: 0.1618
Epoch completed in 454.54 seconds


Validation: 100%|██████████| 849/849 [01:26<00:00,  9.83it/s]

Validation loss: 0.2010
Validation Accuracy: 0.9107
Validation F1 Score: 0.8855
New best model saved! F1: 0.8855





In [12]:
# Restore the best checkpoint recorded during training, if there is one
if best_model_state:
    distilbert_model.load_state_dict(best_model_state)

# Wall-clock time elapsed since distilbert_start_time was set
distilbert_total_time = time.time() - distilbert_start_time
print(f"DistilBERT total training time: {distilbert_total_time:.2f} seconds")
print(f"That's about {distilbert_total_time/60:.2f} minutes")

# DistilBERT Eval
print("\n-- Evaluating DistilBERT --")
distilbert_model.eval()
test_preds, test_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )

        outputs = distilbert_model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class is the index of the largest logit in each row
        preds = outputs.argmax(dim=1)

        test_preds += preds.cpu().tolist()
        test_labels += labels.cpu().tolist()

# Calculate metrics
distilbert_acc = accuracy_score(test_labels, test_preds)
distilbert_precision, distilbert_recall, distilbert_f1 = (
    metric(test_labels, test_preds) for metric in (precision_score, recall_score, f1_score)
)
print("DistilBERT Test Results:")
for metric_name, metric_value in (
    ("Accuracy", distilbert_acc),
    ("Precision", distilbert_precision),
    ("Recall", distilbert_recall),
    ("F1 Score", distilbert_f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

DistilBERT total training time: 1079.66 seconds
That's about 17.99 minutes

-- Evaluating DistilBERT --


Testing: 100%|██████████| 849/849 [01:26<00:00,  9.83it/s]


DistilBERT Test Results:
Accuracy: 0.9100
Precision: 0.9194
Recall: 0.8522
F1 Score: 0.8845


### 6. MODEL COMPARISON

In [13]:
print("\nCOMPARING MODELS...")

# Parallel lists: position i in every list belongs to models[i]
models = ['Logistic Regression', 'Random Forest', 'DistilBERT']
accuracy_scores = [lr_test_acc, rf_test_acc, distilbert_acc]
precision_scores = [lr_test_precision, rf_test_precision, distilbert_precision]
recall_scores = [lr_test_recall, rf_test_recall, distilbert_recall]
f1_scores = [lr_test_f1, rf_test_f1, distilbert_f1]

# One row per model, one column per test metric
comparison_df = pd.DataFrame(dict(zip(
    ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
    [models, accuracy_scores, precision_scores, recall_scores, f1_scores],
)))
print("\nModel Performance Comparison:")
print(comparison_df)

# Find best model: the one with the highest test F1
best_model_idx = np.argmax(f1_scores)
best_model_name, best_f1 = models[best_model_idx], f1_scores[best_model_idx]
print(f"\nBest model: {best_model_name} with F1 score: {best_f1:.4f}")


COMPARING MODELS...

Model Performance Comparison:
                 Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.904215   0.917431  0.838498  0.876190
1        Random Forest  0.908341   0.973649  0.794750  0.875151
2           DistilBERT  0.910035   0.919371  0.852169  0.884495

Best model: DistilBERT with F1 score: 0.8845


### 7. VISUALIZATIONS

In [15]:
# VISUALIZATIONS FOR THE FINAL REPORT
print("\nCREATING VISUALIZATIONS FOR FINAL REPORT...")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc
from wordcloud import WordCloud

# Create a 'visualizations' folder
os.makedirs('visualizations', exist_ok=True)


def save_confusion_matrix(y_true, y_pred, model_name, out_path):
    """Draw the Fake/Real confusion matrix as a heatmap and save it to out_path."""
    plt.figure(figsize=(8, 6))
    # labels=[0, 1] pins the row/column order to the Fake/Real tick labels
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Fake', 'Real'], yticklabels=['Fake', 'Real'])
    plt.title(f'{model_name} Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.close()


def save_wordcloud(text, title, out_path):
    """Render a word cloud for text and save it to out_path; skipped for blank text."""
    if not text.strip():
        return
    wordcloud = WordCloud(width=800, height=400, background_color='white',
                          max_words=200, contour_width=3, contour_color='steelblue').generate(text)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.savefig(out_path, dpi=300, bbox_inches='tight')
    plt.close()


# 1. Class distribution visualization
plt.figure(figsize=(10, 6))
pie_labels = ['Fake News', 'Real News']
sizes = [combined_class_counts[0], combined_class_counts[1]]
colors = ['#ff9999', '#66b3ff']
plt.pie(sizes, labels=pie_labels, colors=colors, autopct='%1.1f%%', startangle=90)
plt.axis('equal')
plt.title('Class Distribution in Dataset')
plt.savefig('visualizations/class_distribution.png', dpi=300, bbox_inches='tight')
plt.close()
print("✓ Class distribution visualization saved")

# 2. Confusion matrices for each model
save_confusion_matrix(y_test, lr_test_preds, 'Logistic Regression',
                      'visualizations/lr_confusion_matrix.png')
save_confusion_matrix(y_test, rf_test_preds, 'Random Forest',
                      'visualizations/rf_confusion_matrix.png')
save_confusion_matrix(test_labels, test_preds, 'DistilBERT',
                      'visualizations/distilbert_confusion_matrix.png')
print("✓ Confusion matrices saved")

# 3. Feature importance for Random Forest
if hasattr(rf_best_model, 'feature_importances_'):
    # get feature importance
    importances = rf_best_model.feature_importances_

    # feature names from the TF-IDF vectorizer
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # sort feature importances in descending order
    indices = np.argsort(importances)[::-1]

    # top 20 features (or fewer, if the vocabulary is smaller than that)
    top_k = min(20, len(importances))
    top_indices = indices[:top_k]
    plt.figure(figsize=(12, 8))
    plt.title(f'Top {top_k} Feature Importances - Random Forest')
    plt.barh(range(top_k), importances[top_indices], align='center')
    plt.yticks(range(top_k), [feature_names[i] for i in top_indices])
    plt.xlabel('Relative Importance')
    plt.gca().invert_yaxis()  # Highest importance at top
    plt.savefig('visualizations/feature_importance.png', dpi=300, bbox_inches='tight')
    plt.close()
    print("✓ Feature importance visualization saved")

# 4. Word clouds for fake vs real news
fake_text = ' '.join(combined_df[combined_df['label'] == 0]['cleaned_statement'])
save_wordcloud(fake_text, 'Word Cloud - Fake News', 'visualizations/fake_news_wordcloud.png')
real_text = ' '.join(combined_df[combined_df['label'] == 1]['cleaned_statement'])
save_wordcloud(real_text, 'Word Cloud - Real News', 'visualizations/real_news_wordcloud.png')
print("✓ Word clouds saved")

# 5. Model performance comparison
plt.figure(figsize=(12, 6))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
x = np.arange(len(metrics))
width = 0.25

# One group of bars per metric; each model is shifted by one bar width
for bar_offset, model_scores, model_label, bar_color in (
    (-width, [lr_test_acc, lr_test_precision, lr_test_recall, lr_test_f1],
     'Logistic Regression', '#ff9999'),
    (0, [rf_test_acc, rf_test_precision, rf_test_recall, rf_test_f1],
     'Random Forest', '#66b3ff'),
    (width, [distilbert_acc, distilbert_precision, distilbert_recall, distilbert_f1],
     'DistilBERT', '#99ff99'),
):
    plt.bar(x + bar_offset, model_scores, width, label=model_label, color=bar_color)

plt.ylabel('Score')
plt.title('Model Performance Comparison')
plt.xticks(x, metrics)
plt.ylim(0, 1.0)
plt.legend(loc='lower right')
plt.savefig('visualizations/model_comparison.png', dpi=300, bbox_inches='tight')
plt.close()
print("✓ Model comparison visualization saved")

# 6. ROC curves for all models
plt.figure(figsize=(10, 8))

# Function to plot ROC curve
def plot_roc_curve(y_true, y_pred_prob, label, color, ax=None):
    """Add one model's ROC curve to a plot.

    Args:
        y_true: true binary labels.
        y_pred_prob: scores for the positive class, either 1-D or an
            (n_samples, 2) array whose second column is the positive class.
            Any array-like is accepted (list, ndarray, ...).
        label: legend label; the AUC is appended to it.
        color: line color.
        ax: axes to draw on; defaults to the current matplotlib axes.

    Returns:
        The area under the plotted ROC curve.
    """
    # Lists of per-sample arrays have no .shape, so normalize to an ndarray first
    y_pred_prob = np.asarray(y_pred_prob)
    if y_pred_prob.ndim > 1 and y_pred_prob.shape[1] == 2:
        y_pred_prob = y_pred_prob[:, 1]

    fpr, tpr, _ = roc_curve(y_true, y_pred_prob)
    roc_auc = auc(fpr, tpr)
    target_ax = plt.gca() if ax is None else ax
    target_ax.plot(fpr, tpr, color=color, lw=2, label=f'{label} (AUC = {roc_auc:.2f})')
    return roc_auc

# LR ROC
lr_probs = lr_best_model.predict_proba(X_test_tfidf)
plot_roc_curve(y_test, lr_probs, 'Logistic Regression', '#ff9999')

# RF ROC
rf_probs = rf_best_model.predict_proba(X_test_tfidf)
plot_roc_curve(y_test, rf_probs, 'Random Forest', '#66b3ff')

# DistilBERT ROC - class probabilities come from a softmax over the logits.
# eval() is set explicitly so dropout is off even if cells ran out of order, and
# the labels are collected in the same pass so they line up with the scores.
distilbert_model.eval()
distilbert_probs = []
roc_labels = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = distilbert_model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs, dim=1)
        distilbert_probs.extend(probs.cpu().numpy())
        roc_labels.extend(batch['label'].tolist())

distilbert_probs = np.array(distilbert_probs)
plot_roc_curve(roc_labels, distilbert_probs, 'DistilBERT', '#99ff99')

# Plot diagonal (reference line where TPR equals FPR)
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc='lower right')
plt.grid(True, linestyle='--', alpha=0.7)
plt.savefig('visualizations/roc_curves.png', dpi=300, bbox_inches='tight')
plt.close()
print("✓ ROC curves saved")

# 7. Text length distribution (whitespace-separated word counts of the raw statements)
text_lengths = combined_df['statement'].apply(lambda x: len(str(x).split()))
mean_length = text_lengths.mean()
median_length = text_lengths.median()
plt.figure(figsize=(10, 6))
plt.hist(text_lengths, bins=50, alpha=0.7, color='skyblue')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.title('Text Length Distribution')
plt.axvline(mean_length, color='r', linestyle='--', label=f'Mean: {mean_length:.1f} words')
plt.axvline(median_length, color='g', linestyle='--', label=f'Median: {median_length:.1f} words')
plt.legend()
plt.savefig('visualizations/text_length_distribution.png', dpi=300, bbox_inches='tight')
plt.close()
print("✓ Text length distribution saved")
print("✓ ALL DONE!")


CREATING VISUALIZATIONS FOR FINAL REPORT...
✓ Class distribution visualization saved
✓ Confusion matrices saved
✓ Feature importance visualization saved
✓ Word clouds saved
✓ Model comparison visualization saved
✓ ROC curves saved
✓ Text length distribution saved
✓ ALL DONE!


### 8. SAVE MODELS AND FINAL CONCLUSION

In [16]:
print("\nSAVING MODELS...")

os.makedirs('models', exist_ok=True)

import pickle

# Pickle the fitted scikit-learn objects, one file each
for pkl_path, fitted_obj in (
    ('models/tfidf_vectorizer.pkl', tfidf_vectorizer),
    ('models/lr_model.pkl', lr_best_model),
    ('models/rf_model.pkl', rf_best_model),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_obj, f)

# The DistilBERT classifier is stored as a state_dict via torch.save
torch.save(distilbert_model.state_dict(), 'models/distilbert_model.pt')

print("Models saved!")

# CONCLUSIONS
print("\nCONCLUSIONS")
print(f"Best model: {best_model_name}")
fake_count, real_count = combined_class_counts[0], combined_class_counts[1]
print(f"Dataset stats: {fake_count} fake and {real_count} real news articles")
print("Project Completed!")


SAVING MODELS...
Models saved!

CONCLUSIONS
Best model: DistilBERT
Dataset stats: 40429 fake and 27431 real news articles
Project Completed!
