import gradio as gr
import pickle
import re
import string
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import os
import zipfile
# Unzip local nltk_data.zip if not already unzipped
nltk_data_path = os.path.join(os.path.dirname(__file__), 'nltk_data')
if not os.path.exists(nltk_data_path):
    with zipfile.ZipFile('nltk_data.zip', 'r') as zip_ref:
        zip_ref.extractall(nltk_data_path)
# Tell NLTK to use the local data path
nltk.data.path.append(nltk_data_path)
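# The bundled nltk_data is expected to provide 'punkt', 'stopwords' and 'wordnet',
# which word_tokenize, the stop-word filter and WordNetLemmatizer below rely on.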
# ============ Load Models and Tokenizers ============
with open("logreg_model.pkl", "rb") as f:
logreg_model = pickle.load(f)
with open("nb_model.pkl", "rb") as f:
nb_model = pickle.load(f)
with open("tfidf_vectorizer.pkl", "rb") as f:
tfidf_vectorizer = pickle.load(f)
with open("glove_tokenizer.pkl", "rb") as f:
glove_tokenizer = pickle.load(f)
model_glove = tf.keras.models.load_model("glove_model.h5")
# ============ Constants ============
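# MAX_LENGTH is the pad/truncate length for GloVe model inputs; it is assumed to
# match the sequence length used when glove_model.h5 was trained.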
MAX_LENGTH = 300
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# ============ Preprocessing ============
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)                            # drop bracketed text
    text = re.sub(r'https?://\S+|www\.\S+', '', text)              # drop URLs
    text = re.sub(r'<.*?>+', '', text)                             # drop HTML tags
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)  # drop punctuation
    text = re.sub(r'\n', ' ', text)                                # newlines -> spaces
    text = re.sub(r'\w*\d\w*', '', text)                           # drop tokens containing digits
    text = text.replace('“', '').replace('”', '').replace('‘', "'").replace('’', "'")  # normalize curly quotes
text = re.sub(r"'s\b", '', text)
tokens = word_tokenize(text)
tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
return ' '.join(tokens)
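# Illustrative example of the cleaning pipeline (not from the original source):
#   clean_text("Breaking: 3 officials resign! https://t.co/x")  ->  "breaking official resign"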
# ============ Prediction ============
def predict_ensemble(text):
    cleaned = clean_text(text)
    # Reject inputs that are too short to classify meaningfully
    if len(cleaned.strip()) < 10:
        return "Input too short to analyze."
    # TF-IDF-based predictions
    tfidf_vec = tfidf_vectorizer.transform([cleaned])
    prob_nb = nb_model.predict_proba(tfidf_vec)[0][1]
    prob_logreg = logreg_model.predict_proba(tfidf_vec)[0][1]
    # GloVe prediction
    glove_seq = glove_tokenizer.texts_to_sequences([cleaned])
    glove_pad = pad_sequences(glove_seq, maxlen=MAX_LENGTH, padding='post', truncating='post')
    prob_glove = model_glove.predict(glove_pad)[0][0]
    # Weighted ensemble
    ensemble_score = 0.55 * prob_nb + 0.1 * prob_logreg + 0.35 * prob_glove
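    # NOTE: the 0.55/0.10/0.35 weights and the 0.45 threshold below are fixed
    # constants here; presumably they were tuned offline on a validation set.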
label = "β
Real News" if ensemble_score >= 0.45 else "β Fake News"
    # Report the individual model probabilities alongside the ensemble result
    explanation = f"""
**Naive Bayes:** {prob_nb:.4f}
**Logistic Regression:** {prob_logreg:.4f}
**GloVe Model:** {prob_glove:.4f}
**Ensemble Score:** {ensemble_score:.4f}
**Final Prediction:** {label}
"""
    return explanation
# ============ Gradio Interface ============
interface = gr.Interface(
    fn=predict_ensemble,
    inputs=gr.Textbox(lines=8, placeholder="Paste your news article here...", label="News Article"),
    outputs=gr.Markdown(label="Prediction"),
title="π° Fake News Detector",
description="This tool uses 3 models (Naive Bayes, Logistic Regression, GloVe-based Deep Learning) to classify news as real or fake using an ensemble method.",
allow_flagging="never"
)
if __name__ == "__main__":
    interface.launch()
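# Expected alongside this script: nltk_data.zip, logreg_model.pkl, nb_model.pkl,
# tfidf_vectorizer.pkl, glove_tokenizer.pkl and glove_model.h5 (all loaded above).
# interface.launch() serves the app locally; pass share=True for a temporary public link.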