Upload 2 files
- app.py +101 -0
- requirements.txt +5 -0
app.py
ADDED
@@ -0,0 +1,101 @@
import gradio as gr
import pickle
import re
import string
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
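# Note (assumption, not in the original): on newer NLTK releases (>= 3.8.2),
# word_tokenize may also require nltk.download('punkt_tab').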

# ============ Load Models and Tokenizers ============
with open("logreg_model.pkl", "rb") as f:
    logreg_model = pickle.load(f)

with open("nb_model.pkl", "rb") as f:
    nb_model = pickle.load(f)

with open("tfidf_vectorizer.pkl", "rb") as f:
    tfidf_vectorizer = pickle.load(f)

with open("glove_tokenizer.pkl", "rb") as f:
    glove_tokenizer = pickle.load(f)

model_glove = tf.keras.models.load_model("glove_model.h5")

# ============ Constants ============
MAX_LENGTH = 300
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
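# Note (assumption): the .pkl and .h5 artifacts above are expected next to
# app.py in the Space repo. scikit-learn pickles are version-sensitive, so the
# installed scikit-learn should match the version the models were trained with.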

# ============ Preprocessing ============
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = text.replace('“', '').replace('”', '').replace('‘', "'").replace('’', "'")
    text = re.sub(r"'s\b", '', text)

    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 2]
    return ' '.join(tokens)
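# Illustrative example (output depends on the installed NLTK data):
#   clean_text("Breaking News!! Visit https://example.com for 100% details")
#   -> "breaking news visit detail"
# The URL, punctuation, digits, and stopword "for" are stripped; "details"
# is lemmatized to "detail".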

# ============ Prediction ============
def predict_ensemble(text):
    cleaned = clean_text(text)

    # Check if cleaned text is too short
    if len(cleaned.strip()) < 10:
        return "Input too short to analyze."

    # TF-IDF-based predictions
    tfidf_vec = tfidf_vectorizer.transform([cleaned])
    prob_nb = nb_model.predict_proba(tfidf_vec)[0][1]
    prob_logreg = logreg_model.predict_proba(tfidf_vec)[0][1]

    # GloVe prediction
    glove_seq = glove_tokenizer.texts_to_sequences([cleaned])
    glove_pad = pad_sequences(glove_seq, maxlen=MAX_LENGTH, padding='post', truncating='post')
    prob_glove = model_glove.predict(glove_pad)[0][0]

    # Weighted ensemble
    ensemble_score = 0.55 * prob_nb + 0.1 * prob_logreg + 0.35 * prob_glove
    label = "✅ Real News" if ensemble_score >= 0.45 else "❌ Fake News"

    # Include per-model probabilities alongside the final verdict
    explanation = f"""
**Naive Bayes:** {prob_nb:.4f}
**Logistic Regression:** {prob_logreg:.4f}
**GloVe Model:** {prob_glove:.4f}
**Ensemble Score:** {ensemble_score:.4f}
**Final Prediction:** {label}
"""
    return explanation
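# Worked example of the weighting above: with prob_nb=0.8, prob_logreg=0.6,
# prob_glove=0.5, the score is 0.55*0.8 + 0.1*0.6 + 0.35*0.5 = 0.675, which
# clears the 0.45 threshold and is labeled Real News.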

# ============ Gradio Interface ============
interface = gr.Interface(
    fn=predict_ensemble,
    inputs=gr.Textbox(lines=8, placeholder="Paste your news article here...", label="News Article"),
    outputs=gr.Markdown(label="Prediction"),
    title="📰 Fake News Detector",
    description="This tool uses 3 models (Naive Bayes, Logistic Regression, GloVe-based Deep Learning) to classify news as real or fake using an ensemble method.",
    allow_flagging="never"
)

if __name__ == "__main__":
    interface.launch()
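# Local usage (sketch): with the five model artifacts in the working directory,
# `python app.py` serves the interface on Gradio's default http://127.0.0.1:7860.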
requirements.txt
ADDED
@@ -0,0 +1,5 @@
gradio
tensorflow
scikit-learn
nltk
numpy
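The requirements are unpinned, so upstream releases can break the Space: newer Gradio versions deprecate `allow_flagging`, and pickled scikit-learn models are version-sensitive. A pinned variant is sketched below; these exact versions are assumptions and should be replaced with whatever the models were actually trained against:

gradio==3.50.2
tensorflow==2.15.0
scikit-learn==1.3.2
nltk==3.8.1
numpy==1.26.4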