import gradio as gr
import pickle
import re
import string
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import os
import zipfile

# Unpack the bundled NLTK data on first start and register it with NLTK.
# Both the archive and the extraction target are resolved relative to this
# script so start-up does not depend on the current working directory.
_app_dir = os.path.dirname(os.path.abspath(__file__))
nltk_data_path = os.path.join(_app_dir, 'nltk_data')
if not os.path.exists(nltk_data_path):
    _nltk_zip_path = os.path.join(_app_dir, 'nltk_data.zip')
    if not os.path.exists(_nltk_zip_path):
        # Backward-compatible fallback: archive located in the working directory.
        _nltk_zip_path = 'nltk_data.zip'
    with zipfile.ZipFile(_nltk_zip_path, 'r') as zip_ref:
        zip_ref.extractall(nltk_data_path)

# Tell NLTK to also search the local data directory.
nltk.data.path.append(nltk_data_path)


# ============ Load Models and Tokenizers ============
def _load_pickle(path):
    """Return the object deserialized from the pickle file at *path*."""
    # NOTE: pickle executes arbitrary code on load; only use trusted artifacts.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Classifiers and the TF-IDF vectorizer that feeds them.
logreg_model = _load_pickle("logreg_model.pkl")
nb_model = _load_pickle("nb_model.pkl")
tfidf_vectorizer = _load_pickle("tfidf_vectorizer.pkl")

# Tokenizer and Keras network used for the GloVe-based prediction.
glove_tokenizer = _load_pickle("glove_tokenizer.pkl")
model_glove = tf.keras.models.load_model("glove_model.h5")

# ============ Constants ============
# Length that tokenized sequences are padded/truncated to for the GloVe model.
MAX_LENGTH = 300
# English stop words held in a set for fast membership checks.
stop_words = {*stopwords.words('english')}
# Single lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

# ============ Preprocessing ============
def clean_text(text):
    """Normalize raw article text into a space-joined string of lemmas.

    The input is stringified and lower-cased, then stripped of bracketed
    spans, URLs, HTML-like tags, ASCII punctuation and any word containing a
    digit; newlines become spaces. Curly double quotes are dropped, curly
    single quotes become ``'`` and a trailing ``'s`` is removed. The result
    is tokenized, stop words and tokens of two characters or fewer are
    discarded, and the remaining tokens are lemmatized.

    Args:
        text: Any value; it is converted with ``str()`` before processing.

    Returns:
        The surviving lemmas joined by single spaces (possibly empty).
    """
    normalized = str(text).lower()

    # Applied strictly in this order. ASCII punctuation (including "'") is
    # removed before the curly-quote conversion below, so the possessive rule
    # only sees apostrophes that started out as curly quotes.
    # NOTE(review): presumably mirrors training-time preprocessing — confirm
    # before reordering any step.
    regex_steps = (
        (r'\[.*?\]', ''),
        (r'https?://\S+|www\.\S+', ''),
        (r'<.*?>+', ''),
        (f"[{re.escape(string.punctuation)}]", ''),
        (r'\n', ' '),
        (r'\w*\d\w*', ''),
    )
    for pattern, replacement in regex_steps:
        normalized = re.sub(pattern, replacement, normalized)

    for curly, plain in (('“', ''), ('”', ''), ("’", "'"), ("‘", "'")):
        normalized = normalized.replace(curly, plain)
    normalized = re.sub(r"'s\b", '', normalized)

    lemmas = []
    for token in word_tokenize(normalized):
        if token in stop_words or len(token) <= 2:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# ============ Prediction ============
# Contribution of each model to the ensemble score (the weights sum to 1.0).
NB_WEIGHT = 0.50
LOGREG_WEIGHT = 0.1
GLOVE_WEIGHT = 0.40
# Ensemble scores at or above this value are labeled as real news.
REAL_NEWS_THRESHOLD = 0.47
# Cleaned inputs with this many characters or fewer are rejected.
MIN_CLEANED_CHARS = 10


def predict_ensemble(text):
    """Classify an article as real or fake with a weighted three-model ensemble.

    The text is cleaned with ``clean_text`` and scored by the Naive Bayes and
    logistic-regression models (on TF-IDF features) and by the GloVe-based
    Keras model (on a padded token sequence). The three scores are combined
    with fixed weights and compared against ``REAL_NEWS_THRESHOLD``.

    Args:
        text: Raw article text as entered by the user.

    Returns:
        A Markdown string listing each model's score, the ensemble score and
        the final label, or the plain message ``"Input too short to analyze."``
        when the cleaned text has ``MIN_CLEANED_CHARS`` characters or fewer.
    """
    cleaned = clean_text(text)

    # Reject inputs that leave almost nothing after cleaning.
    if len(cleaned.strip()) <= MIN_CLEANED_CHARS:
        return "Input too short to analyze."

    # TF-IDF-based predictions. Column 1 is the probability of each model's
    # second class, which this app treats as "real".
    # NOTE(review): confirm against the label encoding used in training.
    tfidf_vec = tfidf_vectorizer.transform([cleaned])
    prob_nb = nb_model.predict_proba(tfidf_vec)[0][1]
    prob_logreg = logreg_model.predict_proba(tfidf_vec)[0][1]

    # GloVe prediction. verbose=0 keeps Keras from printing a progress bar
    # to the server log on every request.
    # NOTE(review): [0][0] assumes a single output unit — confirm.
    glove_seq = glove_tokenizer.texts_to_sequences([cleaned])
    glove_pad = pad_sequences(glove_seq, maxlen=MAX_LENGTH, padding='post', truncating='post')
    prob_glove = model_glove.predict(glove_pad, verbose=0)[0][0]

    # Weighted ensemble
    ensemble_score = (
        NB_WEIGHT * prob_nb + LOGREG_WEIGHT * prob_logreg + GLOVE_WEIGHT * prob_glove
    )
    label = "✅ Real News" if ensemble_score >= REAL_NEWS_THRESHOLD else "❌ Fake News"

    # Model 1 = Naive Bayes, Model 2 = Logistic Regression, Model 3 = GloVe.
    # The two trailing spaces on each line are Markdown hard line breaks.
    explanation = f"""
**Model 1** {prob_nb:.4f}  
**Model 2** {prob_logreg:.4f}  
**Model 3** {prob_glove:.4f}  
**Ensemble Score:** {ensemble_score:.4f}  
**Final Prediction:** {label}
"""
    return explanation

# ============ Gradio Interface ============
# Input and output widgets for the single-function UI.
news_input = gr.Textbox(lines=8, placeholder="Paste your news article here...", label="News Article")
prediction_output = gr.Markdown(label="Prediction")

# Text shown above and below the widgets.
APP_DESCRIPTION = "This tool uses 3 models (Naive Bayes, Logistic Regression, GloVe-based Deep Learning) to classify news as real or fake using an ensemble method."
APP_DISCLAIMER = "⚠️ **Disclaimer:** This demo is for educational and experimental purposes only. It is not suitable for real-world fact-checking or decision-making. Please do not rely on this tool."

# Wire predict_ensemble to the widgets; flagging is turned off.
interface = gr.Interface(
    fn=predict_ensemble,
    inputs=news_input,
    outputs=prediction_output,
    title="📰 Fake News Detector",
    description=APP_DESCRIPTION,
    article=APP_DISCLAIMER,
    allow_flagging="never",
)

# Start the web app only when executed as a script, not when imported.
if __name__ == "__main__":
    interface.launch()