""" NLTK Text Processing Playground (Gradio) Features - Type text or drag-and-drop a .txt or .docx file - Menu of steps: 1) Install & download required NLTK resources 2) Tokenize text 3) Remove stopwords 4) Stem words (Porter) 5) Lemmatize words (WordNet) 6) Tag parts of speech 7) Extract named entities - Prints results to screen for only the steps you select """ import io import os import re from typing import List, Tuple, Optional import gradio as gr # --- NLTK imports are inside functions so the app can start even if resources aren't ready --- SUPPORTED_EXTS = {".txt", ".docx"} def read_text_from_inputs(text_input: str, file_obj: Optional[gr.File]) -> str: """ Returns a single text string from either the text box or the uploaded file. If both are provided, file content takes precedence. """ if file_obj is not None: name = getattr(file_obj, "name", None) or "" ext = os.path.splitext(name)[1].lower() file_bytes = file_obj.read() if ext == ".txt": try: return file_bytes.decode("utf-8") except Exception: # Fallback: best-effort decode return file_bytes.decode(errors="ignore") elif ext == ".docx": from docx import Document # python-docx with io.BytesIO(file_bytes) as buf: doc = Document(buf) return "\n".join(p.text for p in doc.paragraphs) else: raise gr.Error(f"Unsupported file type: {ext}. Use one of: {', '.join(SUPPORTED_EXTS)}.") # fallback to text area return text_input or "" def setup_nltk() -> str: """ Installs/downloads the corpora and models needed for the lab. Safe to run multiple times; NLTK skips existing files. """ import nltk downloaded = [] for pkg in [ "punkt", "stopwords", "wordnet", "averaged_perceptron_tagger", "maxent_ne_chunker", "words", ]: try: nltk.download(pkg, quiet=True) downloaded.append(pkg) except Exception as e: downloaded.append(f"{pkg} (error: {e})") return "NLTK resources ready:\n- " + "\n- ".join(downloaded) def pipeline(text: str, steps: List[str]) -> str: """ Runs the selected steps in a fixed logical order and returns a Markdown report. """ if not text.strip(): return "⚠️ **No input text found.** Type text or upload a .txt/.docx file." 
    # Import here (after optional setup) to avoid early failures
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from nltk import pos_tag
    from nltk.chunk import ne_chunk

    report_sections = []
    current_tokens = []
    filtered_tokens = []
    stemmed_tokens = []
    lemmatized_tokens = []

    # 1) Tokenize
    if "Tokenize text" in steps:
        current_tokens = word_tokenize(text)
        report_sections.append(
            "### 1) Tokens\n```\n" + repr(current_tokens) + "\n```"
        )

    # 2) Stopword removal (case-insensitive)
    if "Remove stopwords" in steps:
        if not current_tokens:
            current_tokens = word_tokenize(text)
        stop_words = set(stopwords.words("english"))
        filtered_tokens = [w for w in current_tokens if w.lower() not in stop_words]
        report_sections.append(
            "### 2) Filtered (stopwords removed)\n```\n" + repr(filtered_tokens) + "\n```"
        )

    # 3) Stem
    if "Stem words" in steps:
        if not filtered_tokens:
            # If the user skipped stopword removal, stem the tokens directly
            base = current_tokens or word_tokenize(text)
        else:
            base = filtered_tokens
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(w) for w in base]
        report_sections.append(
            "### 3) Stemmed (Porter)\n```\n" + repr(stemmed_tokens) + "\n```"
        )

    # 4) Lemmatize
    if "Lemmatize words" in steps:
        if not filtered_tokens:
            base = current_tokens or word_tokenize(text)
        else:
            base = filtered_tokens
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in base]
        report_sections.append(
            "### 4) Lemmatized (WordNet, default POS=noun)\n```\n"
            + repr(lemmatized_tokens)
            + "\n```"
        )

    # Choose a reasonable token sequence for downstream steps
    downstream = (
        lemmatized_tokens
        or stemmed_tokens
        or filtered_tokens
        or current_tokens
        or word_tokenize(text)
    )

    # 5) POS tagging
    if "Tag parts of speech" in steps:
        tags = pos_tag(downstream)
        report_sections.append("### 5) POS Tags\n```\n" + repr(tags) + "\n```")

    # 6) Named entities
    if "Extract named entities" in steps:
        # ne_chunk expects POS-tagged input
        tagged = pos_tag(downstream)
        tree = ne_chunk(tagged)
        # Pretty-print the chunk tree as text
        report_sections.append(
            "### 6) Named Entities (chunk tree)\n```\n" + tree.pformat() + "\n```"
        )

    if not report_sections:
        return "ℹ️ **No steps selected.** Choose at least one item from the menu."

    header = "# NLTK Processing Report\n"
    return header + "\n\n".join(report_sections)


with gr.Blocks(title="NLTK Text Processing Playground") as demo:
    gr.Markdown(
        """
        # NLTK Text Processing Playground

        **Type text** _or_ **drop a `.txt` / `.docx` file**.
        Select the steps you want and click **Process**.
        If this is your first time, click **Prepare NLTK Resources**.
""" ) with gr.Row(): text_in = gr.Textbox( label="Text input", placeholder="Type or paste text here (or upload a file instead)…", lines=8, ) file_in = gr.File( label="Optional: upload a .txt or .docx file", file_types=[".txt", ".docx"], file_count="single", ) steps = gr.CheckboxGroup( choices=[ "Tokenize text", "Remove stopwords", "Stem words", "Lemmatize words", "Tag parts of speech", "Extract named entities", ], value=["Tokenize text", "Remove stopwords", "Lemmatize words", "Tag parts of speech", "Extract named entities"], label="Menu — select one or more steps", ) with gr.Row(): setup_btn = gr.Button("🧰 Prepare NLTK Resources") run_btn = gr.Button("▶️ Process") output = gr.Markdown(label="Output") # Wire up actions setup_btn.click(fn=lambda: setup_nltk(), outputs=output) def run_pipeline(text_input, file_input, selected_steps): text = read_text_from_inputs(text_input, file_input) return pipeline(text, selected_steps) run_btn.click( fn=run_pipeline, inputs=[text_in, file_in, steps], outputs=output, ) if __name__ == "__main__": # Launch Gradio app. Share=False by default; set share=True if you want a public link. demo.launch()