# Source: Hugging Face Spaces - eaglelandsonce / NLTK_Workflow (status: Running)
# File size: 9,931 bytes

import io
import os
from typing import List, Tuple, Union

import gradio as gr
import nltk

# --- NLTK resources (cover both old & new names) -----------------------------
# Resource ids handed to nltk.download(), one per line. Where NLTK renamed a
# resource, both the older and the newer id are listed.
NLTK_PACKAGES = [
    "punkt",                            # tokenizer (older id)
    "punkt_tab",                        # tokenizer (newer id)
    "stopwords",                        # stopword lists
    "wordnet",                          # lemma database
    "omw-1.4",                          # listed alongside wordnet for lemmas
    "averaged_perceptron_tagger",       # POS tagger (older id)
    "averaged_perceptron_tagger_eng",   # POS tagger (newer, English-specific id)
    "maxent_ne_chunker",                # NE chunker (older id)
    "maxent_ne_chunker_tab",            # NE chunker (newer id)
    "words",                            # word list used by the NE chunker
]

def ensure_nltk_resources() -> str:
    """Download every package in ``NLTK_PACKAGES`` and summarise the outcome.

    Each package id is passed to ``nltk.download(..., quiet=True)``. The call
    is attempted for every id even if an earlier one fails, so one missing
    package never prevents the others from being fetched.

    Returns:
        One status entry per package, joined with ``" | "``: ``"OK: <pkg>"``
        on success or ``"Failed <pkg>: <reason>"`` otherwise. Returns
        ``"Resources checked."`` only when the package list is empty.
    """
    messages = []
    for pkg in NLTK_PACKAGES:
        try:
            # nltk.download() reports most failures (unknown id, no network)
            # by returning False rather than raising, so the return value has
            # to be inspected; otherwise every package would be logged as OK.
            downloaded = nltk.download(pkg, quiet=True)
        except Exception as e:
            messages.append(f"Failed {pkg}: {e}")
            continue

        if downloaded:
            messages.append(f"OK: {pkg}")
        else:
            messages.append(
                f"Failed {pkg}: not downloaded "
                "(id unknown to this NLTK version or network unavailable)"
            )
    return " | ".join(messages) if messages else "Resources checked."

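# NLTK imports. Note: no download has run at this point -- ensure_nltk_resources()
# is only defined above; it is first called later from the UI callbacks / process_text().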
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.chunk import ne_chunk


# --- Helpers -----------------------------------------------------------------
def _read_bytes(path: str) -> bytes:
    """Return the complete contents of the file at *path* as raw bytes."""
    with open(path, mode="rb") as handle:
        data = handle.read()
    return data

def _decode_text(content: bytes) -> str:
    """Decode text-file bytes: honour a BOM if present, else UTF-8, else Latin-1."""
    # UTF-32 BOMs are checked before UTF-16 because the UTF-32-LE BOM starts
    # with the UTF-16-LE BOM bytes.
    bom_codecs = (
        (b"\xef\xbb\xbf", "utf-8-sig"),
        (b"\xff\xfe\x00\x00", "utf-32"),
        (b"\x00\x00\xfe\xff", "utf-32"),
        (b"\xff\xfe", "utf-16"),
        (b"\xfe\xff", "utf-16"),
    )
    for bom, codec in bom_codecs:
        if content.startswith(bom):
            try:
                return content.decode(codec)
            except UnicodeDecodeError:
                continue
    try:
        return content.decode("utf-8")
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this never fails. It must
        # therefore be the very last fallback, not tried before other codecs.
        return content.decode("latin-1")


def read_file(upload: Union[str, "gr.File", None]) -> str:
    """Read text from a Gradio File input. Supports ``.txt`` and ``.docx``.

    Args:
        upload: ``None``, a filesystem path (``str``), or a file-like object
            exposing ``.name`` and/or ``.read()``.

    Returns:
        The extracted text, or ``""`` when *upload* is ``None``. Problems are
        reported as a returned message rather than an exception: the message
        starts with ``"ERROR:"`` for read/parse failures and with
        ``"Unsupported file type:"`` for any other extension.
    """
    if upload is None:
        return ""

    # Normalize to a path (when one is known) and an extension.
    if isinstance(upload, str):
        path = upload
        name = os.path.basename(path)
    else:
        name = getattr(upload, "name", "") or ""
        path = name or None
    ext = os.path.splitext(name)[1].lower()

    # Prefer reading from disk; fall back to .read() for in-memory objects.
    try:
        if isinstance(upload, str) or (path and os.path.exists(path)):
            content = _read_bytes(path)
        else:
            content = upload.read()
    except Exception:
        return "ERROR: Could not read uploaded file."

    if ext == ".txt":
        # A handle opened in text mode already yields str; nothing to decode.
        if isinstance(content, str):
            return content
        return _decode_text(content)

    if ext == ".docx":
        try:
            import docx  # python-docx
        except ImportError:
            return "ERROR: python-docx not installed. Add 'python-docx' to requirements.txt."
        try:
            doc = docx.Document(io.BytesIO(content))
        except Exception as exc:
            # Corrupt or mislabeled files surface as an error message, like
            # every other failure in this function.
            return f"ERROR: Could not parse .docx file: {exc}"
        return "\n".join(p.text for p in doc.paragraphs)

    return f"Unsupported file type: {ext}. Please upload .txt or .docx."


def extract_ner(ne_tree) -> List[Tuple[str, str]]:
    """Collect ``(entity text, entity label)`` pairs from a chunked tree.

    Only the direct children of *ne_tree* that expose a ``label`` attribute
    are treated as entities; every other child is skipped. The entity text is
    the space-joined first element of each ``(token, tag)`` leaf.
    """
    return [
        (" ".join(word for word, _tag in chunk.leaves()), chunk.label())
        for chunk in ne_tree
        if hasattr(chunk, "label")
    ]


# --- Core processing ----------------------------------------------------------
def _md_code_span(value) -> str:
    """Render *value* as a Markdown inline-code span that tolerates backticks."""
    text = str(value)
    # A code span closes at the next backtick run of the SAME length as its
    # opener, so pick the shortest delimiter length that never occurs inside.
    run_lengths = set()
    run = 0
    for ch in text:
        if ch == "`":
            run += 1
        else:
            if run:
                run_lengths.add(run)
            run = 0
    if run:
        run_lengths.add(run)

    width = 1
    while width in run_lengths:
        width += 1
    fence = "`" * width
    # Padding is only needed when the content itself touches the delimiter.
    pad = " " if text.startswith("`") or text.endswith("`") else ""
    return f"{fence}{pad}{text}{pad}{fence}"


def _md_table(headers, rows) -> str:
    """Build a Markdown table, escaping ``|`` so cell text cannot split columns."""
    def cell(value) -> str:
        return str(value).replace("|", "\\|")

    lines = [
        "| " + " | ".join(headers) + " |",
        "|" + "---|" * len(headers),
    ]
    lines += ["| " + " | ".join(cell(v) for v in row) + " |" for row in rows]
    return "\n".join(lines)


def process_text(raw_text: str, steps: List[str]) -> str:
    """Run the selected NLTK steps on *raw_text* and return a Markdown report.

    Args:
        raw_text: Text to analyse.
        steps: Selected step labels, matched by exact string (for example
            ``"Tokenize text."`` or ``"Extract named entities."``).

    Returns:
        A Markdown string with one ``###`` section per selected step,
        ``"No steps selected."`` when nothing produced output, or a warning
        when *raw_text* is empty or whitespace-only.

    Pipeline notes:
        * Tokenization runs whenever any later step needs tokens, but is only
          reported when explicitly selected.
        * Stemming and lemmatization both read the stopword-filtered tokens;
          lemmatization deliberately does not consume the stemmed output.
        * POS tagging (and therefore NER) runs on the most processed token
          list available: lemmatized, else stemmed, else filtered, else raw.
    """
    if not raw_text or not raw_text.strip():
        return "⚠️ No text provided."

    # Make sure required resources exist (quietly).
    resource_status = ensure_nltk_resources()

    selected = set(steps or [])
    report_lines = []

    # 0) Resource step: previously ignored, which could yield
    #    "No steps selected." even though this entry was ticked.
    if "Install and download required resources." in selected:
        report_lines.append("### NLTK Resources")
        report_lines.append(f"{resource_status}\n")

    # 1) Tokenize (also needed by every later step)
    token_steps = (
        "Tokenize text.", "Remove stopwords.", "Stem words.",
        "Lemmatize words.", "Tag parts of speech.", "Extract named entities.",
    )
    tokens = None
    if any(step in selected for step in token_steps):
        tokens = word_tokenize(raw_text)
        if "Tokenize text." in selected:
            report_lines.append("### Tokens")
            report_lines.append(f"{_md_code_span(tokens)}\n")

    # 2) Stopwords
    filtered_tokens = tokens
    if "Remove stopwords." in selected:
        sw = set(stopwords.words("english"))
        filtered_tokens = [w for w in (tokens or []) if w.lower() not in sw]
        report_lines.append("### After Stopword Removal")
        report_lines.append(f"{_md_code_span(filtered_tokens)}\n")

    # 3) Stemming
    stemmed_tokens = filtered_tokens
    if "Stem words." in selected:
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(w) for w in (filtered_tokens or [])]
        report_lines.append("### Stemmed Tokens (Porter)")
        report_lines.append(f"{_md_code_span(stemmed_tokens)}\n")

    # 4) Lemmatization (reads the filtered tokens, not the stemmed ones)
    lemmatized_tokens = stemmed_tokens
    if "Lemmatize words." in selected:
        lemmatizer = WordNetLemmatizer()
        # If POS-aware lemmas are wanted, pos=... could be passed after tagging.
        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in (filtered_tokens or [])]
        report_lines.append("### Lemmatized Tokens (WordNet)")
        report_lines.append(f"{_md_code_span(lemmatized_tokens)}\n")

    # 5) POS tagging (always computed when NER is requested)
    pos_tags_val = None
    if "Tag parts of speech." in selected or "Extract named entities." in selected:
        pos_tags_val = pos_tag(lemmatized_tokens or [])
        if "Tag parts of speech." in selected:
            report_lines.append("### Part-of-Speech Tags")
            report_lines.append(_md_table(["Token", "POS"], pos_tags_val) + "\n")

    # 6) NER (the tags from step 5 are guaranteed to exist here)
    if "Extract named entities." in selected:
        ne_tree = ne_chunk(pos_tags_val, binary=False)
        ner_pairs = extract_ner(ne_tree)

        report_lines.append("### Named Entities")
        if ner_pairs:
            report_lines.append(_md_table(["Entity", "Label"], ner_pairs) + "\n")
        else:
            report_lines.append("_No named entities found._\n")

    return "\n".join(report_lines).strip() or "No steps selected."


# --- Gradio UI ---------------------------------------------------------------
# Checkbox labels shown in the UI. process_text() matches on these exact
# strings, so any wording change must be mirrored there.
MENU = [
    "Install and download required resources.",
    "Tokenize text.", "Remove stopwords.",
    "Stem words.", "Lemmatize words.",
    "Tag parts of speech.", "Extract named entities.",
]

# Sample input pre-filled in the text box.
DEFAULT_TEXT = " ".join((
    "NLTK is a powerful library for text processing.",
    "Barack Obama served as the 44th President of the United States and lived in Washington, D.C.",
))

with gr.Blocks(title="NLTK Text Processing Toolkit") as demo:
    # Page heading and short usage instructions.
    gr.Markdown("# NLTK Text Processing Toolkit")
    gr.Markdown(
        "Type or paste text, or drop a `.txt`/`.docx` file. "
        "Select steps and click **Process**. Use **Install/Download Resources** if needed."
    )

    with gr.Row():
        # Left column: inputs, step selection and action buttons.
        with gr.Column():
            text_in = gr.Textbox(
                label="Text Input",
                lines=10,
                value=DEFAULT_TEXT,
                placeholder="Type or paste text here...",
            )
            file_in = gr.File(
                label="...or drop a .txt / .docx file",
                file_types=[".txt", ".docx"],
            )
            steps_in = gr.CheckboxGroup(
                choices=MENU,
                # Pre-selected steps; the resource and stemming entries start unticked.
                value=[
                    "Tokenize text.",
                    "Remove stopwords.",
                    "Lemmatize words.",
                    "Tag parts of speech.",
                    "Extract named entities.",
                ],
                label="Menu (choose one or more)",
            )
            with gr.Row():
                install_btn = gr.Button("Install/Download Resources")
                process_btn = gr.Button("Process", variant="primary")

        # Right column: download status line and the Markdown report.
        with gr.Column():
            status_out = gr.Textbox(label="Status / Logs", interactive=False)
            result_out = gr.Markdown(label="Results")

    # --- Callbacks ---
    def on_install():
        """Run the resource download; return its status line or the error text."""
        try:
            status = ensure_nltk_resources()
        except Exception as err:
            status = f"Install error: {err}"
        return status

    def on_process(text, file, steps):
        """Choose the input (typed text wins over the file) and build the report.

        A file-reading error message is returned as-is, even when typed text
        is present. Any exception is rendered as a traceback in the results
        pane so a failure is always visible.
        """
        try:
            typed = (text or "").strip()
            uploaded = "" if file is None else read_file(file)

            # read_file() reports problems through these message prefixes.
            if uploaded.startswith(("ERROR:", "Unsupported file type:")):
                return uploaded

            chosen = typed if typed else uploaded
            return process_text(chosen, steps or [])
        except Exception:
            import traceback
            return "### Error\n```\n" + traceback.format_exc() + "\n```"

    # --- Event wiring ---
    install_btn.click(fn=on_install, inputs=None, outputs=status_out)
    process_btn.click(
        fn=on_process,
        inputs=[text_in, file_in, steps_in],
        outputs=result_out,
    )

    # Download resources on page load and show the status line.
    demo.load(lambda: ensure_nltk_resources(), inputs=None, outputs=status_out)

if __name__ == "__main__":
    demo.launch()