Create app.py
app.py
ADDED
@@ -0,0 +1,262 @@
import io
import os
from typing import List, Tuple

import gradio as gr

# --- NLTK setup --------------------------------------------------------------
import nltk

# Map each downloadable package to the path nltk.data.find() expects, so the
# "already installed" check below also works for resources that do not live
# under tokenizers/. Note: newer NLTK releases (3.9+) may additionally want the
# *_tab variants (e.g. "punkt_tab"); add them here if tokenization complains.
NLTK_PACKAGES = {
    "punkt": "tokenizers/punkt",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
    "omw-1.4": "corpora/omw-1.4",
    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    "maxent_ne_chunker": "chunkers/maxent_ne_chunker",
    "words": "corpora/words",
}

def ensure_nltk_resources() -> str:
    """
    Download any missing NLTK resources needed for the menu actions.
    """
    messages = []
    for pkg, path in NLTK_PACKAGES.items():
        try:
            nltk.data.find(path)
        except LookupError:
            try:
                nltk.download(pkg, quiet=True)
                messages.append(f"Downloaded: {pkg}")
            except Exception as e:
                messages.append(f"Failed {pkg}: {e}")
    return " | ".join(messages) if messages else "All required resources already present."

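# Design note: ensure_nltk_resources() could also be called once here at import
# time so a fresh Space works without pressing the button; it is left as a
# button action to keep startup fast.
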
# Import after ensuring NLTK is available (these imports are safe even if the
# data packages have not been downloaded yet).
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.chunk import ne_chunk


# --- Helpers ----------------------------------------------------------------
def read_file(upload) -> str:
    """
    Read text from an uploaded file. Supports .txt and .docx.

    Depending on the Gradio version, gr.File may hand the callback a temp-file
    path (str) or a file-like object, so both are handled here.
    """
    if upload is None:
        return ""
    path = upload if isinstance(upload, str) else (getattr(upload, "name", "") or "")
    ext = os.path.splitext(path)[1].lower()

    # Read the raw bytes, preferring the path on disk over the object itself.
    if path and os.path.exists(path):
        with open(path, "rb") as fh:
            content = fh.read()
    elif hasattr(upload, "read"):
        content = upload.read()
    else:
        return "ERROR: could not read the uploaded file."

    if ext == ".txt":
        try:
            return content.decode("utf-8")
        except UnicodeDecodeError:
            # Fall back to latin-1 for files with unknown encodings.
            return content.decode("latin1")

    elif ext == ".docx":
        # Parse DOCX from bytes.
        try:
            import docx  # python-docx
        except ImportError:
            return "ERROR: python-docx not installed. Add 'python-docx' to requirements.txt."
        doc = docx.Document(io.BytesIO(content))
        return "\n".join(p.text for p in doc.paragraphs)

    else:
        return f"Unsupported file type: {ext}. Please upload .txt or .docx."

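# Minimal sketch of exercising read_file outside the UI (assumes a local
# sample.docx exists and python-docx is installed):
#
#   text = read_file("sample.docx")
#   print(text[:200])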

def extract_ner(ne_tree) -> List[Tuple[str, str]]:
    """
    Convert an nltk.tree.Tree from ne_chunk into (entity_text, label) pairs.
    """
    entities = []
    for subtree in ne_tree:
        if hasattr(subtree, "label"):
            label = subtree.label()
            text = " ".join(token for token, _ in subtree.leaves())
            entities.append((text, label))
    return entities

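# Illustrative (not executed): for the POS-tagged sentence
# "Barack Obama visited Washington.", ne_chunk + extract_ner would typically
# yield pairs such as [("Barack Obama", "PERSON"), ("Washington", "GPE")];
# the exact labels depend on the NLTK chunker model.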

# --- Core processing ---------------------------------------------------------
def process_text(
    raw_text: str,
    steps: List[str]
) -> str:
    """
    Run the selected processing steps and return a markdown report.
    """
    if not raw_text or raw_text.strip() == "":
        return "⚠️ No text provided."

    report_lines = []
    text = raw_text

    # Ensure resources are present even if the user skipped "Install/Download".
    try:
        nltk.data.find("tokenizers/punkt")  # probe one resource as a cheap check
    except LookupError:
        ensure_nltk_resources()

    tokens = None
    filtered_tokens = None
    stemmed_tokens = None
    lemmatized_tokens = None
    pos_tags = None
    ner_pairs = None

    token_steps = {
        "Tokenize text.",
        "Remove stopwords.",
        "Stem words.",
        "Lemmatize words.",
        "Tag parts of speech.",
        "Extract named entities.",
    }

    # 1) Tokenize (needed by every downstream step)
    if token_steps & set(steps):
        tokens = word_tokenize(text)
        if "Tokenize text." in steps:
            report_lines.append("### Tokens")
            report_lines.append(f"`{tokens}`\n")

    # 2) Stopwords
    if "Remove stopwords." in steps:
        sw = set(stopwords.words("english"))
        filtered_tokens = [w for w in tokens if w.lower() not in sw]
        report_lines.append("### After Stopword Removal")
        report_lines.append(f"`{filtered_tokens}`\n")
    else:
        filtered_tokens = tokens

    # 3) Stemming
    if "Stem words." in steps:
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(w) for w in (filtered_tokens or [])]
        report_lines.append("### Stemmed Tokens (Porter)")
        report_lines.append(f"`{stemmed_tokens}`\n")
    else:
        stemmed_tokens = filtered_tokens

    # 4) Lemmatization
    if "Lemmatize words." in steps:
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in (filtered_tokens or [])]
        report_lines.append("### Lemmatized Tokens (WordNet)")
        report_lines.append(f"`{lemmatized_tokens}`\n")
    else:
        lemmatized_tokens = stemmed_tokens or filtered_tokens

    # 5) POS tagging (tagging the lemmatized/filtered tokens keeps the report
    # consistent with earlier steps, at some cost to tagging accuracy)
    if "Tag parts of speech." in steps or "Extract named entities." in steps:
        base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
        pos_tags = pos_tag(base_for_tagging)
        if "Tag parts of speech." in steps:
            report_lines.append("### Part-of-Speech Tags")
            # Render as a markdown table
            rows = ["| Token | POS |", "|---|---|"]
            rows += [f"| {t} | {p} |" for (t, p) in pos_tags]
            report_lines.append("\n".join(rows) + "\n")

    # 6) Named-entity recognition
    if "Extract named entities." in steps:
        if not pos_tags:
            pos_tags = pos_tag(lemmatized_tokens if lemmatized_tokens else (tokens or []))
        ne_tree = ne_chunk(pos_tags, binary=False)
        ner_pairs = extract_ner(ne_tree)

        report_lines.append("### Named Entities")
        if ner_pairs:
            rows = ["| Entity | Label |", "|---|---|"]
            rows += [f"| {ent} | {lbl} |" for (ent, lbl) in ner_pairs]
            report_lines.append("\n".join(rows) + "\n")
        else:
            report_lines.append("_No named entities found._\n")

    # Final markdown report
    return "\n".join(report_lines).strip() or "No steps selected."

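# Minimal sketch of exercising the pipeline outside the UI (assumes the NLTK
# resources above are installed):
#
#   ensure_nltk_resources()
#   print(process_text("Barack Obama visited Washington.",
#                      ["Tokenize text.", "Extract named entities."]))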

# --- Gradio UI ---------------------------------------------------------------
MENU = [
    "Install and download required resources.",
    "Tokenize text.",
    "Remove stopwords.",
    "Stem words.",
    "Lemmatize words.",
    "Tag parts of speech.",
    "Extract named entities.",
]

DEFAULT_TEXT = (
    "NLTK is a powerful library for text processing. "
    "Barack Obama served as the 44th President of the United States and lived in Washington, D.C."
)

with gr.Blocks(title="NLTK Text Processing Toolkit") as demo:
    gr.Markdown("# NLTK Text Processing Toolkit")
    gr.Markdown(
        "Type or paste text, or drag a `.txt` / `.docx` file. "
        "Select the steps to run, then click **Process**. "
        "Use **Install/Download Resources** once if needed."
    )

    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(
                label="Text Input",
                lines=10,
                value=DEFAULT_TEXT,
                placeholder="Type or paste text here..."
            )
            file_in = gr.File(
                label="...or drop a .txt / .docx file",
                file_types=[".txt", ".docx"]
            )
            steps_in = gr.CheckboxGroup(
                choices=MENU,
                value=[
                    "Tokenize text.",
                    "Remove stopwords.",
                    "Lemmatize words.",
                    "Tag parts of speech.",
                    "Extract named entities.",
                ],
                label="Menu (choose one or more)"
            )
            with gr.Row():
                install_btn = gr.Button("Install/Download Resources")
                process_btn = gr.Button("Process", variant="primary")

        with gr.Column():
            status_out = gr.Textbox(label="Status", interactive=False)
            result_out = gr.Markdown(label="Results")

    # Button callbacks
    def on_install():
        return ensure_nltk_resources()

    def on_process(text, file, steps):
        # Typed text takes precedence; otherwise fall back to the uploaded file.
        file_text = read_file(file) if file is not None else ""
        # Surface file-reading problems directly in the results pane.
        if file_text.startswith("ERROR:") or file_text.startswith("Unsupported file type:"):
            return file_text
        final_text = text.strip() if (text and text.strip()) else file_text
        return process_text(final_text, steps or [])

    install_btn.click(fn=on_install, inputs=None, outputs=status_out)
    process_btn.click(fn=on_process, inputs=[text_in, file_in, steps_in], outputs=result_out)


if __name__ == "__main__":
    # You can customize server_name/port if deploying remotely
    demo.launch()
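
# A minimal requirements.txt sketch for this Space (exact version pins are up
# to you), covering the third-party imports used above:
#
#   gradio
#   nltk
#   python-docx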