# NLTK_Workflow / app.py
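"""Gradio app wrapping a small NLTK pipeline (tokenize, stopword removal, stemming,
lemmatization, POS tagging, NER) over typed text or an uploaded .txt/.docx file."""
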
import io
import os
from typing import List, Tuple, Union
import gradio as gr
import nltk
# --- NLTK resources (cover both old & new names) -----------------------------
NLTK_PACKAGES = [
    # Tokenizers
    "punkt", "punkt_tab",
    # Stopwords / lemmas
    "stopwords", "wordnet", "omw-1.4",
    # POS taggers (old and new English-specific names)
    "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng",
    # NE chunkers (old and new names)
    "maxent_ne_chunker", "maxent_ne_chunker_tab",
    # Word lists used by the NE chunker
    "words",
]
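# Note: newer NLTK releases look up the *_tab / *_eng resource names, while older
# releases use the original names; downloading both sets keeps the app working
# regardless of which NLTK version is installed.
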
def ensure_nltk_resources() -> str:
    """Download every package in NLTK_PACKAGES and report per-package status."""
    messages = []
    for pkg in NLTK_PACKAGES:
        try:
            nltk.download(pkg, quiet=True)  # idempotent: no-op if already present
            messages.append(f"OK: {pkg}")
        except Exception as e:
            messages.append(f"Failed {pkg}: {e}")
    return " | ".join(messages) if messages else "Resources checked."
# Safe imports after downloads (works even if already present)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.chunk import ne_chunk
# --- Helpers -----------------------------------------------------------------
def _read_bytes(path: str) -> bytes:
    with open(path, "rb") as f:
        return f.read()

def read_file(upload: Union[str, gr.File]) -> str:
    """
    Reads text from Gradio's File input. Supports .txt and .docx.
    Works whether `upload` is a path (str) or a file-like object with .name/.read().
    """
    if upload is None:
        return ""

    # Normalize to path + bytes
    if isinstance(upload, str):
        path = upload
        name = os.path.basename(path)
        ext = os.path.splitext(name)[1].lower()
        content = _read_bytes(path)
    else:
        # Gradio might pass a tempfile object or a dict-like wrapper
        name = getattr(upload, "name", "") or ""
        path = getattr(upload, "name", None)
        ext = os.path.splitext(name)[1].lower()
        try:
            # Some environments require reading from disk instead of .read()
            if path and os.path.exists(path):
                content = _read_bytes(path)
            else:
                content = upload.read()
        except Exception:
            # Last resort: try the path again
            if path and os.path.exists(path):
                content = _read_bytes(path)
            else:
                return "ERROR: Could not read uploaded file."

    if ext == ".txt":
        # latin-1 never raises UnicodeDecodeError, so it must come last as the catch-all
        for enc in ("utf-8", "utf-16", "latin-1"):
            try:
                return content.decode(enc)
            except UnicodeDecodeError:
                continue
        return "ERROR: Could not decode text file. Try UTF-8 or plain text."

    if ext == ".docx":
        try:
            import docx  # python-docx
        except ImportError:
            return "ERROR: python-docx not installed. Add 'python-docx' to requirements.txt."
        f = io.BytesIO(content)
        doc = docx.Document(f)
        return "\n".join(p.text for p in doc.paragraphs)

    return f"Unsupported file type: {ext}. Please upload .txt or .docx."

def extract_ner(ne_tree) -> List[Tuple[str, str]]:
    """Flatten the nltk.Tree returned by ne_chunk into (entity text, label) pairs."""
    entities = []
    for subtree in ne_tree:
        if hasattr(subtree, "label"):  # labelled subtrees are named-entity chunks
            label = subtree.label()
            text = " ".join(token for token, _ in subtree.leaves())
            entities.append((text, label))
    return entities
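# For example, ne_chunk(pos_tag(word_tokenize("Barack Obama lived in Washington")))
# typically yields a PERSON chunk and a GPE chunk, which extract_ner flattens to
# [("Barack Obama", "PERSON"), ("Washington", "GPE")] (exact labels depend on the model).
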
# --- Core processing ----------------------------------------------------------
def process_text(raw_text: str, steps: List[str]) -> str:
    """Run the selected pipeline steps over raw_text and return a Markdown report."""
    if not raw_text or raw_text.strip() == "":
        return "⚠️ No text provided."

    # Make sure required resources exist (quietly)
    ensure_nltk_resources()

    report_lines = []
    text = raw_text
    tokens = None
    filtered_tokens = None
    stemmed_tokens = None
    lemmatized_tokens = None
    pos_tags_val = None

    # 1) Tokenize (also needed by every later step)
    if "Tokenize text." in steps or any(
        s in steps for s in [
            "Remove stopwords.", "Stem words.", "Lemmatize words.",
            "Tag parts of speech.", "Extract named entities.",
        ]
    ):
        tokens = word_tokenize(text)
        if "Tokenize text." in steps:
            report_lines.append("### Tokens")
            report_lines.append(f"`{tokens}`\n")

    # 2) Stopword removal
    filtered_tokens = tokens
    if "Remove stopwords." in steps:
        sw = set(stopwords.words("english"))
        filtered_tokens = [w for w in (tokens or []) if w.lower() not in sw]
        report_lines.append("### After Stopword Removal")
        report_lines.append(f"`{filtered_tokens}`\n")

    # 3) Stemming
    stemmed_tokens = filtered_tokens
    if "Stem words." in steps:
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(w) for w in (filtered_tokens or [])]
        report_lines.append("### Stemmed Tokens (Porter)")
        report_lines.append(f"`{stemmed_tokens}`\n")

    # 4) Lemmatization (runs on the filtered tokens so the result is directly
    #    comparable with the stemming output)
    lemmatized_tokens = stemmed_tokens if stemmed_tokens is not None else filtered_tokens
    if "Lemmatize words." in steps:
        lemmatizer = WordNetLemmatizer()
        # For POS-aware lemmas, pass pos=... here after tagging
        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in (filtered_tokens or [])]
        report_lines.append("### Lemmatized Tokens (WordNet)")
        report_lines.append(f"`{lemmatized_tokens}`\n")

    # 5) POS tagging
    if "Tag parts of speech." in steps or "Extract named entities." in steps:
        base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
        pos_tags_val = pos_tag(base_for_tagging)
        if "Tag parts of speech." in steps:
            report_lines.append("### Part-of-Speech Tags")
            rows = ["| Token | POS |", "|---|---|"]
            rows += [f"| {t} | {p} |" for (t, p) in pos_tags_val]
            report_lines.append("\n".join(rows) + "\n")

    # 6) Named-entity recognition
    if "Extract named entities." in steps:
        if not pos_tags_val:
            base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
            pos_tags_val = pos_tag(base_for_tagging)
        ne_tree = ne_chunk(pos_tags_val, binary=False)
        ner_pairs = extract_ner(ne_tree)
        report_lines.append("### Named Entities")
        if ner_pairs:
            rows = ["| Entity | Label |", "|---|---|"]
            rows += [f"| {ent} | {lbl} |" for (ent, lbl) in ner_pairs]
            report_lines.append("\n".join(rows) + "\n")
        else:
            report_lines.append("_No named entities found._\n")

    return "\n".join(report_lines).strip() or "No steps selected."
# --- Gradio UI ---------------------------------------------------------------
MENU = [
    "Install and download required resources.",
    "Tokenize text.",
    "Remove stopwords.",
    "Stem words.",
    "Lemmatize words.",
    "Tag parts of speech.",
    "Extract named entities.",
]
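# NOTE: process_text() matches these menu strings verbatim, so any change here must
# be mirrored in the step checks above (and in the CheckboxGroup defaults below).
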
DEFAULT_TEXT = (
    "NLTK is a powerful library for text processing. "
    "Barack Obama served as the 44th President of the United States and lived in Washington, D.C."
)

with gr.Blocks(title="NLTK Text Processing Toolkit") as demo:
    gr.Markdown("# NLTK Text Processing Toolkit")
    gr.Markdown(
        "Type or paste text, or drop a `.txt`/`.docx` file. "
        "Select steps and click **Process**. Use **Install/Download Resources** if needed."
    )

    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(
                label="Text Input",
                lines=10,
                value=DEFAULT_TEXT,
                placeholder="Type or paste text here...",
            )
            file_in = gr.File(
                label="...or drop a .txt / .docx file",
                file_types=[".txt", ".docx"],
            )
            steps_in = gr.CheckboxGroup(
                choices=MENU,
                value=[
                    "Tokenize text.",
                    "Remove stopwords.",
                    "Lemmatize words.",
                    "Tag parts of speech.",
                    "Extract named entities.",
                ],
                label="Menu (choose one or more)",
            )
            with gr.Row():
                install_btn = gr.Button("Install/Download Resources")
                process_btn = gr.Button("Process", variant="primary")
        with gr.Column():
            status_out = gr.Textbox(label="Status / Logs", interactive=False)
            result_out = gr.Markdown(label="Results")

    # Button callbacks
    def on_install():
        try:
            return ensure_nltk_resources()
        except Exception as e:
            return f"Install error: {e}"

    def on_process(text, file, steps):
        try:
            # Prefer typed text unless it's empty; otherwise fall back to the file
            text = (text or "").strip()
            file_text = read_file(file) if file is not None else ""
            if not text and file_text:
                # Surface read/decode errors instead of running the pipeline on them
                if file_text.startswith("ERROR:") or file_text.startswith("Unsupported file type:"):
                    return file_text
                text = file_text
            return process_text(text, steps or [])
        except Exception:
            # Surface Python exceptions to the UI so it never looks like “nothing happened”
            import traceback
            return "### Error\n```\n" + traceback.format_exc() + "\n```"

    install_btn.click(fn=on_install, inputs=None, outputs=status_out)
    process_btn.click(fn=on_process, inputs=[text_in, file_in, steps_in], outputs=result_out)

    # Optional: pre-download on load so the first click never fails silently
    demo.load(lambda: ensure_nltk_resources(), inputs=None, outputs=status_out)

if __name__ == "__main__":
    demo.launch()