# NLTK_Workflow / app.py
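"""Gradio app wrapping a small NLTK pipeline (tokenize, stopword removal, stemming,
lemmatization, POS tagging, NER) over typed text or an uploaded .txt/.docx file."""
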
import io
import os
from typing import List, Tuple, Union
import gradio as gr
import nltk
# --- NLTK resources (cover both old & new names) -----------------------------
NLTK_PACKAGES = [
    # Tokenizers
    "punkt", "punkt_tab",
    # Stopwords / lemmas
    "stopwords", "wordnet", "omw-1.4",
    # POS taggers (old and new English-specific names)
    "averaged_perceptron_tagger", "averaged_perceptron_tagger_eng",
    # NE chunkers (old and new names)
    "maxent_ne_chunker", "maxent_ne_chunker_tab",
    # Word lists used by the NE chunker
    "words",
]
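# Note: newer NLTK releases look up the *_tab / *_eng resource names, while older
# releases use the original names; downloading both sets keeps the app working
# regardless of which NLTK version is installed.
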
def ensure_nltk_resources() -> str:
    """Download every package in NLTK_PACKAGES and report per-package status."""
    messages = []
    for pkg in NLTK_PACKAGES:
        try:
            nltk.download(pkg, quiet=True)  # idempotent: no-op if already present
            messages.append(f"OK: {pkg}")
        except Exception as e:
            messages.append(f"Failed {pkg}: {e}")
    return " | ".join(messages) if messages else "Resources checked."
# Safe imports after downloads (works even if already present)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.chunk import ne_chunk
# --- Helpers -----------------------------------------------------------------
def _read_bytes(path: str) -> bytes:
    with open(path, "rb") as f:
        return f.read()

def read_file(upload: Union[str, gr.File]) -> str:
    """
    Reads text from Gradio's File input. Supports .txt and .docx.
    Works whether `upload` is a path (str) or a file-like object with .name/.read().
    """
    if upload is None:
        return ""

    # Normalize to path + bytes
    if isinstance(upload, str):
        path = upload
        name = os.path.basename(path)
        ext = os.path.splitext(name)[1].lower()
        content = _read_bytes(path)
    else:
        # Gradio might pass a tempfile object or a dict-like wrapper
        name = getattr(upload, "name", "") or ""
        path = getattr(upload, "name", None)
        ext = os.path.splitext(name)[1].lower()
        try:
            # Some environments require reading from disk instead of .read()
            if path and os.path.exists(path):
                content = _read_bytes(path)
            else:
                content = upload.read()
        except Exception:
            # Last resort: try the path again
            if path and os.path.exists(path):
                content = _read_bytes(path)
            else:
                return "ERROR: Could not read uploaded file."

    if ext == ".txt":
        # latin-1 never raises UnicodeDecodeError, so it must come last as the catch-all
        for enc in ("utf-8", "utf-16", "latin-1"):
            try:
                return content.decode(enc)
            except UnicodeDecodeError:
                continue
        return "ERROR: Could not decode text file. Try UTF-8 or plain text."

    if ext == ".docx":
        try:
            import docx  # python-docx
        except ImportError:
            return "ERROR: python-docx not installed. Add 'python-docx' to requirements.txt."
        f = io.BytesIO(content)
        doc = docx.Document(f)
        return "\n".join(p.text for p in doc.paragraphs)

    return f"Unsupported file type: {ext}. Please upload .txt or .docx."

def extract_ner(ne_tree) -> List[Tuple[str, str]]:
    """Flatten the nltk.Tree returned by ne_chunk into (entity text, label) pairs."""
    entities = []
    for subtree in ne_tree:
        if hasattr(subtree, "label"):  # labelled subtrees are named-entity chunks
            label = subtree.label()
            text = " ".join(token for token, _ in subtree.leaves())
            entities.append((text, label))
    return entities
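# For example, ne_chunk(pos_tag(word_tokenize("Barack Obama lived in Washington")))
# typically yields a PERSON chunk and a GPE chunk, which extract_ner flattens to
# [("Barack Obama", "PERSON"), ("Washington", "GPE")] (exact labels depend on the model).
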
# --- Core processing ----------------------------------------------------------
def process_text(raw_text: str, steps: List[str]) -> str:
    """Run the selected pipeline steps over raw_text and return a Markdown report."""
    if not raw_text or raw_text.strip() == "":
        return "⚠️ No text provided."

    # Make sure required resources exist (quietly)
    ensure_nltk_resources()

    report_lines = []
    text = raw_text
    tokens = None
    filtered_tokens = None
    stemmed_tokens = None
    lemmatized_tokens = None
    pos_tags_val = None

    # 1) Tokenize (also needed by every later step)
    if "Tokenize text." in steps or any(
        s in steps for s in [
            "Remove stopwords.", "Stem words.", "Lemmatize words.",
            "Tag parts of speech.", "Extract named entities.",
        ]
    ):
        tokens = word_tokenize(text)
        if "Tokenize text." in steps:
            report_lines.append("### Tokens")
            report_lines.append(f"`{tokens}`\n")

    # 2) Stopword removal
    filtered_tokens = tokens
    if "Remove stopwords." in steps:
        sw = set(stopwords.words("english"))
        filtered_tokens = [w for w in (tokens or []) if w.lower() not in sw]
        report_lines.append("### After Stopword Removal")
        report_lines.append(f"`{filtered_tokens}`\n")

    # 3) Stemming
    stemmed_tokens = filtered_tokens
    if "Stem words." in steps:
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(w) for w in (filtered_tokens or [])]
        report_lines.append("### Stemmed Tokens (Porter)")
        report_lines.append(f"`{stemmed_tokens}`\n")

    # 4) Lemmatization (runs on the filtered tokens so the result is directly
    #    comparable with the stemming output)
    lemmatized_tokens = stemmed_tokens if stemmed_tokens is not None else filtered_tokens
    if "Lemmatize words." in steps:
        lemmatizer = WordNetLemmatizer()
        # For POS-aware lemmas, pass pos=... here after tagging
        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in (filtered_tokens or [])]
        report_lines.append("### Lemmatized Tokens (WordNet)")
        report_lines.append(f"`{lemmatized_tokens}`\n")

    # 5) POS tagging
    if "Tag parts of speech." in steps or "Extract named entities." in steps:
        base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
        pos_tags_val = pos_tag(base_for_tagging)
        if "Tag parts of speech." in steps:
            report_lines.append("### Part-of-Speech Tags")
            rows = ["| Token | POS |", "|---|---|"]
            rows += [f"| {t} | {p} |" for (t, p) in pos_tags_val]
            report_lines.append("\n".join(rows) + "\n")

    # 6) Named-entity recognition
    if "Extract named entities." in steps:
        if not pos_tags_val:
            base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
            pos_tags_val = pos_tag(base_for_tagging)
        ne_tree = ne_chunk(pos_tags_val, binary=False)
        ner_pairs = extract_ner(ne_tree)
        report_lines.append("### Named Entities")
        if ner_pairs:
            rows = ["| Entity | Label |", "|---|---|"]
            rows += [f"| {ent} | {lbl} |" for (ent, lbl) in ner_pairs]
            report_lines.append("\n".join(rows) + "\n")
        else:
            report_lines.append("_No named entities found._\n")

    return "\n".join(report_lines).strip() or "No steps selected."
# --- Gradio UI ---------------------------------------------------------------
MENU = [
    "Install and download required resources.",
    "Tokenize text.",
    "Remove stopwords.",
    "Stem words.",
    "Lemmatize words.",
    "Tag parts of speech.",
    "Extract named entities.",
]
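# NOTE: process_text() matches these menu strings verbatim, so any change here must
# be mirrored in the step checks above (and in the CheckboxGroup defaults below).
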
DEFAULT_TEXT = (
    "NLTK is a powerful library for text processing. "
    "Barack Obama served as the 44th President of the United States and lived in Washington, D.C."
)

with gr.Blocks(title="NLTK Text Processing Toolkit") as demo:
    gr.Markdown("# NLTK Text Processing Toolkit")
    gr.Markdown(
        "Type or paste text, or drop a `.txt`/`.docx` file. "
        "Select steps and click **Process**. Use **Install/Download Resources** if needed."
    )

    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(
                label="Text Input",
                lines=10,
                value=DEFAULT_TEXT,
                placeholder="Type or paste text here...",
            )
            file_in = gr.File(
                label="...or drop a .txt / .docx file",
                file_types=[".txt", ".docx"],
            )
            steps_in = gr.CheckboxGroup(
                choices=MENU,
                value=[
                    "Tokenize text.",
                    "Remove stopwords.",
                    "Lemmatize words.",
                    "Tag parts of speech.",
                    "Extract named entities.",
                ],
                label="Menu (choose one or more)",
            )
            with gr.Row():
                install_btn = gr.Button("Install/Download Resources")
                process_btn = gr.Button("Process", variant="primary")
        with gr.Column():
            status_out = gr.Textbox(label="Status / Logs", interactive=False)
            result_out = gr.Markdown(label="Results")

    # Button callbacks
    def on_install():
        try:
            return ensure_nltk_resources()
        except Exception as e:
            return f"Install error: {e}"

    def on_process(text, file, steps):
        try:
            # Prefer typed text unless it's empty; otherwise fall back to the file
            text = (text or "").strip()
            file_text = read_file(file) if file is not None else ""
            if not text and file_text:
                # Surface read/decode errors instead of running the pipeline on them
                if file_text.startswith("ERROR:") or file_text.startswith("Unsupported file type:"):
                    return file_text
                text = file_text
            return process_text(text, steps or [])
        except Exception:
            # Surface Python exceptions to the UI so it never looks like “nothing happened”
            import traceback
            return "### Error\n```\n" + traceback.format_exc() + "\n```"

    install_btn.click(fn=on_install, inputs=None, outputs=status_out)
    process_btn.click(fn=on_process, inputs=[text_in, file_in, steps_in], outputs=result_out)

    # Optional: pre-download on load so the first click never fails silently
    demo.load(lambda: ensure_nltk_resources(), inputs=None, outputs=status_out)

if __name__ == "__main__":
    demo.launch()