""" NLTK Text Processing Playground (Gradio) Features - Type text or drag-and-drop a .txt or .docx file - Menu of steps: 1) Install & download required NLTK resources 2) Tokenize text 3) Remove stopwords 4) Stem words (Porter) 5) Lemmatize words (WordNet) 6) Tag parts of speech 7) Extract named entities - Prints results to screen for only the steps you select """ import io import os import re from typing import List, Tuple, Optional import gradio as gr # --- NLTK imports are inside functions so the app can start even if resources aren't ready --- SUPPORTED_EXTS = {".txt", ".docx"} def read_text_from_inputs(text_input: str, file_obj: Optional[gr.File]) -> str: """ Returns a single text string from either the text box or the uploaded file. If both are provided, file content takes precedence. """ if file_obj is not None: name = getattr(file_obj, "name", None) or "" ext = os.path.splitext(name)[1].lower() file_bytes = file_obj.read() if ext == ".txt": try: return file_bytes.decode("utf-8") except Exception: # Fallback: best-effort decode return file_bytes.decode(errors="ignore") elif ext == ".docx": from docx import Document # python-docx with io.BytesIO(file_bytes) as buf: doc = Document(buf) return "\n".join(p.text for p in doc.paragraphs) else: raise gr.Error(f"Unsupported file type: {ext}. Use one of: {', '.join(SUPPORTED_EXTS)}.") # fallback to text area return text_input or "" def setup_nltk() -> str: """ Installs/downloads the corpora and models needed for the lab. Safe to run multiple times; NLTK skips existing files. """ import nltk downloaded = [] for pkg in [ "punkt", "stopwords", "wordnet", "averaged_perceptron_tagger", "maxent_ne_chunker", "words", ]: try: nltk.download(pkg, quiet=True) downloaded.append(pkg) except Exception as e: downloaded.append(f"{pkg} (error: {e})") return "NLTK resources ready:\n- " + "\n- ".join(downloaded) def pipeline(text: str, steps: List[str]) -> str: """ Runs the selected steps in a fixed logical order and returns a Markdown report. """ if not text.strip(): return "⚠️ **No input text found.** Type text or upload a .txt/.docx file." 
    # Import here (after optional setup) to avoid early failures
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from nltk import pos_tag
    from nltk.chunk import ne_chunk

    report_sections = []
    current_tokens = []
    filtered_tokens = []
    stemmed_tokens = []
    lemmatized_tokens = []

    # 1) Tokenize
    if "Tokenize text" in steps:
        current_tokens = word_tokenize(text)
        report_sections.append(
            "### 1) Tokens\n```\n" + repr(current_tokens) + "\n```"
        )

    # 2) Stopword removal (case-insensitive)
    if "Remove stopwords" in steps:
        if not current_tokens:
            current_tokens = word_tokenize(text)
        stop_words = set(stopwords.words("english"))
        filtered_tokens = [w for w in current_tokens if w.lower() not in stop_words]
        report_sections.append(
            "### 2) Filtered (stopwords removed)\n```\n" + repr(filtered_tokens) + "\n```"
        )

    # 3) Stem
    if "Stem words" in steps:
        if not filtered_tokens:
            # If the user skipped stopword removal, stem the tokens directly
            base = current_tokens or word_tokenize(text)
        else:
            base = filtered_tokens
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(w) for w in base]
        report_sections.append(
            "### 3) Stemmed (Porter)\n```\n" + repr(stemmed_tokens) + "\n```"
        )

    # 4) Lemmatize
    if "Lemmatize words" in steps:
        if not filtered_tokens:
            base = current_tokens or word_tokenize(text)
        else:
            base = filtered_tokens
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in base]
        report_sections.append(
            "### 4) Lemmatized (WordNet, default POS=noun)\n```\n"
            + repr(lemmatized_tokens)
            + "\n```"
        )

    # Choose a reasonable token sequence for downstream steps
    downstream = (
        lemmatized_tokens
        or stemmed_tokens
        or filtered_tokens
        or current_tokens
        or word_tokenize(text)
    )

    # 5) POS tagging
    if "Tag parts of speech" in steps:
        tags = pos_tag(downstream)
        report_sections.append("### 5) POS Tags\n```\n" + repr(tags) + "\n```")

    # 6) Named entities
    if "Extract named entities" in steps:
        # ne_chunk expects POS-tagged input
        tagged = pos_tag(downstream)
        tree = ne_chunk(tagged)
        # Pretty-print the chunk tree as text
        report_sections.append(
            "### 6) Named Entities (chunk tree)\n```\n" + tree.pformat() + "\n```"
        )

    if not report_sections:
        return "ℹ️ **No steps selected.** Choose at least one item from the menu."

    header = "# NLTK Processing Report\n"
    return header + "\n\n".join(report_sections)


with gr.Blocks(title="NLTK Text Processing Playground") as demo:
    gr.Markdown(
        """
        # NLTK Text Processing Playground

        **Type text** _or_ **drop a `.txt` / `.docx` file**.
        Select the steps you want and click **Process**.
        If this is your first time, click **Prepare NLTK Resources**.
""" ) with gr.Row(): text_in = gr.Textbox( label="Text input", placeholder="Type or paste text here (or upload a file instead)…", lines=8, ) file_in = gr.File( label="Optional: upload a .txt or .docx file", file_types=[".txt", ".docx"], file_count="single", ) steps = gr.CheckboxGroup( choices=[ "Tokenize text", "Remove stopwords", "Stem words", "Lemmatize words", "Tag parts of speech", "Extract named entities", ], value=["Tokenize text", "Remove stopwords", "Lemmatize words", "Tag parts of speech", "Extract named entities"], label="Menu — select one or more steps", ) with gr.Row(): setup_btn = gr.Button("🧰 Prepare NLTK Resources") run_btn = gr.Button("▶️ Process") output = gr.Markdown(label="Output") # Wire up actions setup_btn.click(fn=lambda: setup_nltk(), outputs=output) def run_pipeline(text_input, file_input, selected_steps): text = read_text_from_inputs(text_input, file_input) return pipeline(text, selected_steps) run_btn.click( fn=run_pipeline, inputs=[text_in, file_in, steps], outputs=output, ) if __name__ == "__main__": # Launch Gradio app. Share=False by default; set share=True if you want a public link. demo.launch()