import os
import string
from collections import Counter
from typing import List, Tuple, Dict

import gradio as gr
import nltk


# ---------- NLTK bootstrap ----------
def _ensure_nltk():
    # NLTK 3.9+ may require both 'punkt' and 'punkt_tab'
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab", quiet=True)
        except Exception:
            pass  # older NLTK doesn't have punkt_tab
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet=True)


_ensure_nltk()

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords


# ---------- Helpers ----------
def read_text_input(text: str, file_obj) -> Tuple[str, str]:
    """
    Priority: if a file is provided, read it; otherwise use text box.
    Supports .txt and .docx (not legacy .doc).
    Returns (content, error_message). If error_message != "", content may be empty.
    """
    if file_obj:
        path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", str(file_obj))
        ext = os.path.splitext(path)[1].lower()
        if ext == ".txt":
            try:
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    return f.read(), ""
            except Exception as e:
                return "", f"❌ Error reading .txt: {e}"
        elif ext == ".docx":
            try:
                from docx import Document
            except Exception as e:
                return "", f"❌ python-docx import failed: {e}. Did you install requirements?"
            try:
                doc = Document(path)
                return "\n".join(p.text for p in doc.paragraphs), ""
            except Exception as e:
                return "", f"❌ Error reading .docx: {e}"
        else:
            return "", "❌ Unsupported file type. Please upload .txt or .docx (not legacy .doc)."
    return (text or "", "")


def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
    """
    Clean mode:
      - lowercase
      - remove English stopwords
      - remove punctuation tokens (.,?!;:"'()[]{}- etc.)
    Raw mode (clean=False):
      - return tokens unchanged
    """
    if not clean:
        return tokens
    stops = set(stopwords.words("english"))
    punct = set(string.punctuation)
    return [
        t.lower()
        for t in tokens
        if t not in punct and t.lower() not in stops
    ]


def tokenize_pipeline(
    raw_text: str, clean: bool
) -> Tuple[List[str], List[List[str]], Counter, List[str]]:
    """
    - Split text into sentences
    - Tokenize each sentence into words
    - (Optionally) apply cleaning (lowercase, stopwords, punctuation removal)
    - Build Bag of Words across the full text
    Returns: sentences, tokenized_sentences, bow_counter, vocabulary_list
    """
    if not raw_text.strip():
        return [], [], Counter(), []

    sentences = sent_tokenize(raw_text)
    tokenized_sentences = []
    for s in sentences:
        tokens = word_tokenize(s)
        tokens = preprocess_tokens(tokens, clean=clean)
        tokenized_sentences.append(tokens)

    all_words = [w for sent in tokenized_sentences for w in sent]
    bow = Counter(all_words)
    vocabulary = sorted(bow.keys())
    return sentences, tokenized_sentences, bow, vocabulary


def build_sentence_vector(
    tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
) -> Dict[str, int]:
    """
    Count occurrences of each vocab term inside the selected sentence.
    Returns {word: count} for non-zero entries, sorted by count desc then word.
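    Example (illustrative): with tokenized_sentences[idx] == ["nlp", "text", "nlp"]
    and vocabulary == ["nlp", "text", "word"], the result is {"nlp": 2, "text": 1}.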
""" if not tokenized_sentences or not vocabulary: return {} if idx < 0 or idx >= len(tokenized_sentences): return {} sent_tokens = tokenized_sentences[idx] counts = Counter(sent_tokens) vector = {word: counts[word] for word in vocabulary if counts[word] > 0} return dict(sorted(vector.items(), key=lambda kv: (-kv[1], kv[0]))) # ---------- Gradio App ---------- SAMPLE_TEXT = """NLTK is a powerful library for text processing. Text processing is essential for NLP tasks. Bag of Words is a fundamental concept in NLP. Tokenization splits sentences into words. We can count word occurrences in text. Word frequency vectors represent sentences numerically. Vectorization helps in transforming text for machine learning. Machine learning models can use BOW as input. NLP tasks include classification and sentiment analysis. Word frequency counts provide insight into text structure. """ with gr.Blocks(title="NLTK: Tokenize → Bag of Words → Sentence Vector") as demo: gr.Markdown( """ # NLTK Mini-Workbench Type/paste text or drop a **.txt** / **.docx** file. **Pipeline** 1) Install NLTK (auto-checked at startup) 2) Tokenize sentences into words 3) Count word occurrences (Bag of Words) 4) Build a word-frequency vector for any selected sentence **Clean option:** lowercasing + stopword removal **+ punctuation removal** (like scikit-learn defaults). > Tip: Legacy `.doc` files are not supported—please convert to `.docx`. """ ) with gr.Row(): text_in = gr.Textbox( label="Input Text", value=SAMPLE_TEXT, lines=12, placeholder="Paste text here, or upload a file instead...", ) file_in = gr.File( label="Or upload a file (.txt or .docx)", file_types=[".txt", ".docx"], type="filepath", ) clean_opt = gr.Checkbox( label="Stopword + lowercase + punctuation removal", value=True, info='Removes common English stopwords, lowercases tokens, and strips punctuation tokens (e.g., ".", ",", "!").', ) process_btn = gr.Button("Process", variant="primary") # Hidden state to carry processed artifacts between events st_sentences = gr.State([]) st_tokenized = gr.State([]) st_vocab = gr.State([]) with gr.Row(): sentence_dropdown = gr.Dropdown( choices=[], label="Select a sentence to vectorize", interactive=True, ) with gr.Tab("Tokenized Sentences"): tokenized_out = gr.JSON(label="Tokens per sentence") with gr.Tab("Bag of Words"): bow_df = gr.Dataframe( headers=["word", "count"], label="Bag of Words (sorted by count desc)", interactive=False, ) with gr.Tab("Sentence Vector"): vec_df = gr.Dataframe( headers=["word", "count"], label="Word-frequency vector for selected sentence", interactive=False, ) status_md = gr.Markdown("", label="Status / Errors") # --------- Events --------- def on_process(text, file, clean): try: _ensure_nltk() raw_text, read_err = read_text_input(text, file) if read_err: return ( gr.update(choices=[], value=None), {}, [], [], [], [], [], f"**Status:** {read_err}", ) sentences, tokenized_sentences, bow, vocab = tokenize_pipeline(raw_text, clean) dd_choices = [f"{i+1}: {s[:60].strip()}{'...' if len(s) > 60 else ''}" for i, s in enumerate(sentences)] dd_value = dd_choices[0] if dd_choices else None tokenized_json = {f"Sentence {i+1}": tokens for i, tokens in enumerate(tokenized_sentences)} bow_rows = sorted(bow.items(), key=lambda kv: (-kv[1], kv[0])) vector_rows = [] if tokenized_sentences and vocab: vec_map = build_sentence_vector(tokenized_sentences, vocab, 0) vector_rows = [[w, c] for w, c in vec_map.items()] status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}. 
            status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}. Clean={'ON' if clean else 'OFF'}."

            return (
                gr.update(choices=dd_choices, value=dd_value),
                tokenized_json,
                [[w, c] for w, c in bow_rows],
                vector_rows,
                sentences,
                tokenized_sentences,
                vocab,
                status,
            )
        except LookupError as e:
            return (
                gr.update(choices=[], value=None),
                {},
                [],
                [],
                [],
                [],
                [],
                f"❌ NLTK resource error: {e}\n\nTry running:\n\n```\npython -m nltk.downloader punkt punkt_tab stopwords\n```",
            )
        except Exception as e:
            return (
                gr.update(choices=[], value=None),
                {},
                [],
                [],
                [],
                [],
                [],
                f"❌ Unexpected error: {type(e).__name__}: {e}",
            )

    process_btn.click(
        fn=on_process,
        inputs=[text_in, file_in, clean_opt],
        outputs=[
            sentence_dropdown,  # dropdown choices + value
            tokenized_out,      # JSON tokens
            bow_df,             # BOW table
            vec_df,             # initial vector table
            st_sentences,       # state: sentences
            st_tokenized,       # state: tokenized sentences
            st_vocab,           # state: vocabulary
            status_md,          # status/errors
        ],
    )

    def on_select_sentence(choice: str, tokenized_sentences, vocabulary):
        if not choice or not tokenized_sentences or not vocabulary:
            return []
        try:
            idx = int(choice.split(":")[0]) - 1
        except Exception:
            return []
        vec_map = build_sentence_vector(tokenized_sentences, vocabulary, idx)
        return [[w, c] for w, c in vec_map.items()]

    sentence_dropdown.change(
        fn=on_select_sentence,
        inputs=[sentence_dropdown, st_tokenized, st_vocab],
        outputs=[vec_df],
    )

if __name__ == "__main__":
    demo.launch()