import os
import string
from collections import Counter
from typing import List, Tuple, Dict

import gradio as gr
import nltk

# ---------- NLTK bootstrap ----------
def _ensure_nltk():
    # NLTK 3.9+ may require both 'punkt' and 'punkt_tab'
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab", quiet=True)
        except Exception:
            pass  # older NLTK doesn't have punkt_tab
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet=True)


_ensure_nltk()

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
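
# Illustrative sketch (doctest-style, not executed by the app) of what the NLTK
# tokenizers return once the 'punkt' models are available:
#   >>> sent_tokenize("NLTK is fun. It tokenizes text.")
#   ['NLTK is fun.', 'It tokenizes text.']
#   >>> word_tokenize("NLTK is fun.")
#   ['NLTK', 'is', 'fun', '.']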

# ---------- Helpers ----------
def read_text_input(text: str, file_obj) -> Tuple[str, str]:
    """
    Priority: if a file is provided, read it; otherwise use the text box.
    Supports .txt and .docx (not legacy .doc).
    Returns (content, error_message). If error_message != "", content may be empty.
    """
    if file_obj:
        path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", str(file_obj))
        ext = os.path.splitext(path)[1].lower()
        if ext == ".txt":
            try:
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    return f.read(), ""
            except Exception as e:
                return "", f"❌ Error reading .txt: {e}"
        elif ext == ".docx":
            try:
                from docx import Document
            except Exception as e:
                return "", f"❌ python-docx import failed: {e}. Did you install requirements?"
            try:
                doc = Document(path)
                return "\n".join(p.text for p in doc.paragraphs), ""
            except Exception as e:
                return "", f"❌ Error reading .docx: {e}"
        else:
            return "", "❌ Unsupported file type. Please upload .txt or .docx (not legacy .doc)."
    return (text or "", "")
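
# Illustrative usage (sketch, not part of the app flow): with no file uploaded,
# the textbox content is passed through unchanged.
#   >>> read_text_input("hello world", None)
#   ('hello world', '')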

def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
    """
    Clean mode:
      - lowercase
      - remove English stopwords
      - remove punctuation tokens (.,?!;:"'()[]{}- etc.)
    Raw mode (clean=False):
      - return tokens unchanged
    """
    if not clean:
        return tokens
    stops = set(stopwords.words("english"))
    punct = set(string.punctuation)
    return [
        t.lower()
        for t in tokens
        # drop tokens made up entirely of punctuation (this also covers multi-char
        # tokens such as `` and '' emitted by word_tokenize) and English stopwords
        if not all(ch in punct for ch in t) and t.lower() not in stops
    ]
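
# Illustrative sketch (doctest-style, not executed by the app), assuming the
# default NLTK English stopword list:
#   >>> preprocess_tokens(word_tokenize("The cat sat on the mat."), clean=True)
#   ['cat', 'sat', 'mat']
#   >>> preprocess_tokens(word_tokenize("The cat sat on the mat."), clean=False)
#   ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']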

def tokenize_pipeline(
    raw_text: str, clean: bool
) -> Tuple[List[str], List[List[str]], Counter, List[str]]:
    """
    - Split text into sentences
    - Tokenize each sentence into words
    - (Optionally) apply cleaning (lowercase, stopwords, punctuation removal)
    - Build Bag of Words across the full text
    Returns: sentences, tokenized_sentences, bow_counter, vocabulary_list
    """
    if not raw_text.strip():
        return [], [], Counter(), []
    sentences = sent_tokenize(raw_text)
    tokenized_sentences = []
    for s in sentences:
        tokens = word_tokenize(s)
        tokens = preprocess_tokens(tokens, clean=clean)
        tokenized_sentences.append(tokens)
    all_words = [w for sent in tokenized_sentences for w in sent]
    bow = Counter(all_words)
    vocabulary = sorted(bow.keys())
    return sentences, tokenized_sentences, bow, vocabulary
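
# Illustrative sketch (not executed by the app): for the toy input
# "Dogs chase cats. Cats chase mice." with clean=True, this returns roughly:
#   sentences           -> ['Dogs chase cats.', 'Cats chase mice.']
#   tokenized_sentences -> [['dogs', 'chase', 'cats'], ['cats', 'chase', 'mice']]
#   bow                 -> Counter({'chase': 2, 'cats': 2, 'dogs': 1, 'mice': 1})
#   vocabulary          -> ['cats', 'chase', 'dogs', 'mice']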

def build_sentence_vector(
    tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
) -> Dict[str, int]:
    """
    Count occurrences of each vocab term inside the selected sentence.
    Returns {word: count} for non-zero entries, sorted by count desc then word.
    """
    if not tokenized_sentences or not vocabulary:
        return {}
    if idx < 0 or idx >= len(tokenized_sentences):
        return {}
    sent_tokens = tokenized_sentences[idx]
    counts = Counter(sent_tokens)
    vector = {word: counts[word] for word in vocabulary if counts[word] > 0}
    return dict(sorted(vector.items(), key=lambda kv: (-kv[1], kv[0])))
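
# Illustrative sketch (not executed by the app), continuing the toy example above:
#   >>> build_sentence_vector([['dogs', 'chase', 'cats'], ['cats', 'chase', 'mice']],
#   ...                       ['cats', 'chase', 'dogs', 'mice'], 0)
#   {'cats': 1, 'chase': 1, 'dogs': 1}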

# ---------- Gradio App ----------
SAMPLE_TEXT = """NLTK is a powerful library for text processing.
Text processing is essential for NLP tasks.
Bag of Words is a fundamental concept in NLP.
Tokenization splits sentences into words.
We can count word occurrences in text.
Word frequency vectors represent sentences numerically.
Vectorization helps in transforming text for machine learning.
Machine learning models can use BOW as input.
NLP tasks include classification and sentiment analysis.
Word frequency counts provide insight into text structure.
"""

with gr.Blocks(title="NLTK: Tokenize → Bag of Words → Sentence Vector") as demo:
    gr.Markdown(
        """
# NLTK Mini-Workbench
Type/paste text or drop a **.txt** / **.docx** file.

**Pipeline**
1) Install NLTK (auto-checked at startup)
2) Tokenize sentences into words
3) Count word occurrences (Bag of Words)
4) Build a word-frequency vector for any selected sentence

**Clean option:** lowercasing + stopword removal **+ punctuation removal** (like scikit-learn defaults).

> Tip: Legacy `.doc` files are not supported; please convert to `.docx`.
"""
    )
    with gr.Row():
        text_in = gr.Textbox(
            label="Input Text",
            value=SAMPLE_TEXT,
            lines=12,
            placeholder="Paste text here, or upload a file instead...",
        )
        file_in = gr.File(
            label="Or upload a file (.txt or .docx)",
            file_types=[".txt", ".docx"],
            type="filepath",
        )

    clean_opt = gr.Checkbox(
        label="Stopword + lowercase + punctuation removal",
        value=True,
        info='Removes common English stopwords, lowercases tokens, and strips punctuation tokens (e.g., ".", ",", "!").',
    )
    process_btn = gr.Button("Process", variant="primary")

    # Hidden state to carry processed artifacts between events
    st_sentences = gr.State([])
    st_tokenized = gr.State([])
    st_vocab = gr.State([])

    with gr.Row():
        sentence_dropdown = gr.Dropdown(
            choices=[],
            label="Select a sentence to vectorize",
            interactive=True,
        )

    with gr.Tab("Tokenized Sentences"):
        tokenized_out = gr.JSON(label="Tokens per sentence")
    with gr.Tab("Bag of Words"):
        bow_df = gr.Dataframe(
            headers=["word", "count"],
            label="Bag of Words (sorted by count desc)",
            interactive=False,
        )
    with gr.Tab("Sentence Vector"):
        vec_df = gr.Dataframe(
            headers=["word", "count"],
            label="Word-frequency vector for selected sentence",
            interactive=False,
        )

    status_md = gr.Markdown("", label="Status / Errors")

    # --------- Events ---------
    def on_process(text, file, clean):
        try:
            _ensure_nltk()
            raw_text, read_err = read_text_input(text, file)
            if read_err:
                return (
                    gr.update(choices=[], value=None),
                    {},
                    [],
                    [],
                    [],
                    [],
                    [],
                    f"**Status:** {read_err}",
                )
            sentences, tokenized_sentences, bow, vocab = tokenize_pipeline(raw_text, clean)
            dd_choices = [
                f"{i+1}: {s[:60].strip()}{'...' if len(s) > 60 else ''}"
                for i, s in enumerate(sentences)
            ]
            dd_value = dd_choices[0] if dd_choices else None
            tokenized_json = {f"Sentence {i+1}": tokens for i, tokens in enumerate(tokenized_sentences)}
            bow_rows = sorted(bow.items(), key=lambda kv: (-kv[1], kv[0]))
            vector_rows = []
            if tokenized_sentences and vocab:
                vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
                vector_rows = [[w, c] for w, c in vec_map.items()]
            status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}. Clean={'ON' if clean else 'OFF'}."
            return (
                gr.update(choices=dd_choices, value=dd_value),
                tokenized_json,
                [[w, c] for w, c in bow_rows],
                vector_rows,
                sentences,
                tokenized_sentences,
                vocab,
                status,
            )
        except LookupError as e:
            return (
                gr.update(choices=[], value=None),
                {},
                [],
                [],
                [],
                [],
                [],
                f"❌ NLTK resource error: {e}\n\nTry running:\n\n```\npython -m nltk.downloader punkt punkt_tab stopwords\n```",
            )
        except Exception as e:
            return (
                gr.update(choices=[], value=None),
                {},
                [],
                [],
                [],
                [],
                [],
                f"❌ Unexpected error: {type(e).__name__}: {e}",
            )

    process_btn.click(
        fn=on_process,
        inputs=[text_in, file_in, clean_opt],
        outputs=[
            sentence_dropdown,  # dropdown choices + value
            tokenized_out,      # JSON tokens
            bow_df,             # BOW table
            vec_df,             # initial vector table
            st_sentences,       # state: sentences
            st_tokenized,       # state: tokenized sentences
            st_vocab,           # state: vocabulary
            status_md,          # status/errors
        ],
    )

    def on_select_sentence(choice: str, tokenized_sentences, vocabulary):
        if not choice or not tokenized_sentences or not vocabulary:
            return []
        try:
            idx = int(choice.split(":")[0]) - 1
        except Exception:
            return []
        vec_map = build_sentence_vector(tokenized_sentences, vocabulary, idx)
        return [[w, c] for w, c in vec_map.items()]
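
    # Illustrative sketch (not executed by the app): the dropdown label encodes the
    # 1-based sentence index before the colon, so a choice like
    # "2: Text processing is essential for NLP tasks." maps to the second sentence.
    #   >>> int("2: Text processing is essential for NLP tasks.".split(":")[0]) - 1
    #   1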

    sentence_dropdown.change(
        fn=on_select_sentence,
        inputs=[sentence_dropdown, st_tokenized, st_vocab],
        outputs=[vec_df],
    )

if __name__ == "__main__":
    demo.launch()