# BOW_Workflow / app.py
import os
import string
from collections import Counter
from typing import List, Tuple, Dict
import gradio as gr
import nltk
# ---------- NLTK bootstrap ----------
def _ensure_nltk():
    # NLTK 3.9+ may require both 'punkt' and 'punkt_tab'
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab", quiet=True)
        except Exception:
            pass  # older NLTK doesn't have punkt_tab
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet=True)
_ensure_nltk()
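# If the automatic downloads above are blocked (e.g., no network access), the same
# resources can be fetched manually with:
#   python -m nltk.downloader punkt punkt_tab stopwords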
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
# ---------- Helpers ----------
def read_text_input(text: str, file_obj) -> Tuple[str, str]:
    """
    Priority: if a file is provided, read it; otherwise use the text box.
    Supports .txt and .docx (not legacy .doc).
    Returns (content, error_message). If error_message != "", content is empty.
    """
    if file_obj:
        path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", str(file_obj))
        ext = os.path.splitext(path)[1].lower()
        if ext == ".txt":
            try:
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    return f.read(), ""
            except Exception as e:
                return "", f"❌ Error reading .txt: {e}"
        elif ext == ".docx":
            try:
                from docx import Document
            except Exception as e:
                return "", f"❌ python-docx import failed: {e}. Did you install requirements?"
            try:
                doc = Document(path)
                return "\n".join(p.text for p in doc.paragraphs), ""
            except Exception as e:
                return "", f"❌ Error reading .docx: {e}"
        else:
            return "", "❌ Unsupported file type. Please upload .txt or .docx (not legacy .doc)."
    return (text or "", "")
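# Illustrative usage (the path below is hypothetical, not part of the app):
#   read_text_input("hello world", None)     -> ("hello world", "")
#   read_text_input("", "/tmp/notes.txt")    -> (file contents, "") on success,
#                                               or ("", "❌ Error reading .txt: ...") on failure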
def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
    """
    Clean mode:
      - lowercase
      - remove English stopwords
      - remove punctuation tokens (.,?!;:"'()[]{}- etc.)
    Raw mode (clean=False):
      - return tokens unchanged
    """
    if not clean:
        return tokens
    stops = set(stopwords.words("english"))
    punct = set(string.punctuation)
    return [
        t.lower()
        for t in tokens
        # drop tokens made up entirely of punctuation (covers multi-character
        # tokens such as "``", "''", and "..." emitted by word_tokenize)
        if not all(ch in punct for ch in t) and t.lower() not in stops
    ]
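# Illustrative usage:
#   preprocess_tokens(["The", "cat", "sat", "."], clean=True)  -> ["cat", "sat"]
#   preprocess_tokens(["The", "cat", "sat", "."], clean=False) -> ["The", "cat", "sat", "."]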
def tokenize_pipeline(
    raw_text: str, clean: bool
) -> Tuple[List[str], List[List[str]], Counter, List[str]]:
    """
    - Split text into sentences
    - Tokenize each sentence into words
    - (Optionally) apply cleaning (lowercase, stopwords, punctuation removal)
    - Build Bag of Words across the full text
    Returns: sentences, tokenized_sentences, bow_counter, vocabulary_list
    """
    if not raw_text.strip():
        return [], [], Counter(), []
    sentences = sent_tokenize(raw_text)
    tokenized_sentences = []
    for s in sentences:
        tokens = word_tokenize(s)
        tokens = preprocess_tokens(tokens, clean=clean)
        tokenized_sentences.append(tokens)
    all_words = [w for sent in tokenized_sentences for w in sent]
    bow = Counter(all_words)
    vocabulary = sorted(bow.keys())
    return sentences, tokenized_sentences, bow, vocabulary
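# Illustrative usage (assumes the NLTK punkt data is available):
#   sents, toks, bow, vocab = tokenize_pipeline("Dogs bark. Cats sleep.", clean=True)
#   sents -> ["Dogs bark.", "Cats sleep."]
#   toks  -> [["dogs", "bark"], ["cats", "sleep"]]
#   bow   -> Counter({"dogs": 1, "bark": 1, "cats": 1, "sleep": 1})
#   vocab -> ["bark", "cats", "dogs", "sleep"]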
def build_sentence_vector(
    tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
) -> Dict[str, int]:
    """
    Count occurrences of each vocab term inside the selected sentence.
    Returns {word: count} for non-zero entries, sorted by count desc then word.
    """
    if not tokenized_sentences or not vocabulary:
        return {}
    if idx < 0 or idx >= len(tokenized_sentences):
        return {}
    sent_tokens = tokenized_sentences[idx]
    counts = Counter(sent_tokens)
    vector = {word: counts[word] for word in vocabulary if counts[word] > 0}
    return dict(sorted(vector.items(), key=lambda kv: (-kv[1], kv[0])))
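# Illustrative usage, continuing the example above:
#   build_sentence_vector([["dogs", "bark"], ["cats", "sleep"]],
#                         ["bark", "cats", "dogs", "sleep"], idx=0)
#   -> {"bark": 1, "dogs": 1}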
# ---------- Gradio App ----------
SAMPLE_TEXT = """NLTK is a powerful library for text processing.
Text processing is essential for NLP tasks.
Bag of Words is a fundamental concept in NLP.
Tokenization splits sentences into words.
We can count word occurrences in text.
Word frequency vectors represent sentences numerically.
Vectorization helps in transforming text for machine learning.
Machine learning models can use BOW as input.
NLP tasks include classification and sentiment analysis.
Word frequency counts provide insight into text structure.
"""
with gr.Blocks(title="NLTK: Tokenize → Bag of Words → Sentence Vector") as demo:
    gr.Markdown(
        """
        # NLTK Mini-Workbench

        Type/paste text or drop a **.txt** / **.docx** file.

        **Pipeline**
        1) Install NLTK (auto-checked at startup)
        2) Tokenize sentences into words
        3) Count word occurrences (Bag of Words)
        4) Build a word-frequency vector for any selected sentence

        **Clean option:** lowercasing + stopword removal **+ punctuation removal** (similar to scikit-learn's `CountVectorizer` with `stop_words="english"`).

        > Tip: Legacy `.doc` files are not supported; please convert to `.docx`.
        """
    )
    with gr.Row():
        text_in = gr.Textbox(
            label="Input Text",
            value=SAMPLE_TEXT,
            lines=12,
            placeholder="Paste text here, or upload a file instead...",
        )
        file_in = gr.File(
            label="Or upload a file (.txt or .docx)",
            file_types=[".txt", ".docx"],
            type="filepath",
        )

    clean_opt = gr.Checkbox(
        label="Stopword + lowercase + punctuation removal",
        value=True,
        info='Removes common English stopwords, lowercases tokens, and strips punctuation tokens (e.g., ".", ",", "!").',
    )
    process_btn = gr.Button("Process", variant="primary")

    # Hidden state to carry processed artifacts between events
    st_sentences = gr.State([])
    st_tokenized = gr.State([])
    st_vocab = gr.State([])
    with gr.Row():
        sentence_dropdown = gr.Dropdown(
            choices=[],
            label="Select a sentence to vectorize",
            interactive=True,
        )

    with gr.Tab("Tokenized Sentences"):
        tokenized_out = gr.JSON(label="Tokens per sentence")
    with gr.Tab("Bag of Words"):
        bow_df = gr.Dataframe(
            headers=["word", "count"],
            label="Bag of Words (sorted by count desc)",
            interactive=False,
        )
    with gr.Tab("Sentence Vector"):
        vec_df = gr.Dataframe(
            headers=["word", "count"],
            label="Word-frequency vector for selected sentence",
            interactive=False,
        )

    status_md = gr.Markdown("", label="Status / Errors")
    # --------- Events ---------
    def on_process(text, file, clean):
        try:
            _ensure_nltk()
            raw_text, read_err = read_text_input(text, file)
            if read_err:
                return (
                    gr.update(choices=[], value=None),
                    {},
                    [],
                    [],
                    [],
                    [],
                    [],
                    f"**Status:** {read_err}",
                )
            sentences, tokenized_sentences, bow, vocab = tokenize_pipeline(raw_text, clean)
            dd_choices = [f"{i+1}: {s[:60].strip()}{'...' if len(s) > 60 else ''}" for i, s in enumerate(sentences)]
            dd_value = dd_choices[0] if dd_choices else None
            tokenized_json = {f"Sentence {i+1}": tokens for i, tokens in enumerate(tokenized_sentences)}
            bow_rows = sorted(bow.items(), key=lambda kv: (-kv[1], kv[0]))
            vector_rows = []
            if tokenized_sentences and vocab:
                vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
                vector_rows = [[w, c] for w, c in vec_map.items()]
            status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}. Clean={'ON' if clean else 'OFF'}."
            return (
                gr.update(choices=dd_choices, value=dd_value),
                tokenized_json,
                [[w, c] for w, c in bow_rows],
                vector_rows,
                sentences,
                tokenized_sentences,
                vocab,
                status,
            )
        except LookupError as e:
            return (
                gr.update(choices=[], value=None),
                {},
                [],
                [],
                [],
                [],
                [],
                f"❌ NLTK resource error: {e}\n\nTry running:\n\n```\npython -m nltk.downloader punkt punkt_tab stopwords\n```",
            )
        except Exception as e:
            return (
                gr.update(choices=[], value=None),
                {},
                [],
                [],
                [],
                [],
                [],
                f"❌ Unexpected error: {type(e).__name__}: {e}",
            )
    process_btn.click(
        fn=on_process,
        inputs=[text_in, file_in, clean_opt],
        outputs=[
            sentence_dropdown,  # dropdown choices + value
            tokenized_out,      # JSON tokens
            bow_df,             # BOW table
            vec_df,             # initial vector table
            st_sentences,       # state: sentences
            st_tokenized,       # state: tokenized sentences
            st_vocab,           # state: vocabulary
            status_md,          # status/errors
        ],
    )
    def on_select_sentence(choice: str, tokenized_sentences, vocabulary):
        if not choice or not tokenized_sentences or not vocabulary:
            return []
        try:
            idx = int(choice.split(":")[0]) - 1
        except Exception:
            return []
        vec_map = build_sentence_vector(tokenized_sentences, vocabulary, idx)
        return [[w, c] for w, c in vec_map.items()]
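    # Illustrative: a dropdown label such as "3: Bag of Words is a fundamental..."
    # parses to idx == 2, so the third tokenized sentence is vectorized.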
    sentence_dropdown.change(
        fn=on_select_sentence,
        inputs=[sentence_dropdown, st_tokenized, st_vocab],
        outputs=[vec_df],
    )
if __name__ == "__main__":
    demo.launch()