import os
import string
from collections import Counter
from typing import List, Tuple, Dict

import gradio as gr
import nltk

# ---------- NLTK bootstrap ----------
def _ensure_nltk():
    # NLTK 3.9+ may require both 'punkt' and 'punkt_tab'
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab", quiet=True)
        except Exception:
            pass  # older NLTK doesn't have punkt_tab
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet=True)


_ensure_nltk()

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
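
# Illustrative sketch (doctest-style, not executed by the app) of what the NLTK
# tokenizers return once the 'punkt' models are available:
#   >>> sent_tokenize("NLTK is fun. It tokenizes text.")
#   ['NLTK is fun.', 'It tokenizes text.']
#   >>> word_tokenize("NLTK is fun.")
#   ['NLTK', 'is', 'fun', '.']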

# ---------- Helpers ----------
def read_text_input(text: str, file_obj) -> Tuple[str, str]:
    """
    Priority: if a file is provided, read it; otherwise use the text box.
    Supports .txt and .docx (not legacy .doc).
    Returns (content, error_message). If error_message != "", content may be empty.
    """
    if file_obj:
        path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", str(file_obj))
        ext = os.path.splitext(path)[1].lower()
        if ext == ".txt":
            try:
                with open(path, "r", encoding="utf-8", errors="ignore") as f:
                    return f.read(), ""
            except Exception as e:
                return "", f"❌ Error reading .txt: {e}"
        elif ext == ".docx":
            try:
                from docx import Document
            except Exception as e:
                return "", f"❌ python-docx import failed: {e}. Did you install requirements?"
            try:
                doc = Document(path)
                return "\n".join(p.text for p in doc.paragraphs), ""
            except Exception as e:
                return "", f"❌ Error reading .docx: {e}"
        else:
            return "", "❌ Unsupported file type. Please upload .txt or .docx (not legacy .doc)."
    return (text or "", "")
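
# Illustrative usage (sketch, not part of the app flow): with no file uploaded,
# the textbox content is passed through unchanged.
#   >>> read_text_input("hello world", None)
#   ('hello world', '')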

def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
    """
    Clean mode:
      - lowercase
      - remove English stopwords
      - remove punctuation tokens (.,?!;:"'()[]{}- etc.)
    Raw mode (clean=False):
      - return tokens unchanged
    """
    if not clean:
        return tokens
    stops = set(stopwords.words("english"))
    punct = set(string.punctuation)
    return [
        t.lower()
        for t in tokens
        # drop tokens made up entirely of punctuation (this also covers multi-char
        # tokens such as `` and '' emitted by word_tokenize) and English stopwords
        if not all(ch in punct for ch in t) and t.lower() not in stops
    ]
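
# Illustrative sketch (doctest-style, not executed by the app), assuming the
# default NLTK English stopword list:
#   >>> preprocess_tokens(word_tokenize("The cat sat on the mat."), clean=True)
#   ['cat', 'sat', 'mat']
#   >>> preprocess_tokens(word_tokenize("The cat sat on the mat."), clean=False)
#   ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']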

def tokenize_pipeline(
    raw_text: str, clean: bool
) -> Tuple[List[str], List[List[str]], Counter, List[str]]:
    """
    - Split text into sentences
    - Tokenize each sentence into words
    - (Optionally) apply cleaning (lowercase, stopwords, punctuation removal)
    - Build Bag of Words across the full text
    Returns: sentences, tokenized_sentences, bow_counter, vocabulary_list
    """
    if not raw_text.strip():
        return [], [], Counter(), []
    sentences = sent_tokenize(raw_text)
    tokenized_sentences = []
    for s in sentences:
        tokens = word_tokenize(s)
        tokens = preprocess_tokens(tokens, clean=clean)
        tokenized_sentences.append(tokens)
    all_words = [w for sent in tokenized_sentences for w in sent]
    bow = Counter(all_words)
    vocabulary = sorted(bow.keys())
    return sentences, tokenized_sentences, bow, vocabulary
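
# Illustrative sketch (not executed by the app): for the toy input
# "Dogs chase cats. Cats chase mice." with clean=True, this returns roughly:
#   sentences           -> ['Dogs chase cats.', 'Cats chase mice.']
#   tokenized_sentences -> [['dogs', 'chase', 'cats'], ['cats', 'chase', 'mice']]
#   bow                 -> Counter({'chase': 2, 'cats': 2, 'dogs': 1, 'mice': 1})
#   vocabulary          -> ['cats', 'chase', 'dogs', 'mice']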

def build_sentence_vector(
    tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
) -> Dict[str, int]:
    """
    Count occurrences of each vocab term inside the selected sentence.
    Returns {word: count} for non-zero entries, sorted by count desc then word.
    """
    if not tokenized_sentences or not vocabulary:
        return {}
    if idx < 0 or idx >= len(tokenized_sentences):
        return {}
    sent_tokens = tokenized_sentences[idx]
    counts = Counter(sent_tokens)
    vector = {word: counts[word] for word in vocabulary if counts[word] > 0}
    return dict(sorted(vector.items(), key=lambda kv: (-kv[1], kv[0])))
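
# Illustrative sketch (not executed by the app), continuing the toy example above:
#   >>> build_sentence_vector([['dogs', 'chase', 'cats'], ['cats', 'chase', 'mice']],
#   ...                       ['cats', 'chase', 'dogs', 'mice'], 0)
#   {'cats': 1, 'chase': 1, 'dogs': 1}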

# ---------- Gradio App ----------
SAMPLE_TEXT = """NLTK is a powerful library for text processing.
Text processing is essential for NLP tasks.
Bag of Words is a fundamental concept in NLP.
Tokenization splits sentences into words.
We can count word occurrences in text.
Word frequency vectors represent sentences numerically.
Vectorization helps in transforming text for machine learning.
Machine learning models can use BOW as input.
NLP tasks include classification and sentiment analysis.
Word frequency counts provide insight into text structure.
"""

with gr.Blocks(title="NLTK: Tokenize → Bag of Words → Sentence Vector") as demo:
    gr.Markdown(
        """
# NLTK Mini-Workbench
Type/paste text or drop a **.txt** / **.docx** file.

**Pipeline**
1) Install NLTK (auto-checked at startup)
2) Tokenize sentences into words
3) Count word occurrences (Bag of Words)
4) Build a word-frequency vector for any selected sentence

**Clean option:** lowercasing + stopword removal **+ punctuation removal** (like scikit-learn defaults).

> Tip: Legacy `.doc` files are not supported; please convert to `.docx`.
"""
    )
    with gr.Row():
        text_in = gr.Textbox(
            label="Input Text",
            value=SAMPLE_TEXT,
            lines=12,
            placeholder="Paste text here, or upload a file instead...",
        )
        file_in = gr.File(
            label="Or upload a file (.txt or .docx)",
            file_types=[".txt", ".docx"],
            type="filepath",
        )

    clean_opt = gr.Checkbox(
        label="Stopword + lowercase + punctuation removal",
        value=True,
        info='Removes common English stopwords, lowercases tokens, and strips punctuation tokens (e.g., ".", ",", "!").',
    )
    process_btn = gr.Button("Process", variant="primary")

    # Hidden state to carry processed artifacts between events
    st_sentences = gr.State([])
    st_tokenized = gr.State([])
    st_vocab = gr.State([])

    with gr.Row():
        sentence_dropdown = gr.Dropdown(
            choices=[],
            label="Select a sentence to vectorize",
            interactive=True,
        )

    with gr.Tab("Tokenized Sentences"):
        tokenized_out = gr.JSON(label="Tokens per sentence")
    with gr.Tab("Bag of Words"):
        bow_df = gr.Dataframe(
            headers=["word", "count"],
            label="Bag of Words (sorted by count desc)",
            interactive=False,
        )
    with gr.Tab("Sentence Vector"):
        vec_df = gr.Dataframe(
            headers=["word", "count"],
            label="Word-frequency vector for selected sentence",
            interactive=False,
        )

    status_md = gr.Markdown("", label="Status / Errors")

    # --------- Events ---------
    def on_process(text, file, clean):
        try:
            _ensure_nltk()
            raw_text, read_err = read_text_input(text, file)
            if read_err:
                return (
                    gr.update(choices=[], value=None),
                    {},
                    [],
                    [],
                    [],
                    [],
                    [],
                    f"**Status:** {read_err}",
                )
            sentences, tokenized_sentences, bow, vocab = tokenize_pipeline(raw_text, clean)
            dd_choices = [
                f"{i+1}: {s[:60].strip()}{'...' if len(s) > 60 else ''}"
                for i, s in enumerate(sentences)
            ]
            dd_value = dd_choices[0] if dd_choices else None
            tokenized_json = {f"Sentence {i+1}": tokens for i, tokens in enumerate(tokenized_sentences)}
            bow_rows = sorted(bow.items(), key=lambda kv: (-kv[1], kv[0]))
            vector_rows = []
            if tokenized_sentences and vocab:
                vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
                vector_rows = [[w, c] for w, c in vec_map.items()]
            status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}. Clean={'ON' if clean else 'OFF'}."
            return (
                gr.update(choices=dd_choices, value=dd_value),
                tokenized_json,
                [[w, c] for w, c in bow_rows],
                vector_rows,
                sentences,
                tokenized_sentences,
                vocab,
                status,
            )
        except LookupError as e:
            return (
                gr.update(choices=[], value=None),
                {},
                [],
                [],
                [],
                [],
                [],
                f"❌ NLTK resource error: {e}\n\nTry running:\n\n```\npython -m nltk.downloader punkt punkt_tab stopwords\n```",
            )
        except Exception as e:
            return (
                gr.update(choices=[], value=None),
                {},
                [],
                [],
                [],
                [],
                [],
                f"❌ Unexpected error: {type(e).__name__}: {e}",
            )

    process_btn.click(
        fn=on_process,
        inputs=[text_in, file_in, clean_opt],
        outputs=[
            sentence_dropdown,  # dropdown choices + value
            tokenized_out,      # JSON tokens
            bow_df,             # BOW table
            vec_df,             # initial vector table
            st_sentences,       # state: sentences
            st_tokenized,       # state: tokenized sentences
            st_vocab,           # state: vocabulary
            status_md,          # status/errors
        ],
    )

    def on_select_sentence(choice: str, tokenized_sentences, vocabulary):
        if not choice or not tokenized_sentences or not vocabulary:
            return []
        try:
            idx = int(choice.split(":")[0]) - 1
        except Exception:
            return []
        vec_map = build_sentence_vector(tokenized_sentences, vocabulary, idx)
        return [[w, c] for w, c in vec_map.items()]
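
    # Illustrative sketch (not executed by the app): the dropdown label encodes the
    # 1-based sentence index before the colon, so a choice like
    # "2: Text processing is essential for NLP tasks." maps to the second sentence.
    #   >>> int("2: Text processing is essential for NLP tasks.".split(":")[0]) - 1
    #   1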

    sentence_dropdown.change(
        fn=on_select_sentence,
        inputs=[sentence_dropdown, st_tokenized, st_vocab],
        outputs=[vec_df],
    )

if __name__ == "__main__":
    demo.launch()