eaglelandsonce committed
Commit 3dd5cd9 · verified · 1 parent: b041b2e

Update app.py

Files changed (1)
  1. app.py +86 -65
app.py CHANGED
@@ -5,12 +5,20 @@ from typing import List, Tuple, Dict
 import gradio as gr
 import nltk

-# Ensure NLTK resources are available at startup (safe to call repeatedly)
 def _ensure_nltk():
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
         nltk.download("punkt", quiet=True)
     try:
         nltk.data.find("corpora/stopwords")
     except LookupError:
@@ -22,39 +30,37 @@ from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords

 # ---------- Helpers ----------
-
-def read_text_input(text: str, file_obj) -> str:
     """
     Priority: if a file is provided, read it; otherwise use text box.
     Supports .txt and .docx (not legacy .doc).
     """
-    if file_obj is not None:
-        path = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
         ext = os.path.splitext(path)[1].lower()
         if ext == ".txt":
-            with open(path, "r", encoding="utf-8", errors="ignore") as f:
-                return f.read()
         elif ext == ".docx":
             try:
                 from docx import Document
             except Exception as e:
-                return f"ERROR: python-docx not installed or failed to import: {e}"
             try:
                 doc = Document(path)
-                return "\n".join(p.text for p in doc.paragraphs)
             except Exception as e:
-                return f"ERROR reading .docx: {e}"
         else:
-            return "ERROR: Unsupported file type. Please upload .txt or .docx."
-    return text or ""


 def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
-    """
-    Optionally lowercases and removes English stopwords.
-    Leaves punctuation/nums as-is (tokenizer keeps them); the Bag of Words
-    will reflect exactly what remains after stopword filtering.
-    """
     if not clean:
         return tokens
     stops = set(stopwords.words("english"))
@@ -90,16 +96,10 @@ def tokenize_pipeline(
 def build_sentence_vector(
     tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
 ) -> Dict[str, int]:
-    """
-    Count occurrences of each vocab term inside the selected sentence.
-    Returns a {word: count} mapping (only non-zero entries for clarity).
-    """
     if not tokenized_sentences or not vocabulary:
         return {}
-
     if idx < 0 or idx >= len(tokenized_sentences):
         return {}
-
     sent_tokens = tokenized_sentences[idx]
     counts = Counter(sent_tokens)
     vector = {word: counts[word] for word in vocabulary if counts[word] > 0}

@@ -107,7 +107,6 @@ def build_sentence_vector(


 # ---------- Gradio App ----------
-
 SAMPLE_TEXT = """NLTK is a powerful library for text processing.
 Text processing is essential for NLP tasks.
 Bag of Words is a fundamental concept in NLP.
@@ -124,15 +123,17 @@ with gr.Blocks(title="NLTK: Tokenize → Bag of Words → Sentence Vector") as d
     gr.Markdown(
         """
 # NLTK Mini-Workbench
-Type/paste text or drop a **.txt** / **.docx** file.
-Then click **Process** to:
 1) Install NLTK (auto-checked at startup)
 2) Tokenize sentences into words
 3) Count word occurrences (Bag of Words)
 4) Build a word-frequency vector for any selected sentence

-**Option:** Toggle *Stopword removal + lowercasing* to get a cleaner Bag of Words.
-> Note: Legacy `.doc` files are not supported—please convert to `.docx`.
         """
     )

@@ -177,7 +178,6 @@ Then click **Process** to:
             headers=["word", "count"],
             label="Bag of Words (sorted by count desc)",
             interactive=False,
-            wrap=True,
         )

     with gr.Tab("Sentence Vector"):
@@ -185,52 +185,74 @@ Then click **Process** to:
             headers=["word", "count"],
             label="Word-frequency vector for selected sentence",
             interactive=False,
-            wrap=True,
         )

-    # --------- Events ---------

     def on_process(text, file, clean):
-        # Ensure required NLTK bits exist (esp. for fresh environments)
-        _ensure_nltk()
-
-        raw_text = read_text_input(text, file)
-        # If read_text_input returned an error string, pass it through gracefully
-        if raw_text.startswith("ERROR"):
             return (
                 gr.update(choices=[], value=None),
                 [],
                 [],
                 [],
                 [],
                 [],
             )
-
-        sentences, tokenized_sentences, bow, vocab = tokenize_pipeline(raw_text, clean)
-
-        # Prepare UI artifacts
-        # Sentence dropdown: "1: <first 60 chars>"
-        dd_choices = [f"{i+1}: {s[:60].strip()}{'...' if len(s) > 60 else ''}" for i, s in enumerate(sentences)]
-        dd_value = dd_choices[0] if dd_choices else None
-
-        tokenized_json = {f"Sentence {i+1}": tokens for i, tokens in enumerate(tokenized_sentences)}
-        bow_rows = sorted(bow.items(), key=lambda kv: (-kv[1], kv[0]))
-
-        # Build initial vector for sentence 1 if available
-        vector_rows = []
-        if tokenized_sentences and vocab:
-            vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
-            vector_rows = [[w, c] for w, c in vec_map.items()]
-
-        return (
-            gr.update(choices=dd_choices, value=dd_value),
-            tokenized_json,
-            [[w, c] for w, c in bow_rows],
-            vector_rows,
-            sentences,
-            tokenized_sentences,
-            vocab,
-        )

     process_btn.click(
         fn=on_process,
@@ -243,6 +265,7 @@ Then click **Process** to:
             st_sentences,       # state: sentences
             st_tokenized,       # state: tokenized sentences
             st_vocab,           # state: vocabulary
         ],
     )

@@ -250,7 +273,6 @@ Then click **Process** to:
         if not choice or not tokenized_sentences or not vocabulary:
             return []
         try:
-            # Choice looks like "3: <preview>"
             idx = int(choice.split(":")[0]) - 1
         except Exception:
             return []

@@ -264,5 +286,4 @@ Then click **Process** to:
     )

 if __name__ == "__main__":
-    # Launch on http://127.0.0.1:7860
     demo.launch()
 
 import gradio as gr
 import nltk

+# ---------- NLTK bootstrap ----------
 def _ensure_nltk():
+    # NLTK 3.9+ needs both 'punkt' and 'punkt_tab'
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
         nltk.download("punkt", quiet=True)
+    try:
+        nltk.data.find("tokenizers/punkt_tab")
+    except LookupError:
+        try:
+            nltk.download("punkt_tab", quiet=True)
+        except Exception:
+            pass  # old NLTK won't have punkt_tab; 'punkt' is enough there
     try:
         nltk.data.find("corpora/stopwords")
     except LookupError:
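If you prefer to warm the NLTK cache before the app starts, the same three resources can be fetched once up front; a minimal standalone sketch (not part of the commit, the loop form is illustrative):

import nltk

# Fetch punkt, punkt_tab, and stopwords only if they are missing locally.
for pkg, path in [("punkt", "tokenizers/punkt"),
                  ("punkt_tab", "tokenizers/punkt_tab"),
                  ("stopwords", "corpora/stopwords")]:
    try:
        nltk.data.find(path)
    except LookupError:
        nltk.download(pkg, quiet=True)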
 
 from nltk.corpus import stopwords

 # ---------- Helpers ----------
+def read_text_input(text: str, file_obj) -> Tuple[str, str]:
     """
     Priority: if a file is provided, read it; otherwise use text box.
     Supports .txt and .docx (not legacy .doc).
+    Returns (content, error_message). If error_message != "", content may be empty.
     """
+    if file_obj:
+        path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", str(file_obj))
         ext = os.path.splitext(path)[1].lower()
         if ext == ".txt":
+            try:
+                with open(path, "r", encoding="utf-8", errors="ignore") as f:
+                    return f.read(), ""
+            except Exception as e:
+                return "", f"❌ Error reading .txt: {e}"
         elif ext == ".docx":
             try:
                 from docx import Document
             except Exception as e:
+                return "", f"❌ python-docx import failed: {e}. Did you install requirements?"
             try:
                 doc = Document(path)
+                return "\n".join(p.text for p in doc.paragraphs), ""
             except Exception as e:
+                return "", f"❌ Error reading .docx: {e}"
         else:
+            return "", "❌ Unsupported file type. Please upload .txt or .docx (not legacy .doc)."
+    return (text or "", "")


 def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
     if not clean:
         return tokens
     stops = set(stopwords.words("english"))
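A quick illustration of the new (content, error) contract, so callers branch on the returned message instead of sniffing for an "ERROR" prefix (the path below is a placeholder, not from the repo):

raw_text, err = read_text_input("", "sample.docx")   # hypothetical uploaded file
if err:
    print(err)            # would be surfaced in the UI status area
else:
    print(raw_text[:80])  # first 80 characters of the document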
 
 def build_sentence_vector(
     tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
 ) -> Dict[str, int]:
     if not tokenized_sentences or not vocabulary:
         return {}
     if idx < 0 or idx >= len(tokenized_sentences):
         return {}
     sent_tokens = tokenized_sentences[idx]
     counts = Counter(sent_tokens)
     vector = {word: counts[word] for word in vocabulary if counts[word] > 0}
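A tiny worked example of the vector this builds (made-up tokens and vocabulary):

sents = [["nlp", "is", "fun"], ["nlp", "nlp", "rocks"]]
vocab = ["fun", "is", "nlp", "rocks"]
build_sentence_vector(sents, vocab, 1)   # -> {'nlp': 2, 'rocks': 1}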
 


 # ---------- Gradio App ----------
 SAMPLE_TEXT = """NLTK is a powerful library for text processing.
 Text processing is essential for NLP tasks.
 Bag of Words is a fundamental concept in NLP.
 
     gr.Markdown(
         """
 # NLTK Mini-Workbench
+Type/paste text or drop a **.txt** / **.docx** file.
+
+**Pipeline**
 1) Install NLTK (auto-checked at startup)
 2) Tokenize sentences into words
 3) Count word occurrences (Bag of Words)
 4) Build a word-frequency vector for any selected sentence

+**Option:** Toggle *Stopword removal + lowercasing* for a cleaner Bag of Words.
+
+> Tip: Legacy `.doc` files are not supported—please convert to `.docx`.
         """
     )
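The four steps listed in the UI text map onto plain NLTK calls plus a Counter; a compact sketch of the same pipeline outside Gradio (the sample text is arbitrary, and this is not the app's tokenize_pipeline):

from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize

text = "NLTK is a powerful library. Text processing is essential for NLP."
sentences = sent_tokenize(text)                            # split into sentences
tokenized = [word_tokenize(s) for s in sentences]          # words per sentence
bow = Counter(tok for sent in tokenized for tok in sent)   # Bag of Words over all tokens
vector = dict(Counter(tokenized[0]))                       # frequency vector for sentence 1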
             headers=["word", "count"],
             label="Bag of Words (sorted by count desc)",
             interactive=False,
         )

     with gr.Tab("Sentence Vector"):
 
             headers=["word", "count"],
             label="Word-frequency vector for selected sentence",
             interactive=False,
         )

+    status_md = gr.Markdown("", label="Status / Errors")

+    # --------- Events ---------
     def on_process(text, file, clean):
+        try:
+            _ensure_nltk()
+            raw_text, read_err = read_text_input(text, file)
+            if read_err:
+                return (
+                    gr.update(choices=[], value=None),
+                    {},
+                    [],
+                    [],
+                    [],
+                    [],
+                    [],
+                    f"**Status:** {read_err}",
+                )
+
+            sentences, tokenized_sentences, bow, vocab = tokenize_pipeline(raw_text, clean)
+
+            dd_choices = [f"{i+1}: {s[:60].strip()}{'...' if len(s) > 60 else ''}" for i, s in enumerate(sentences)]
+            dd_value = dd_choices[0] if dd_choices else None
+
+            tokenized_json = {f"Sentence {i+1}": tokens for i, tokens in enumerate(tokenized_sentences)}
+            bow_rows = sorted(bow.items(), key=lambda kv: (-kv[1], kv[0]))
+
+            vector_rows = []
+            if tokenized_sentences and vocab:
+                vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
+                vector_rows = [[w, c] for w, c in vec_map.items()]
+
+            status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}."
+            return (
+                gr.update(choices=dd_choices, value=dd_value),
+                tokenized_json,
+                [[w, c] for w, c in bow_rows],
+                vector_rows,
+                sentences,
+                tokenized_sentences,
+                vocab,
+                status,
+            )
+        except LookupError as e:
+            # Common NLTK resource errors (e.g., punkt_tab)
             return (
                 gr.update(choices=[], value=None),
+                {},
                 [],
                 [],
                 [],
                 [],
                 [],
+                f"❌ NLTK resource error: {e}\n\nTry running:\n\n```\npython -m nltk.downloader punkt punkt_tab stopwords\n```",
+            )
+        except Exception as e:
+            return (
+                gr.update(choices=[], value=None),
+                {},
+                [],
+                [],
+                [],
+                [],
+                [],
+                f"❌ Unexpected error: {type(e).__name__}: {e}",
             )
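All three failure branches return the same shape as the success path (eight values, matching the click outputs wired below); a hypothetical helper, not in the commit, could keep those payloads in sync:

def _empty_outputs(status_msg: str):
    # Hypothetical refactor: one place to build the empty payload plus status.
    return (gr.update(choices=[], value=None), {}, [], [], [], [], [], status_msg)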
 
 

     process_btn.click(
         fn=on_process,

             st_sentences,       # state: sentences
             st_tokenized,       # state: tokenized sentences
             st_vocab,           # state: vocabulary
+            status_md,          # status/errors
         ],
     )
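The eight-element return of on_process has to line up one-to-one with this outputs list; a sketch of the full wiring, where every component name except the st_* states and status_md is assumed rather than shown in this hunk:

process_btn.click(
    fn=on_process,
    inputs=[text_in, file_in, clean_chk],   # assumed: textbox, file upload, checkbox
    outputs=[
        sentence_dd,      # assumed dropdown of sentence previews
        tokens_json,      # assumed JSON view of tokenized sentences
        bow_table,        # assumed Bag of Words dataframe
        vector_table,     # assumed sentence-vector dataframe
        st_sentences,
        st_tokenized,
        st_vocab,
        status_md,        # status/errors
    ],
)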

         if not choice or not tokenized_sentences or not vocabulary:
             return []
         try:
             idx = int(choice.split(":")[0]) - 1
         except Exception:
             return []
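The dropdown value is the preview string built in on_process ("<n>: <first 60 chars>..."), so the zero-based sentence index is recovered from the numeric prefix:

choice = "3: Bag of Words is a fundamental concept in NLP."
idx = int(choice.split(":")[0]) - 1   # -> 2, i.e. the third sentence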
 
     )

 if __name__ == "__main__":
     demo.launch()
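demo.launch() with no arguments is what the commit keeps, which is the sensible default on Spaces; for an explicit local bind, Gradio's launch accepts a host and port:

demo.launch(server_name="127.0.0.1", server_port=7860)   # optional explicit local bind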