eaglelandsonce committed on
Commit 6d1e3da · verified · 1 Parent(s): 3dd5cd9

Update app.py

Files changed (1):
  1. app.py +27 -9

app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import string
 from collections import Counter
 from typing import List, Tuple, Dict
 
@@ -7,7 +8,7 @@ import nltk
 
 # ---------- NLTK bootstrap ----------
 def _ensure_nltk():
-    # NLTK 3.9+ needs both 'punkt' and 'punkt_tab'
+    # NLTK 3.9+ may require both 'punkt' and 'punkt_tab'
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
@@ -18,7 +19,7 @@ def _ensure_nltk():
     try:
         nltk.download("punkt_tab", quiet=True)
     except Exception:
-        pass  # old NLTK won't have punkt_tab; 'punkt' is enough there
+        pass  # older NLTK doesn't have punkt_tab
     try:
         nltk.data.find("corpora/stopwords")
     except LookupError:
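For context, the probe-then-download pattern these two hunks touch can be summarized as below; a minimal sketch, restructured as a loop for brevity (the function name `ensure_nltk_sketch` is hypothetical, not the app's `_ensure_nltk`; the resource paths are real NLTK data locations):

```python
import nltk

def ensure_nltk_sketch():
    # Probe for each resource and download only when missing,
    # so repeated app startups stay fast.
    for probe, name in [
        ("tokenizers/punkt", "punkt"),
        ("tokenizers/punkt_tab", "punkt_tab"),  # NLTK >= 3.9 tokenizer data
        ("corpora/stopwords", "stopwords"),
    ]:
        try:
            nltk.data.find(probe)
        except LookupError:
            try:
                nltk.download(name, quiet=True)
            except Exception:
                pass  # older NLTK has no punkt_tab; 'punkt' suffices there
```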
@@ -61,10 +62,23 @@ def read_text_input(text: str, file_obj) -> Tuple[str, str]:
 
 
 def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
+    """
+    Clean mode:
+      - lowercase
+      - remove English stopwords
+      - remove punctuation tokens (.,?!;:"'()[]{}- etc.)
+    Raw mode (clean=False):
+      - return tokens unchanged
+    """
     if not clean:
         return tokens
     stops = set(stopwords.words("english"))
-    return [t.lower() for t in tokens if t.lower() not in stops]
+    punct = set(string.punctuation)
+    return [
+        t.lower()
+        for t in tokens
+        if t not in punct and t.lower() not in stops
+    ]
 
 
 def tokenize_pipeline(
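The behavioral change in this hunk is easiest to see on a concrete token list; a small sketch of the new clean path (note `string.punctuation` holds single ASCII characters only, so multi-character tokens like `--` or `''` would survive):

```python
import string
from nltk.corpus import stopwords  # assumes the stopwords corpus is downloaded

tokens = ["The", "quick", "fox", ",", "it", "jumps", "!"]
stops = set(stopwords.words("english"))
punct = set(string.punctuation)

# Same filter as the new return: drop punctuation tokens and stopwords, lowercase the rest.
cleaned = [t.lower() for t in tokens if t not in punct and t.lower() not in stops]
print(cleaned)  # expected: ['quick', 'fox', 'jumps']
```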
@@ -73,7 +87,7 @@ def tokenize_pipeline(
     """
     - Split text into sentences
     - Tokenize each sentence into words
-    - (Optionally) lower + remove stopwords
+    - (Optionally) apply cleaning (lowercase, stopwords, punctuation removal)
     - Build Bag of Words across the full text
     Returns: sentences, tokenized_sentences, bow_counter, vocabulary_list
     """
@@ -96,6 +110,10 @@
 def build_sentence_vector(
     tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
 ) -> Dict[str, int]:
+    """
+    Count occurrences of each vocab term inside the selected sentence.
+    Returns {word: count} for non-zero entries, sorted by count desc then word.
+    """
     if not tokenized_sentences or not vocabulary:
         return {}
     if idx < 0 or idx >= len(tokenized_sentences):
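The added docstring pins down a return contract; a sketch of a function that meets it, assuming the guard clauses visible in the hunk (the body below is hypothetical, since the actual implementation sits outside the diff context):

```python
from collections import Counter
from typing import Dict, List

def sentence_vector_sketch(
    tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
) -> Dict[str, int]:
    # {word: count} for vocab terms in sentence idx, non-zero entries only,
    # sorted by count desc then word, per the new docstring.
    if not tokenized_sentences or not vocabulary:
        return {}
    if idx < 0 or idx >= len(tokenized_sentences):
        return {}
    counts = Counter(tokenized_sentences[idx])
    pairs = [(w, counts[w]) for w in vocabulary if counts[w] > 0]
    pairs.sort(key=lambda p: (-p[1], p[0]))
    return dict(pairs)  # dicts preserve insertion order in Python 3.7+
```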
@@ -131,7 +149,7 @@ Type/paste text or drop a **.txt** / **.docx** file.
 3) Count word occurrences (Bag of Words)
 4) Build a word-frequency vector for any selected sentence
 
-**Option:** Toggle *Stopword removal + lowercasing* for a cleaner Bag of Words.
+**Clean option:** lowercasing + stopword removal **+ punctuation removal** (like scikit-learn defaults).
 
 > Tip: Legacy `.doc` files are not supported—please convert to `.docx`.
 """
@@ -151,9 +169,9 @@
         )
 
         clean_opt = gr.Checkbox(
-            label="Stopword removal + lowercasing",
+            label="Stopword + lowercase + punctuation removal",
             value=True,
-            info='Removes common English stopwords (e.g., "is", "for", "the") and lowercases tokens.',
+            info='Removes common English stopwords, lowercases tokens, and strips punctuation tokens (e.g., ".", ",", "!").',
         )
 
         process_btn = gr.Button("Process", variant="primary")
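The hunk only relabels the checkbox; how its value reaches the pipeline is outside the diff. A minimal sketch of the likely wiring, assuming standard Gradio event binding (`process_btn.click` is real Gradio API; the callback `on_process` is a hypothetical stand-in for the app's handler):

```python
import gradio as gr

def on_process(text: str, clean: bool) -> str:
    # hypothetical callback: 'clean' receives the checkbox state
    return f"clean={clean}: {text[:40]}"

with gr.Blocks() as demo:
    txt = gr.Textbox(label="Text")
    clean_opt = gr.Checkbox(label="Stopword + lowercase + punctuation removal", value=True)
    out = gr.Textbox(label="Status")
    process_btn = gr.Button("Process", variant="primary")
    process_btn.click(fn=on_process, inputs=[txt, clean_opt], outputs=out)
```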
@@ -219,7 +237,7 @@
         vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
         vector_rows = [[w, c] for w, c in vec_map.items()]
 
-        status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}."
+        status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}. Clean={'ON' if clean else 'OFF'}."
         return (
             gr.update(choices=dd_choices, value=dd_value),
             tokenized_json,
@@ -231,7 +249,6 @@
             status,
         )
     except LookupError as e:
-        # Common NLTK resource errors (e.g., punkt_tab)
         return (
             gr.update(choices=[], value=None),
             {},
@@ -287,3 +304,4 @@
 
 if __name__ == "__main__":
     demo.launch()
+
 