Update app.py
app.py (CHANGED)
```diff
@@ -1,4 +1,5 @@
 import os
+import string
 from collections import Counter
 from typing import List, Tuple, Dict
 
```
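The newly imported `string` module feeds the punctuation filter added further down; for reference, `string.punctuation` is the stdlib constant holding the 32 ASCII punctuation characters:

```python
import string

# The exact removal set used by the cleaning step below.
print(string.punctuation)  # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
```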
```diff
@@ -7,7 +8,7 @@ import nltk
 
 # ---------- NLTK bootstrap ----------
 def _ensure_nltk():
-    # NLTK 3.9+
+    # NLTK 3.9+ may require both 'punkt' and 'punkt_tab'
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
```
```diff
@@ -18,7 +19,7 @@ def _ensure_nltk():
     try:
         nltk.download("punkt_tab", quiet=True)
     except Exception:
-        pass  #
+        pass  # older NLTK doesn't have punkt_tab
     try:
         nltk.data.find("corpora/stopwords")
     except LookupError:
```
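Stitching the two bootstrap hunks together, the pattern looks roughly like this; the `except LookupError` bodies outside the shown context lines are assumptions, not part of the diff:

```python
import nltk

def _ensure_nltk():
    # NLTK 3.9+ may require both 'punkt' and 'punkt_tab'
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", quiet=True)
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        try:
            nltk.download("punkt_tab", quiet=True)
        except Exception:
            pass  # older NLTK doesn't have punkt_tab
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet=True)
```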
```diff
@@ -61,10 +62,23 @@ def read_text_input(text: str, file_obj) -> Tuple[str, str]:
 
 
 def preprocess_tokens(tokens: List[str], clean: bool) -> List[str]:
+    """
+    Clean mode:
+      - lowercase
+      - remove English stopwords
+      - remove punctuation tokens (.,?!;:"'()[]{}- etc.)
+    Raw mode (clean=False):
+      - return tokens unchanged
+    """
     if not clean:
         return tokens
     stops = set(stopwords.words("english"))
-
+    punct = set(string.punctuation)
+    return [
+        t.lower()
+        for t in tokens
+        if t not in punct and t.lower() not in stops
+    ]
 
 
 def tokenize_pipeline(
```
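A quick check of the rewritten `preprocess_tokens`, assuming the stopwords corpus has been downloaded via `_ensure_nltk()`:

```python
from nltk.tokenize import word_tokenize

_ensure_nltk()
tokens = word_tokenize("The cat sat, on the mat.")

print(preprocess_tokens(tokens, clean=False))
# ['The', 'cat', 'sat', ',', 'on', 'the', 'mat', '.']
print(preprocess_tokens(tokens, clean=True))
# ['cat', 'sat', 'mat']  -- stopwords and punctuation gone, rest lowercased
```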
```diff
@@ -73,7 +87,7 @@ def tokenize_pipeline(
     """
     - Split text into sentences
     - Tokenize each sentence into words
-    - (Optionally)
+    - (Optionally) apply cleaning (lowercase, stopwords, punctuation removal)
     - Build Bag of Words across the full text
     Returns: sentences, tokenized_sentences, bow_counter, vocabulary_list
     """
```
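The body of `tokenize_pipeline` is outside this diff; a minimal sketch of the four documented steps, with the sorted vocabulary being an assumption:

```python
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize

def _pipeline_sketch(text: str, clean: bool):
    # 1) sentences  2) per-sentence tokens  3) bag of words  4) vocabulary
    sentences = sent_tokenize(text)
    tokenized = [preprocess_tokens(word_tokenize(s), clean) for s in sentences]
    bow = Counter(tok for sent in tokenized for tok in sent)
    vocabulary = sorted(bow)
    return sentences, tokenized, bow, vocabulary
```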
```diff
@@ -96,6 +110,10 @@
 def build_sentence_vector(
     tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
 ) -> Dict[str, int]:
+    """
+    Count occurrences of each vocab term inside the selected sentence.
+    Returns {word: count} for non-zero entries, sorted by count desc then word.
+    """
     if not tokenized_sentences or not vocabulary:
         return {}
     if idx < 0 or idx >= len(tokenized_sentences):
```
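Only the signature and guard clauses of `build_sentence_vector` appear in the hunk; a hypothetical completion matching the new docstring:

```python
from collections import Counter
from typing import Dict, List

def _vector_sketch(
    tokenized_sentences: List[List[str]], vocabulary: List[str], idx: int
) -> Dict[str, int]:
    counts = Counter(tokenized_sentences[idx])
    nonzero = {w: counts[w] for w in vocabulary if counts[w] > 0}
    # count desc, then word asc, as the docstring promises
    return dict(sorted(nonzero.items(), key=lambda kv: (-kv[1], kv[0])))
```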
```diff
@@ -131,7 +149,7 @@ Type/paste text or drop a **.txt** / **.docx** file.
 3) Count word occurrences (Bag of Words)
 4) Build a word-frequency vector for any selected sentence
 
-**
+**Clean option:** lowercasing + stopword removal **+ punctuation removal** (like scikit-learn defaults).
 
 > Tip: Legacy `.doc` files are not supported—please convert to `.docx`.
 """
```
```diff
@@ -151,9 +169,9 @@ Type/paste text or drop a **.txt** / **.docx** file.
     )
 
     clean_opt = gr.Checkbox(
-        label="Stopword
+        label="Stopword + lowercase + punctuation removal",
         value=True,
-        info='Removes common English stopwords (e.g., "
+        info='Removes common English stopwords, lowercases tokens, and strips punctuation tokens (e.g., ".", ",", "!").',
     )
 
     process_btn = gr.Button("Process", variant="primary")
```
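For context, a `gr.Checkbox` with an `info` tooltip feeds a callback as a plain bool; a minimal wiring sketch with hypothetical handler names (the app's real callback is not in this hunk):

```python
import gradio as gr

with gr.Blocks() as demo_sketch:
    clean_opt = gr.Checkbox(
        label="Stopword + lowercase + punctuation removal",
        value=True,
        info="Helper text rendered under the label.",
    )
    status_box = gr.Textbox(label="Status")
    process_btn = gr.Button("Process", variant="primary")
    # the checkbox value arrives as a bool in the handler
    process_btn.click(
        fn=lambda clean: f"Clean={'ON' if clean else 'OFF'}",
        inputs=clean_opt,
        outputs=status_box,
    )
```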
```diff
@@ -219,7 +237,7 @@ Type/paste text or drop a **.txt** / **.docx** file.
         vec_map = build_sentence_vector(tokenized_sentences, vocab, 0)
         vector_rows = [[w, c] for w, c in vec_map.items()]
 
-        status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}."
+        status = f"✅ Processed {len(sentences)} sentence(s). Vocabulary size: {len(vocab)}. Clean={'ON' if clean else 'OFF'}."
         return (
             gr.update(choices=dd_choices, value=dd_value),
             tokenized_json,
```
```diff
@@ -231,7 +249,6 @@ Type/paste text or drop a **.txt** / **.docx** file.
             status,
         )
     except LookupError as e:
-        # Common NLTK resource errors (e.g., punkt_tab)
         return (
             gr.update(choices=[], value=None),
             {},
```
```diff
@@ -287,3 +304,4 @@ Type/paste text or drop a **.txt** / **.docx** file.
 
 if __name__ == "__main__":
     demo.launch()
+
```