eaglelandsonce committed · verified
Commit 447225a · 1 Parent(s): f70a0ba

Create app.py

Files changed (1)
  1. app.py +262 -0
app.py ADDED
@@ -0,0 +1,262 @@
+ import io
+ import os
+ from typing import List, Tuple
+
+ import gradio as gr
+
+ # --- NLTK setup --------------------------------------------------------------
+ import nltk
+
+ # Map each downloadable package to the data path used to probe for it, so the
+ # existence check looks in the right category (tokenizers/, corpora/, taggers/, ...)
+ NLTK_PACKAGES = [
+     ("punkt", "tokenizers/punkt"),
+     ("stopwords", "corpora/stopwords"),
+     ("wordnet", "corpora/wordnet"),
+     ("omw-1.4", "corpora/omw-1.4"),
+     ("averaged_perceptron_tagger", "taggers/averaged_perceptron_tagger"),
+     ("maxent_ne_chunker", "chunkers/maxent_ne_chunker"),
+     ("words", "corpora/words"),
+ ]
+
+ def ensure_nltk_resources() -> str:
+     """
+     Download any missing NLTK resources needed for all menu actions.
+     """
+     messages = []
+     for pkg, path in NLTK_PACKAGES:
+         try:
+             nltk.data.find(path)
+         except LookupError:
+             try:
+                 nltk.download(pkg, quiet=True)
+                 messages.append(f"Downloaded: {pkg}")
+             except Exception as e:
+                 messages.append(f"Failed {pkg}: {e}")
+     return " | ".join(messages) if messages else "All required resources already present."
+
+ # Import after ensuring NLTK available (safe even if not downloaded yet)
+ from nltk.tokenize import word_tokenize
+ from nltk.corpus import stopwords
+ from nltk.stem import PorterStemmer, WordNetLemmatizer
+ from nltk import pos_tag
+ from nltk.chunk import ne_chunk
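+
+ # Note: recent NLTK releases split some pickled models into separate "_tab"
+ # resources (e.g. "punkt_tab"); if tokenization or tagging raises a LookupError
+ # naming one of them, add that package to NLTK_PACKAGES as well.
+
+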
+ # --- Helpers ----------------------------------------------------------------
+ def read_file(upload) -> str:
+     """
+     Read text from a Gradio file input. Supports .txt and .docx.
+     Depending on the Gradio version, `upload` is either a filepath string
+     or a tempfile-like object whose .name is the path; handle both.
+     """
+     if upload is None:
+         return ""
+     path = upload if isinstance(upload, str) else (getattr(upload, "name", "") or "")
+     ext = os.path.splitext(path)[1].lower()
+
+     # Read raw bytes from disk rather than calling upload.read(),
+     # which is not available on every Gradio file type
+     with open(path, "rb") as f:
+         content = f.read()
+
+     if ext == ".txt":
+         try:
+             return content.decode("utf-8")
+         except UnicodeDecodeError:
+             # Fall back to latin-1 if the file uses an unexpected encoding
+             return content.decode("latin1")
+
+     elif ext == ".docx":
+         # Parse DOCX from bytes
+         try:
+             import docx  # python-docx
+         except ImportError:
+             return "ERROR: python-docx not installed. Add 'python-docx' to requirements.txt."
+         doc = docx.Document(io.BytesIO(content))
+         return "\n".join(p.text for p in doc.paragraphs)
+
+     else:
+         return f"Unsupported file type: {ext}. Please upload .txt or .docx."
+
+
+ def extract_ner(ne_tree) -> List[Tuple[str, str]]:
+     """
+     Convert an nltk.tree.Tree from ne_chunk into (entity_text, label) pairs.
+     """
+     entities = []
+     for subtree in ne_tree:
+         if hasattr(subtree, "label"):
+             label = subtree.label()
+             text = " ".join(token for token, _ in subtree.leaves())
+             entities.append((text, label))
+     return entities
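+
+ # For the default sample text, this typically yields pairs like
+ # [("Barack", "PERSON"), ("Obama", "PERSON"), ("United States", "GPE")];
+ # the exact grouping and labels depend on the maxent_ne_chunker model.
+
+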
+ # --- Core processing ---------------------------------------------------------
+ def process_text(
+     raw_text: str,
+     steps: List[str]
+ ) -> str:
+     """
+     Run the selected processing steps and return a markdown report.
+     """
+     if not raw_text or raw_text.strip() == "":
+         return "⚠️ No text provided."
+
+     report_lines = []
+     text = raw_text
+
+     # Ensure resources if the user forgot to click "Install/Download"
+     try:
+         # Probe one resource; if missing, auto-download silently
+         nltk.data.find("tokenizers/punkt")
+     except LookupError:
+         ensure_nltk_resources()
+
+     tokens = None
+     filtered_tokens = None
+     stemmed_tokens = None
+     lemmatized_tokens = None
+     pos_tags = None
+     ner_pairs = None
+
+     # Every step below needs tokens first
+     token_dependent_steps = {
+         "Tokenize text.",
+         "Remove stopwords.",
+         "Stem words.",
+         "Lemmatize words.",
+         "Tag parts of speech.",
+         "Extract named entities.",
+     }
+
+     # 1) Tokenize
+     if token_dependent_steps & set(steps):
+         tokens = word_tokenize(text)
+         if "Tokenize text." in steps:
+             report_lines.append("### Tokens")
+             report_lines.append(f"`{tokens}`\n")
+
+     # 2) Stopwords
+     if "Remove stopwords." in steps:
+         sw = set(stopwords.words("english"))
+         filtered_tokens = [w for w in tokens if w.lower() not in sw]
+         report_lines.append("### After Stopword Removal")
+         report_lines.append(f"`{filtered_tokens}`\n")
+     else:
+         filtered_tokens = tokens
+
+     # 3) Stemming
+     if "Stem words." in steps:
+         stemmer = PorterStemmer()
+         stemmed_tokens = [stemmer.stem(w) for w in (filtered_tokens or [])]
+         report_lines.append("### Stemmed Tokens (Porter)")
+         report_lines.append(f"`{stemmed_tokens}`\n")
+     else:
+         stemmed_tokens = filtered_tokens
+
+     # 4) Lemmatization
+     if "Lemmatize words." in steps:
+         lemmatizer = WordNetLemmatizer()
+         lemmatized_tokens = [lemmatizer.lemmatize(w) for w in (filtered_tokens or [])]
+         report_lines.append("### Lemmatized Tokens (WordNet)")
+         report_lines.append(f"`{lemmatized_tokens}`\n")
+     else:
+         lemmatized_tokens = stemmed_tokens or filtered_tokens
+
+     # 5) POS Tagging
+     # Note: tagging runs on the processed tokens, so removing stopwords or
+     # lemmatizing upstream changes the tags (and the NER below).
+     if "Tag parts of speech." in steps or "Extract named entities." in steps:
+         base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
+         pos_tags = pos_tag(base_for_tagging)
+         if "Tag parts of speech." in steps:
+             report_lines.append("### Part-of-Speech Tags")
+             # Pretty table
+             rows = ["| Token | POS |", "|---|---|"]
+             rows += [f"| {t} | {p} |" for (t, p) in pos_tags]
+             report_lines.append("\n".join(rows) + "\n")
+
+     # 6) NER
+     if "Extract named entities." in steps:
+         if not pos_tags:
+             # Defensive fallback; step 5 normally tags before we get here
+             pos_tags = pos_tag(lemmatized_tokens if lemmatized_tokens else (tokens or []))
+         ne_tree = ne_chunk(pos_tags, binary=False)
+         ner_pairs = extract_ner(ne_tree)
+
+         report_lines.append("### Named Entities")
+         if ner_pairs:
+             rows = ["| Entity | Label |", "|---|---|"]
+             rows += [f"| {ent} | {lbl} |" for (ent, lbl) in ner_pairs]
+             report_lines.append("\n".join(rows) + "\n")
+         else:
+             report_lines.append("_No named entities found._\n")
+
+     # Final markdown
+     return "\n".join(report_lines).strip() or "No steps selected."
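+
+ # Quick sanity check (hypothetical usage, not wired into the UI):
+ #   ensure_nltk_resources()
+ #   print(process_text("Dogs are running fast.", ["Tokenize text.", "Lemmatize words."]))
+
+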
+ # --- Gradio UI ---------------------------------------------------------------
+ MENU = [
+     "Install and download required resources.",
+     "Tokenize text.",
+     "Remove stopwords.",
+     "Stem words.",
+     "Lemmatize words.",
+     "Tag parts of speech.",
+     "Extract named entities.",
+ ]
+
+ DEFAULT_TEXT = (
+     "NLTK is a powerful library for text processing. "
+     "Barack Obama served as the 44th President of the United States and lived in Washington, D.C."
+ )
+
+ with gr.Blocks(title="NLTK Text Processing Toolkit") as demo:
+     gr.Markdown("# NLTK Text Processing Toolkit")
+     gr.Markdown(
+         "Type or paste text, or drag a `.txt` / `.docx` file. "
+         "Select the steps to run, then click **Process**. "
+         "Use **Install/Download Resources** once if needed."
+     )
+
+     with gr.Row():
+         with gr.Column():
+             text_in = gr.Textbox(
+                 label="Text Input",
+                 lines=10,
+                 value=DEFAULT_TEXT,
+                 placeholder="Type or paste text here..."
+             )
+             file_in = gr.File(
+                 label="...or drop a .txt / .docx file",
+                 file_types=[".txt", ".docx"]
+             )
+             steps_in = gr.CheckboxGroup(
+                 choices=MENU,
+                 value=[
+                     "Tokenize text.",
+                     "Remove stopwords.",
+                     "Lemmatize words.",
+                     "Tag parts of speech.",
+                     "Extract named entities.",
+                 ],
+                 label="Menu (choose one or more)"
+             )
+             with gr.Row():
+                 install_btn = gr.Button("Install/Download Resources")
+                 process_btn = gr.Button("Process", variant="primary")
+
+         with gr.Column():
+             status_out = gr.Textbox(label="Status", interactive=False)
+             result_out = gr.Markdown(label="Results")
+
+     # Button callbacks
+     def on_install():
+         return ensure_nltk_resources()
+
+     def on_process(text, file, steps):
+         # Use the typed text if present; otherwise fall back to the uploaded file
+         file_text = read_file(file) if file is not None else ""
+         # Surface file-reading errors directly
+         if file_text.startswith("ERROR:") or file_text.startswith("Unsupported file type:"):
+             return file_text
+         final_text = text.strip() if (text and text.strip()) else file_text
+         return process_text(final_text, steps or [])
+
+     install_btn.click(fn=on_install, inputs=None, outputs=status_out)
+     process_btn.click(fn=on_process, inputs=[text_in, file_in, steps_in], outputs=result_out)
+
+
+ if __name__ == "__main__":
+     # You can customize server_name/port if deploying remotely
+     demo.launch()