Create app.py
app.py
ADDED
@@ -0,0 +1,262 @@
import io
import os
from typing import List, Tuple

import gradio as gr

# --- NLTK setup --------------------------------------------------------------
import nltk

# Map each downloadable package to the path nltk.data.find() expects, so the
# "already installed" check below also works for resources that do not live
# under tokenizers/. Note: newer NLTK releases (3.9+) may additionally want the
# *_tab variants (e.g. "punkt_tab"); add them here if tokenization complains.
NLTK_PACKAGES = {
    "punkt": "tokenizers/punkt",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
    "omw-1.4": "corpora/omw-1.4",
    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    "maxent_ne_chunker": "chunkers/maxent_ne_chunker",
    "words": "corpora/words",
}

def ensure_nltk_resources() -> str:
    """
    Download any missing NLTK resources needed for the menu actions.
    """
    messages = []
    for pkg, path in NLTK_PACKAGES.items():
        try:
            nltk.data.find(path)
        except LookupError:
            try:
                nltk.download(pkg, quiet=True)
                messages.append(f"Downloaded: {pkg}")
            except Exception as e:
                messages.append(f"Failed {pkg}: {e}")
    return " | ".join(messages) if messages else "All required resources already present."

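# Design note: ensure_nltk_resources() could also be called once here at import
# time so a fresh Space works without pressing the button; it is left as a
# button action to keep startup fast.
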
# Import after ensuring NLTK is available (these imports are safe even if the
# data packages have not been downloaded yet).
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.chunk import ne_chunk


# --- Helpers ----------------------------------------------------------------
def read_file(upload) -> str:
    """
    Read text from an uploaded file. Supports .txt and .docx.

    Depending on the Gradio version, gr.File may hand the callback a temp-file
    path (str) or a file-like object, so both are handled here.
    """
    if upload is None:
        return ""
    path = upload if isinstance(upload, str) else (getattr(upload, "name", "") or "")
    ext = os.path.splitext(path)[1].lower()

    # Read the raw bytes, preferring the path on disk over the object itself.
    if path and os.path.exists(path):
        with open(path, "rb") as fh:
            content = fh.read()
    elif hasattr(upload, "read"):
        content = upload.read()
    else:
        return "ERROR: could not read the uploaded file."

    if ext == ".txt":
        try:
            return content.decode("utf-8")
        except UnicodeDecodeError:
            # Fall back to latin-1 for files with unknown encodings.
            return content.decode("latin1")

    elif ext == ".docx":
        # Parse DOCX from bytes.
        try:
            import docx  # python-docx
        except ImportError:
            return "ERROR: python-docx not installed. Add 'python-docx' to requirements.txt."
        doc = docx.Document(io.BytesIO(content))
        return "\n".join(p.text for p in doc.paragraphs)

    else:
        return f"Unsupported file type: {ext}. Please upload .txt or .docx."

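# Minimal sketch of exercising read_file outside the UI (assumes a local
# sample.docx exists and python-docx is installed):
#
#   text = read_file("sample.docx")
#   print(text[:200])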

def extract_ner(ne_tree) -> List[Tuple[str, str]]:
    """
    Convert an nltk.tree.Tree from ne_chunk into (entity_text, label) pairs.
    """
    entities = []
    for subtree in ne_tree:
        if hasattr(subtree, "label"):
            label = subtree.label()
            text = " ".join(token for token, _ in subtree.leaves())
            entities.append((text, label))
    return entities

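# Illustrative (not executed): for the POS-tagged sentence
# "Barack Obama visited Washington.", ne_chunk + extract_ner would typically
# yield pairs such as [("Barack Obama", "PERSON"), ("Washington", "GPE")];
# the exact labels depend on the NLTK chunker model.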

# --- Core processing ---------------------------------------------------------
def process_text(
    raw_text: str,
    steps: List[str]
) -> str:
    """
    Run the selected processing steps and return a markdown report.
    """
    if not raw_text or raw_text.strip() == "":
        return "⚠️ No text provided."

    report_lines = []
    text = raw_text

    # Ensure resources are present even if the user skipped "Install/Download".
    try:
        nltk.data.find("tokenizers/punkt")  # probe one resource as a cheap check
    except LookupError:
        ensure_nltk_resources()

    tokens = None
    filtered_tokens = None
    stemmed_tokens = None
    lemmatized_tokens = None
    pos_tags = None
    ner_pairs = None

    token_steps = {
        "Tokenize text.",
        "Remove stopwords.",
        "Stem words.",
        "Lemmatize words.",
        "Tag parts of speech.",
        "Extract named entities.",
    }

    # 1) Tokenize (needed by every downstream step)
    if token_steps & set(steps):
        tokens = word_tokenize(text)
        if "Tokenize text." in steps:
            report_lines.append("### Tokens")
            report_lines.append(f"`{tokens}`\n")

    # 2) Stopwords
    if "Remove stopwords." in steps:
        sw = set(stopwords.words("english"))
        filtered_tokens = [w for w in tokens if w.lower() not in sw]
        report_lines.append("### After Stopword Removal")
        report_lines.append(f"`{filtered_tokens}`\n")
    else:
        filtered_tokens = tokens

    # 3) Stemming
    if "Stem words." in steps:
        stemmer = PorterStemmer()
        stemmed_tokens = [stemmer.stem(w) for w in (filtered_tokens or [])]
        report_lines.append("### Stemmed Tokens (Porter)")
        report_lines.append(f"`{stemmed_tokens}`\n")
    else:
        stemmed_tokens = filtered_tokens

    # 4) Lemmatization
    if "Lemmatize words." in steps:
        lemmatizer = WordNetLemmatizer()
        lemmatized_tokens = [lemmatizer.lemmatize(w) for w in (filtered_tokens or [])]
        report_lines.append("### Lemmatized Tokens (WordNet)")
        report_lines.append(f"`{lemmatized_tokens}`\n")
    else:
        lemmatized_tokens = stemmed_tokens or filtered_tokens

    # 5) POS tagging (tagging the lemmatized/filtered tokens keeps the report
    # consistent with earlier steps, at some cost to tagging accuracy)
    if "Tag parts of speech." in steps or "Extract named entities." in steps:
        base_for_tagging = lemmatized_tokens if lemmatized_tokens is not None else (tokens or [])
        pos_tags = pos_tag(base_for_tagging)
        if "Tag parts of speech." in steps:
            report_lines.append("### Part-of-Speech Tags")
            # Render as a markdown table
            rows = ["| Token | POS |", "|---|---|"]
            rows += [f"| {t} | {p} |" for (t, p) in pos_tags]
            report_lines.append("\n".join(rows) + "\n")

    # 6) Named-entity recognition
    if "Extract named entities." in steps:
        if not pos_tags:
            pos_tags = pos_tag(lemmatized_tokens if lemmatized_tokens else (tokens or []))
        ne_tree = ne_chunk(pos_tags, binary=False)
        ner_pairs = extract_ner(ne_tree)

        report_lines.append("### Named Entities")
        if ner_pairs:
            rows = ["| Entity | Label |", "|---|---|"]
            rows += [f"| {ent} | {lbl} |" for (ent, lbl) in ner_pairs]
            report_lines.append("\n".join(rows) + "\n")
        else:
            report_lines.append("_No named entities found._\n")

    # Final markdown report
    return "\n".join(report_lines).strip() or "No steps selected."

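# Minimal sketch of exercising the pipeline outside the UI (assumes the NLTK
# resources above are installed):
#
#   ensure_nltk_resources()
#   print(process_text("Barack Obama visited Washington.",
#                      ["Tokenize text.", "Extract named entities."]))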

# --- Gradio UI ---------------------------------------------------------------
MENU = [
    "Install and download required resources.",
    "Tokenize text.",
    "Remove stopwords.",
    "Stem words.",
    "Lemmatize words.",
    "Tag parts of speech.",
    "Extract named entities.",
]

DEFAULT_TEXT = (
    "NLTK is a powerful library for text processing. "
    "Barack Obama served as the 44th President of the United States and lived in Washington, D.C."
)

with gr.Blocks(title="NLTK Text Processing Toolkit") as demo:
    gr.Markdown("# NLTK Text Processing Toolkit")
    gr.Markdown(
        "Type or paste text, or drag a `.txt` / `.docx` file. "
        "Select the steps to run, then click **Process**. "
        "Use **Install/Download Resources** once if needed."
    )

    with gr.Row():
        with gr.Column():
            text_in = gr.Textbox(
                label="Text Input",
                lines=10,
                value=DEFAULT_TEXT,
                placeholder="Type or paste text here..."
            )
            file_in = gr.File(
                label="...or drop a .txt / .docx file",
                file_types=[".txt", ".docx"]
            )
            steps_in = gr.CheckboxGroup(
                choices=MENU,
                value=[
                    "Tokenize text.",
                    "Remove stopwords.",
                    "Lemmatize words.",
                    "Tag parts of speech.",
                    "Extract named entities.",
                ],
                label="Menu (choose one or more)"
            )
            with gr.Row():
                install_btn = gr.Button("Install/Download Resources")
                process_btn = gr.Button("Process", variant="primary")

        with gr.Column():
            status_out = gr.Textbox(label="Status", interactive=False)
            result_out = gr.Markdown(label="Results")

    # Button callbacks
    def on_install():
        return ensure_nltk_resources()

    def on_process(text, file, steps):
        # Typed text takes precedence; otherwise fall back to the uploaded file.
        file_text = read_file(file) if file is not None else ""
        # Surface file-reading problems directly in the results pane.
        if file_text.startswith("ERROR:") or file_text.startswith("Unsupported file type:"):
            return file_text
        final_text = text.strip() if (text and text.strip()) else file_text
        return process_text(final_text, steps or [])

    install_btn.click(fn=on_install, inputs=None, outputs=status_out)
    process_btn.click(fn=on_process, inputs=[text_in, file_in, steps_in], outputs=result_out)


if __name__ == "__main__":
    # You can customize server_name/port if deploying remotely
    demo.launch()
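
# A minimal requirements.txt sketch for this Space (exact version pins are up
# to you), covering the third-party imports used above:
#
#   gradio
#   nltk
#   python-docx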