Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -199,6 +199,46 @@ async def fetch_codepen_project(codepen_url):
|
|
199 |
except Exception as e:
|
200 |
return f"Error: {e}", "", ""
|
201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
# Create the Gradio interface
|
203 |
with gr.Blocks() as demo:
|
204 |
gr.HTML(copy_button_html) # Add the "Copy Code" script
|
@@ -326,5 +366,38 @@ with gr.Blocks() as demo:
|
|
326 |
outputs=[html_output, css_output, js_output]
|
327 |
)
|
328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
329 |
# Launch the interface
|
330 |
demo.launch()
|
|
|
199 |
except Exception as e:
|
200 |
return f"Error: {e}", "", ""
|
201 |
|
202 |
+
# Web Data Extractor
|
203 |
+
async def extract_web_data(url):
    """Fetch *url* and pull a handful of descriptive details out of its HTML.

    Args:
        url: Address of the page to analyse.

    Returns:
        A 7-tuple ``(description, image_preview, colors, fonts, similar_code,
        videos, files)``:

        * description -- content of ``<meta name="description">``, or
          ``"No description available."`` when it is missing or empty.
        * image_preview -- absolute URL of the first ``<img>`` with a
          non-empty ``src``, or ``None`` when the page has none.
        * colors -- up to 5 distinct hex colour literals found in ``<style>``
          tags, in order of first appearance.
        * fonts -- up to 5 distinct Google Fonts stylesheet URLs, in order of
          first appearance.
        * similar_code -- the first 1000 characters of the parsed HTML.
        * videos -- the ``src`` of every ``<iframe>`` that has one.
        * files -- every ``<a href>`` whose path ends in a known document or
          archive extension.

        On any failure the tuple is ``("Error: <message>", None, [], [], "",
        [], [])`` so the caller always receives seven values.
    """
    # Function-scope imports keep this block self-contained regardless of
    # what the module header already imports.
    import re
    from urllib.parse import urljoin

    download_extensions = (".pdf", ".zip", ".doc", ".docx", ".xls", ".xlsx")
    # 3-, 4-, 6- or 8-digit hex colours. An ID selector made only of hex
    # digits (e.g. "#face") is indistinguishable here and will also match.
    hex_color = re.compile(r"#(?:[0-9a-fA-F]{3,4}){1,2}\b")

    try:
        # requests is blocking; run it in a worker thread so the event loop
        # stays responsive.
        response = await asyncio.to_thread(requests.get, url, timeout=5)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Site description: tolerate a <meta> tag without a content attribute.
        meta_tag = soup.find("meta", attrs={"name": "description"})
        description = (meta_tag.get("content") if meta_tag else None) or "No description available."

        # Image preview: first <img> with a usable src, resolved against the
        # final (post-redirect) page URL. None, not a placeholder string,
        # when absent, because an image output would try to load any string
        # as a path or URL.
        image_preview = None
        for img in soup.find_all("img", src=True):
            src = img["src"].strip()
            if not src:
                continue
            try:
                image_preview = urljoin(response.url, src)
            except ValueError:
                # Malformed src (e.g. broken IPv6 literal); try the next image.
                continue
            break

        # Colours: hex literals inside <style> tags, de-duplicated
        # case-insensitively while keeping first-seen order.
        found_colors = []
        for style in soup.find_all("style"):
            found_colors.extend(hex_color.findall(style.get_text()))
        colors = list(dict.fromkeys(c.lower() for c in found_colors))[:5]

        # Fonts: Google Fonts stylesheet links, de-duplicated in page order.
        font_links = [
            link["href"]
            for link in soup.find_all("link", attrs={"href": True})
            if "fonts.googleapis.com" in link["href"]
        ]
        fonts = list(dict.fromkeys(font_links))[:5]

        # "Similar code": a short excerpt of the parsed document.
        similar_code = str(soup)[:1000]

        # Videos: embedded iframes.
        videos = [iframe["src"] for iframe in soup.find_all("iframe", src=True)]

        # Files: downloadable links. Ignore the query string / fragment and
        # letter case when checking the extension.
        files = []
        for anchor in soup.find_all("a", attrs={"href": True}):
            href = anchor["href"]
            path_part = href.split("#", 1)[0].split("?", 1)[0].lower()
            if path_part.endswith(download_extensions):
                files.append(href)

        return description, image_preview, colors, fonts, similar_code, videos, files
    except Exception as e:
        # UI boundary: report the failure in the description slot instead of
        # raising, and keep the image slot empty (None).
        return f"Error: {e}", None, [], [], "", [], []
|
241 |
+
|
242 |
# Create the Gradio interface
|
243 |
with gr.Blocks() as demo:
|
244 |
gr.HTML(copy_button_html) # Add the "Copy Code" script
|
|
|
366 |
outputs=[html_output, css_output, js_output]
|
367 |
)
|
368 |
|
369 |
+
# Tab 5: Web Data Extractor
|
370 |
+
with gr.Tab("Web Data Extractor"):
    # Heading and a one-line usage hint at the top of the tab.
    gr.Markdown("## Web Data Extractor")
    gr.Markdown("Enter a URL to extract additional web data like description, image preview, colors, fonts, similar code, videos, and files.")

    # Input row: the page to analyse.
    with gr.Row():
        web_data_url_input = gr.Textbox(
            label="Enter URL",
            placeholder="https://example.com",
        )

    # Result rows: read-only fields, except the code excerpt, which is
    # created with interactive=True.
    with gr.Row():
        description_output = gr.Textbox(label="Site Description", interactive=False)
        image_preview_output = gr.Image(label="Image Preview", interactive=False)

    with gr.Row():
        colors_output = gr.Textbox(label="Colors", interactive=False)
        fonts_output = gr.Textbox(label="Fonts", interactive=False)

    with gr.Row():
        similar_code_output = gr.Textbox(
            label="Similar Code",
            interactive=True,
            elem_id="similar-code-output",
        )

    with gr.Row():
        videos_output = gr.Textbox(label="Videos", interactive=False)
        files_output = gr.Textbox(label="Files", interactive=False)

    # "Copy Code" button; passes the excerpt textbox's elem_id to copyCode().
    with gr.Row():
        gr.HTML("<button onclick='copyCode(\"similar-code-output\")'>Copy Code</button>")

    # Trigger: the outputs are listed in the same order as the tuple
    # returned by extract_web_data.
    submit_web_data_button = gr.Button("Extract Web Data")
    submit_web_data_button.click(
        fn=extract_web_data,
        inputs=[web_data_url_input],
        outputs=[
            description_output,
            image_preview_output,
            colors_output,
            fonts_output,
            similar_code_output,
            videos_output,
            files_output,
        ],
    )
|
401 |
+
|
402 |
# Launch the interface
|
403 |
demo.launch()
|