import gradio as gr
import advertools as adv
import pandas as pd
import re
from secrets import token_hex
import logging
import os
from markitdown import MarkItDown
from typing import Tuple, List, Optional
import validators
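# Third-party dependencies used below: gradio, advertools, pandas, markitdown, validators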
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize MarkItDown
md_converter = MarkItDown()
def validate_url(url: str) -> Tuple[bool, str]:
    """Validate the URL format, adding https:// if the scheme is missing."""
    if not url:
        return False, "URL is required"
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    if not validators.url(url):
        return False, "Invalid URL format"
    return True, url
def safe_crawl(url: str, output_file: str) -> bool:
    """Safely perform a web crawl with timeout and error handling."""
    try:
        adv.crawl(
            url,
            output_file,
            follow_links=False,
            custom_settings={
                'CLOSESPIDER_TIMEOUT': 30,
                'ROBOTSTXT_OBEY': True,
                'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
                'USER_AGENT': 'Mozilla/5.0 (compatible; LLMContentBot/1.0)',
                'DOWNLOAD_TIMEOUT': 10
            }
        )
        return True
    except Exception as e:
        logger.error(f"Crawl error for {url}: {str(e)}")
        return False
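# The advertools crawl output stores header/nav/footer link URLs and anchor texts
# as single '@@'-joined strings per page; process_links splits them back apart.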
def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
    """Process links based on selected types."""
    try:
        all_links = []
        if "All links" in link_types or not link_types:
            link_df = adv.crawlytics.links(crawl_df)
            for link, text in link_df[['link', 'text']].dropna().values:
                if text := text.strip():
                    text = re.sub(r'[\n\s]+', ' ', text)
                    all_links.append(f"## {text}\n[{text}]({link})")
        else:
            for link_type in link_types:
                type_match = re.findall(r"header|footer|nav", link_type.lower())
                if type_match:
                    col_prefix = type_match[0]
                    urls = crawl_df[f'{col_prefix}_links_url'].iloc[0]
                    texts = crawl_df[f'{col_prefix}_links_text'].iloc[0]
                    # Pages without links in this section yield NaN rather than a
                    # '@@'-joined string, so only split when both values are strings.
                    if isinstance(urls, str) and isinstance(texts, str):
                        urls = urls.split('@@')
                        texts = texts.split('@@')
                        for url, text in zip(urls, texts):
                            if text := text.strip():
                                text = re.sub(r'[\n\s]+', ' ', text)
                                all_links.append(f"## {text}\n[{text}]({url})")
        return "\n\n".join(all_links)
    except Exception as e:
        logger.error(f"Link processing error: {str(e)}")
        return ""
def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
    """Process website URL and generate markdown content."""
    valid, result = validate_url(url)
    if not valid:
        return "", result
    url = result
    output_file = f"crawl_{token_hex(6)}.jsonl"
    try:
        if not safe_crawl(url, output_file):
            return "", "Crawl failed or timed out"
        crawl_df = pd.read_json(output_file, lines=True)
        if crawl_df.empty:
            return "", "No data found for the URL"

        # Extract title and description
        title = crawl_df['title'].iloc[0] if 'title' in crawl_df.columns else "Untitled"
        meta_desc = crawl_df['meta_desc'].iloc[0] if 'meta_desc' in crawl_df.columns else ""

        # Process links
        links_content = process_links(crawl_df, link_types)

        # Generate final markdown
        content = f"# {title}\n\n"
        if meta_desc:
            content += f"> {meta_desc}\n\n"
        content += links_content
        return content, f"Successfully processed {url}"
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return "", f"Error: {str(e)}"
    finally:
        if os.path.exists(output_file):
            os.remove(output_file)
def process_file(file: gr.File) -> Tuple[str, str]:
    """Convert uploaded file to markdown."""
    if not file:
        return "", "No file uploaded"
    supported_extensions = {'.pdf', '.docx', '.pptx', '.xlsx', '.html', '.txt'}
    file_ext = os.path.splitext(file.name)[1].lower()
    if file_ext not in supported_extensions:
        return "", f"Unsupported file type: {file_ext}"
    try:
        result = md_converter.convert(file.name)
        return result.text_content, "File processed successfully"
    except Exception as e:
        logger.error(f"File processing error: {str(e)}")
        return "", f"Error processing file: {str(e)}"
# Custom CSS for styling
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');

body {
    font-family: 'Open Sans', sans-serif !important;
}

.primary-btn {
    background-color: #3452db !important;
}

.primary-btn:hover {
    background-color: #2a41af !important;
}
"""
# Create a custom theme
theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(
        name="blue",
        c50="#eef1ff",
        c100="#e0e5ff",
        c200="#c3cbff",
        c300="#a5b2ff",
        c400="#8798ff",
        c500="#6a7eff",
        c600="#3452db",
        c700="#2a41af",
        c800="#1f3183",
        c900="#152156",
        c950="#0a102b",
    )
)
# Create interface, applying the custom theme defined above
with gr.Blocks(theme=theme, css=css) as iface:
    gr.Markdown("# LLMs.txt Generator")

    with gr.Tab("Website URL"):
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="example.com"
        )
        link_types = gr.Dropdown(
            choices=["All links", "<header> links", "<nav> links", "<footer> links"],
            multiselect=True,
            value=["All links"],
            label="Link Types to Extract"
        )
        url_button = gr.Button("Process URL", variant="primary")
        url_output = gr.Textbox(
            label="Generated Content",
            lines=20,
            show_copy_button=True
        )
        url_status = gr.Textbox(label="Status")
        url_button.click(
            process_url,
            inputs=[url_input, link_types],
            outputs=[url_output, url_status]
        )

    with gr.Tab("File Converter"):
        file_input = gr.File(label="Upload Document")
        file_button = gr.Button("Convert to Markdown", variant="primary")
        file_output = gr.Textbox(
            label="Converted Content",
            lines=20,
            show_copy_button=True
        )
        file_status = gr.Textbox(label="Status")
        file_button.click(
            process_file,
            inputs=[file_input],
            outputs=[file_output, file_status]
        )
if __name__ == "__main__":
    iface.launch()