# create-llms-txt / app.py
# cyberandy's picture
# Update app.py
# 0221da4 verified
import gradio as gr
import advertools as adv
import pandas as pd
import re
from secrets import token_hex
import logging
import os
from markitdown import MarkItDown
from typing import Tuple, List, Optional
import validators
# Configure logging once at import time: INFO level, with a timestamped
# "time - logger - level - message" layout.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used for error reporting in this file.
logger = logging.getLogger(__name__)

# One shared MarkItDown converter instance, created at import time.
md_converter = MarkItDown()
def validate_url(url: str) -> Tuple[bool, str]:
    """Normalize a user-supplied URL and check that it is well formed.

    Surrounding whitespace is ignored, and ``https://`` is prepended when the
    value does not already start with ``http://`` or ``https://``. The format
    check itself is delegated to ``validators.url``; this function does not
    test whether the site is reachable.

    Args:
        url: Raw URL text as entered by the user. May be empty or ``None``.

    Returns:
        ``(True, normalized_url)`` when the URL passes validation, otherwise
        ``(False, error_message)``.
    """
    # Pasted URLs often carry stray spaces or newlines; strip them so they
    # neither defeat the "required" check nor end up inside the final URL.
    url = url.strip() if url else ""
    if not url:
        return False, "URL is required"
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    if not validators.url(url):
        return False, "Invalid URL format"
    return True, url
def safe_crawl(url: str, output_file: str) -> bool:
    """Run a single-page advertools crawl and report whether the call raised.

    Args:
        url: Address of the page to crawl.
        output_file: Path handed to ``adv.crawl`` as its output file.

    Returns:
        ``True`` when ``adv.crawl`` returned without raising, ``False`` when
        any exception was caught (the error is logged).
    """
    # Settings forwarded verbatim to the crawler via ``custom_settings``.
    crawl_settings = {
        'CLOSESPIDER_TIMEOUT': 30,
        'ROBOTSTXT_OBEY': True,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'USER_AGENT': 'Mozilla/5.0 (compatible; LLMContentBot/1.0)',
        'DOWNLOAD_TIMEOUT': 10,
    }
    try:
        adv.crawl(
            url,
            output_file,
            follow_links=False,
            custom_settings=crawl_settings,
        )
    except Exception as e:
        logger.error(f"Crawl error for {url}: {str(e)}")
        return False
    return True
def _link_entry(text: str, url: str) -> Optional[str]:
    """Return the markdown entry for one link, or ``None`` if its text is blank."""
    text = text.strip()
    if not text:
        return None
    # Collapse whitespace runs (including newlines) so the heading is one line.
    text = re.sub(r'\s+', ' ', text)
    return f"## {text}\n[{text}]({url})"


def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
    """Build the markdown link section from a crawl result.

    Args:
        crawl_df: Crawl output; only its first row is read for the
            header/nav/footer columns.
        link_types: Selected link categories. An empty/``None`` selection or
            one containing ``"All links"`` extracts every link via
            ``adv.crawlytics.links``; otherwise each entry is matched against
            ``header``, ``footer`` or ``nav`` and read from the
            ``<prefix>_links_url`` / ``<prefix>_links_text`` columns, whose
            values are ``@@``-separated strings.

    Returns:
        Markdown entries joined by blank lines, or ``""`` when nothing was
        found or an unexpected error occurred (the error is logged).
    """
    try:
        entries = []
        # Test emptiness first so a ``None`` selection falls back to all links
        # instead of raising on the ``in`` check.
        if not link_types or "All links" in link_types:
            link_df = adv.crawlytics.links(crawl_df)
            for link, text in link_df[['link', 'text']].dropna().values:
                entry = _link_entry(text, link)
                if entry is not None:
                    entries.append(entry)
        else:
            for link_type in link_types:
                match = re.search(r"header|footer|nav", link_type.lower())
                if match is None:
                    continue
                prefix = match.group(0)
                url_col = f'{prefix}_links_url'
                text_col = f'{prefix}_links_text'
                # A page without this section has no such columns; skip it
                # rather than discarding links gathered for other sections.
                if url_col not in crawl_df.columns or text_col not in crawl_df.columns:
                    continue
                urls = crawl_df[url_col].iloc[0]
                texts = crawl_df[text_col].iloc[0]
                # Missing values arrive as NaN (a float), which is truthy and
                # has no ``split``; only non-empty strings are usable.
                if not (isinstance(urls, str) and isinstance(texts, str)):
                    continue
                if not urls or not texts:
                    continue
                for url, text in zip(urls.split('@@'), texts.split('@@')):
                    entry = _link_entry(text, url)
                    if entry is not None:
                        entries.append(entry)
        return "\n\n".join(entries)
    except Exception as e:
        logger.error(f"Link processing error: {str(e)}")
        return ""
def _first_cell_text(df: pd.DataFrame, column: str, default: str) -> str:
    """Return the first-row value of ``column`` as stripped text, else ``default``."""
    if column not in df.columns:
        return default
    value = df[column].iloc[0]
    # Missing values come back as None/NaN; do not render them as "nan".
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    text = str(value).strip()
    return text or default


def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
    """Crawl a website URL and build markdown from its title, description and links.

    Args:
        url: User-supplied address; validated and normalized by
            ``validate_url`` before crawling.
        link_types: Link categories forwarded to ``process_links``.

    Returns:
        ``(markdown, status_message)``. ``markdown`` is ``""`` whenever
        validation, crawling or parsing fails; the status then explains why.
    """
    valid, result = validate_url(url)
    if not valid:
        return "", result
    url = result
    # Random suffix keeps concurrent requests from sharing a crawl file.
    output_file = f"crawl_{token_hex(6)}.jsonl"
    try:
        if not safe_crawl(url, output_file):
            return "", "Crawl failed or timed out"
        # If the crawl left no output file (or an empty one) there is nothing
        # to parse; say so instead of surfacing a raw pandas read error.
        if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
            return "", "No data found for the URL"
        crawl_df = pd.read_json(output_file, lines=True)
        if crawl_df.empty:
            return "", "No data found for the URL"
        # Extract title and description, falling back when absent or blank.
        title = _first_cell_text(crawl_df, 'title', "Untitled")
        meta_desc = _first_cell_text(crawl_df, 'meta_desc', "")
        # Process links
        links_content = process_links(crawl_df, link_types)
        # Generate final markdown
        content = f"# {title}\n\n"
        if meta_desc:
            content += f"> {meta_desc}\n\n"
        content += links_content
        return content, f"Successfully processed {url}"
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return "", f"Error: {str(e)}"
    finally:
        # Always remove the temporary crawl file, on success or failure.
        if os.path.exists(output_file):
            os.remove(output_file)
def process_file(file: Optional[object]) -> Tuple[str, str]:
    """Convert an uploaded document to markdown text.

    Args:
        file: Value received from the file-upload input. Either an object
            exposing the local path as ``.name`` or a plain path string is
            accepted (NOTE(review): which one Gradio delivers depends on its
            version/configuration — confirm). A falsy value means nothing was
            uploaded.

    Returns:
        ``(markdown_text, status_message)``. ``markdown_text`` is ``""`` when
        no file was given, the extension is unsupported, or conversion fails.
    """
    if not file:
        return "", "No file uploaded"
    # Use ``.name`` when present, otherwise treat the value itself as the path.
    file_path = getattr(file, "name", file)
    supported_extensions = {'.pdf', '.docx', '.pptx', '.xlsx', '.html', '.txt'}
    # Compare case-insensitively so "REPORT.PDF" is accepted.
    file_ext = os.path.splitext(file_path)[1].lower()
    if file_ext not in supported_extensions:
        return "", f"Unsupported file type: {file_ext}"
    try:
        result = md_converter.convert(file_path)
        return result.text_content, "File processed successfully"
    except Exception as e:
        logger.error(f"File processing error: {str(e)}")
        return "", f"Error processing file: {str(e)}"
# Custom CSS handed to the Gradio interface: imports the Open Sans web font,
# applies it to the page body, and defines normal/hover background colors for
# elements carrying the ``primary-btn`` class.
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');
body {
font-family: 'Open Sans', sans-serif !important;
}
.primary-btn {
background-color: #3452db !important;
}
.primary-btn:hover {
background-color: #2a41af !important;
}
"""
# Custom theme: the Soft base theme with a hand-picked blue ramp (lightest
# shade c50 through darkest c950) as its primary hue.
_primary_blue = gr.themes.colors.Color(
    name="blue",
    c50="#eef1ff",
    c100="#e0e5ff",
    c200="#c3cbff",
    c300="#a5b2ff",
    c400="#8798ff",
    c500="#6a7eff",
    c600="#3452db",
    c700="#2a41af",
    c800="#1f3183",
    c900="#152156",
    c950="#0a102b",
)
theme = gr.themes.Soft(primary_hue=_primary_blue)
# Build the two-tab interface. The module-level custom ``theme`` is passed in
# (previously a fresh default Soft theme was used, leaving it unused), and the
# action buttons carry the ``primary-btn`` class so the matching rules in
# ``css`` actually apply to them.
with gr.Blocks(theme=theme, css=css) as iface:
    gr.Markdown("# LLMs.txt Generator")

    # Tab 1: crawl a website and list its links as markdown.
    with gr.Tab("Website URL"):
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="example.com"
        )
        link_types = gr.Dropdown(
            choices=["All links", "<header> links", "<nav> links", "<footer> links"],
            multiselect=True,
            value=["All links"],
            label="Link Types to Extract"
        )
        url_button = gr.Button(
            "Process URL",
            variant="primary",
            elem_classes=["primary-btn"]
        )
        url_output = gr.Textbox(
            label="Generated Content",
            lines=20,
            show_copy_button=True
        )
        url_status = gr.Textbox(label="Status")
        # process_url returns (content, status), mapped onto the two outputs.
        url_button.click(
            process_url,
            inputs=[url_input, link_types],
            outputs=[url_output, url_status]
        )

    # Tab 2: convert an uploaded document to markdown.
    with gr.Tab("File Converter"):
        file_input = gr.File(label="Upload Document")
        file_button = gr.Button(
            "Convert to Markdown",
            variant="primary",
            elem_classes=["primary-btn"]
        )
        file_output = gr.Textbox(
            label="Converted Content",
            lines=20,
            show_copy_button=True
        )
        file_status = gr.Textbox(label="Status")
        # process_file returns (content, status), mapped onto the two outputs.
        file_button.click(
            process_file,
            inputs=[file_input],
            outputs=[file_output, file_status]
        )

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()