import gradio as gr
import advertools as adv
import pandas as pd
import re
from secrets import token_hex
import logging
import os
from markitdown import MarkItDown
from typing import Tuple, List, Optional
import validators

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Initialize MarkItDown
md_converter = MarkItDown()


def validate_url(url: str) -> Tuple[bool, str]:
    """Validate the URL format and normalize the scheme."""
    if not url:
        return False, "URL is required"
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    if not validators.url(url):
        return False, "Invalid URL format"
    return True, url


def safe_crawl(url: str, output_file: str) -> bool:
    """Safely perform a web crawl with timeout and error handling."""
    try:
        adv.crawl(
            url,
            output_file,
            follow_links=False,
            custom_settings={
                'CLOSESPIDER_TIMEOUT': 30,
                'ROBOTSTXT_OBEY': True,
                'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
                'USER_AGENT': 'Mozilla/5.0 (compatible; LLMContentBot/1.0)',
                'DOWNLOAD_TIMEOUT': 10
            }
        )
        return True
    except Exception as e:
        logger.error(f"Crawl error for {url}: {str(e)}")
        return False


def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
    """Process links based on selected types."""
    try:
        all_links = []
        if "All links" in link_types or not link_types:
            link_df = adv.crawlytics.links(crawl_df)
            for link, text in link_df[['link', 'text']].dropna().values:
                if text := text.strip():
                    text = re.sub(r'[\n\s]+', ' ', text)
                    all_links.append(f"## {text}\n[{text}]({link})")
        else:
            for link_type in link_types:
                type_match = re.findall(r"header|footer|nav", link_type.lower())
                if type_match:
                    col_prefix = type_match[0]
                    urls = crawl_df[f'{col_prefix}_links_url'].iloc[0]
                    texts = crawl_df[f'{col_prefix}_links_text'].iloc[0]
                    if urls and texts:
                        # advertools joins multiple values in one cell with '@@'
                        urls = urls.split('@@')
                        texts = texts.split('@@')
                        for url, text in zip(urls, texts):
                            if text := text.strip():
                                text = re.sub(r'[\n\s]+', ' ', text)
                                all_links.append(f"## {text}\n[{text}]({url})")
        return "\n\n".join(all_links)
    except Exception as e:
        logger.error(f"Link processing error: {str(e)}")
        return ""


def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
    """Process website URL and generate markdown content."""
    valid, result = validate_url(url)
    if not valid:
        return "", result

    url = result
    output_file = f"crawl_{token_hex(6)}.jsonl"

    try:
        if not safe_crawl(url, output_file):
            return "", "Crawl failed or timed out"

        crawl_df = pd.read_json(output_file, lines=True)
        if crawl_df.empty:
            return "", "No data found for the URL"

        # Extract title and description
        title = crawl_df['title'].iloc[0] if 'title' in crawl_df.columns else "Untitled"
        meta_desc = crawl_df['meta_desc'].iloc[0] if 'meta_desc' in crawl_df.columns else ""

        # Process links
        links_content = process_links(crawl_df, link_types)

        # Generate final markdown
        content = f"# {title}\n\n"
        if meta_desc:
            content += f"> {meta_desc}\n\n"
        content += links_content

        return content, f"Successfully processed {url}"
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return "", f"Error: {str(e)}"
    finally:
        if os.path.exists(output_file):
            os.remove(output_file)


def process_file(file: gr.File) -> Tuple[str, str]:
    """Convert uploaded file to markdown."""
    if not file:
        return "", "No file uploaded"

    supported_extensions = {'.pdf', '.docx', '.pptx', '.xlsx', '.html', '.txt'}
    file_ext = os.path.splitext(file.name)[1].lower()

    if file_ext not in supported_extensions:
        return "", f"Unsupported file type: {file_ext}"

    try:
        result = md_converter.convert(file.name)
        return result.text_content, "File processed successfully"
    except Exception as e:
        logger.error(f"File processing error: {str(e)}")
        return "", f"Error processing file: {str(e)}"


# Custom CSS for styling
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');

body {
    font-family: 'Open Sans', sans-serif !important;
}

.primary-btn {
    background-color: #3452db !important;
}

.primary-btn:hover {
    background-color: #2a41af !important;
}
"""

# Create a custom theme
theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(
        name="blue",
        c50="#eef1ff",
        c100="#e0e5ff",
        c200="#c3cbff",
        c300="#a5b2ff",
        c400="#8798ff",
        c500="#6a7eff",
        c600="#3452db",
        c700="#2a41af",
        c800="#1f3183",
        c900="#152156",
        c950="#0a102b",
    )
)

# Create interface
with gr.Blocks(theme=theme, css=css) as iface:  # apply the custom theme defined above
    gr.Markdown("# LLMs.txt Generator")

    with gr.Tab("Website URL"):
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="example.com"
        )
        link_types = gr.Dropdown(
            choices=["All links", "
links", "