import gradio as gr
import advertools as adv
import pandas as pd
import re
from secrets import token_hex
import logging
import os
from markitdown import MarkItDown
from typing import Tuple, List, Optional
import validators
# Set up logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
# Initialize MarkItDown
md_converter = MarkItDown()
def validate_url(url: str) -> Tuple[bool, str]:
    """Validate URL format and accessibility."""
    if not url:
        return False, "URL is required"
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    if not validators.url(url):
        return False, "Invalid URL format"
    return True, url
def safe_crawl(url: str, output_file: str) -> bool:
    """Safely perform a web crawl with timeout and error handling."""
    try:
        adv.crawl(
            url,
            output_file,
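            # follow_links=False limits the crawl to the submitted page only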
            follow_links=False,
            custom_settings={
                "CLOSESPIDER_TIMEOUT": 30,
                "ROBOTSTXT_OBEY": True,
                "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
                "USER_AGENT": "Mozilla/5.0 (compatible; LLMContentBot/1.0)",
                "DOWNLOAD_TIMEOUT": 10,
            },
        )
        return True
    except Exception as e:
        logger.error(f"Crawl error for {url}: {str(e)}")
        return False
def clean_text(text: str) -> str:
    """Clean and format text by removing extra whitespace and normalizing spacing."""
    if not isinstance(text, str) or not text.strip():
        return ""
    # Collapse all whitespace (newlines, tabs, repeated spaces) into single spaces
    text = re.sub(r"\s+", " ", text)
    # Split camelCase words
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)
    # Clean extra spaces
    text = " ".join(text.split())
    return text.strip()
def process_link_pair(url: str, text: str, seen_links: set) -> Optional[str]:
    """Process a single link-text pair and return markdown if valid."""
    if not url or not text:
        return None
    url = url.strip()
    text = clean_text(text)
    if not text or not url or url in seen_links:
        return None
    seen_links.add(url)
    return f"## {text}\n[{text}]({url})"
def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
    """Process links based on selected types with deduplication."""
    try:
        all_links = []
        seen_links = set()  # Track unique URLs
        if "All links" in link_types or not link_types:
            link_df = adv.crawlytics.links(crawl_df)
            for link, text in link_df[["link", "text"]].dropna().values:
                if md_link := process_link_pair(link, text, seen_links):
                    all_links.append(md_link)
        else:
            for link_type in link_types:
                type_match = re.findall(r"header|footer|nav", link_type.lower())
                if type_match:
                    col_prefix = type_match[0]
                    urls = crawl_df[f"{col_prefix}_links_url"].iloc[0]
                    texts = crawl_df[f"{col_prefix}_links_text"].iloc[0]
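                    # advertools joins multiple link URLs/texts in one cell with "@@"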
                    # the cell is NaN when the page has no links of this type
                    if isinstance(urls, str) and isinstance(texts, str):
                        urls = urls.split("@@")
                        texts = texts.split("@@")
                        for url, text in zip(urls, texts):
                            if md_link := process_link_pair(url, text, seen_links):
                                all_links.append(md_link)
        return "\n\n".join(all_links)
    except Exception as e:
        logger.error(f"Link processing error: {str(e)}")
        return ""
def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
    """Process website URL and generate markdown content."""
    valid, result = validate_url(url)
    if not valid:
        return "", result
    url = result
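    # Random file name avoids collisions between concurrent requests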
    output_file = f"crawl_{token_hex(6)}.jsonl"
    try:
        if not safe_crawl(url, output_file):
            return "", "Crawl failed or timed out"
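        # advertools writes its crawl output as JSON lines, one record per URL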
        crawl_df = pd.read_json(output_file, lines=True)
        if crawl_df.empty:
            return "", "No data found for the URL"
        # Extract and clean title and description
        title = (
            clean_text(crawl_df["title"].iloc[0])
            if "title" in crawl_df.columns
            else "Untitled"
        )
        meta_desc = (
            clean_text(crawl_df["meta_desc"].iloc[0])
            if "meta_desc" in crawl_df.columns
            else ""
        )
        # Process links
        links_content = process_links(crawl_df, link_types)
        # Generate final markdown
        content = f"# {title}\n\n"
        if meta_desc:
            content += f"> {meta_desc}\n\n"
        content += links_content
        return content, f"Successfully processed {url}"
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return "", f"Error: {str(e)}"
    finally:
        if os.path.exists(output_file):
            os.remove(output_file)
def process_file(file: gr.File) -> Tuple[str, str]:
    """Convert uploaded file to markdown."""
    if not file:
        return "", "No file uploaded"
    supported_extensions = {".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt"}
    file_ext = os.path.splitext(file.name)[1].lower()
    if file_ext not in supported_extensions:
        return "", f"Unsupported file type: {file_ext}"
    try:
        result = md_converter.convert(file.name)
        return result.text_content, "File processed successfully"
    except Exception as e:
        logger.error(f"File processing error: {str(e)}")
        return "", f"Error processing file: {str(e)}"
# Custom CSS for styling
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');
body {
    font-family: 'Open Sans', sans-serif !important;
}
.primary-btn {
    background-color: #3452db !important;
}
.primary-btn:hover {
    background-color: #2a41af !important;
}
"""
# Create a custom theme
theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(
        name="blue",
        c50="#eef1ff",
        c100="#e0e5ff",
        c200="#c3cbff",
        c300="#a5b2ff",
        c400="#8798ff",
        c500="#6a7eff",
        c600="#3452db",
        c700="#2a41af",
        c800="#1f3183",
        c900="#152156",
        c950="#0a102b",
    )
)
# Create interface
with gr.Blocks(
    theme=theme,
    css=css,
) as iface:
    gr.Markdown("# LLMs.txt Generator")
    with gr.Tab("Website URL"):
        url_input = gr.Textbox(label="Website URL", placeholder="example.com")
        link_types = gr.Dropdown(
            choices=["All links", "<header> links", "<footer> links", "<nav> links"],