cyberandy committed on
Commit 0221da4 · verified · 1 Parent(s): 2b3088a

Update app.py

Files changed (1)
  1. app.py +134 -104
app.py CHANGED
@@ -5,109 +5,141 @@ import re
  from secrets import token_hex
  import logging
  import os
- from markitdown import MarkItDown # Import MarkItDown

  # Set up logging
- logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- # Initialize the MarkItDown converter
  md_converter = MarkItDown()

- def safe_crawl(url, output_file):
-     """Safely perform a web crawl with a timeout"""
      try:
-         adv.crawl(url, output_file,
-                   follow_links=False, # Only crawl the main page
-                   custom_settings={'CLOSESPIDER_TIMEOUT': 30}) # 30-second timeout
          return True
      except Exception as e:
-         logger.error(f"Crawl error: {str(e)}")
          return False

- def explode_link_df(crawl_df, col_group):
-     """Process links from a specific column group in the crawl dataframe"""
      try:
-         link = crawl_df[f'{col_group}_links_url'].str.split('@@').explode()
-         text = crawl_df[f'{col_group}_links_text'].str.split('@@').explode()
          all_links = []

-         for link, text in zip(link.dropna(), text.dropna()):
-             if text and text.strip():
-                 text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
-                 text = re.sub(r"\s{3,}", " ", text)
-                 all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))

          return "\n\n".join(all_links)
      except Exception as e:
-         logger.error(f"Error processing {col_group} links: {str(e)}")
          return ""

- def process_url(url, link_types):
-     """Process a website URL and generate the llms.txt content"""
-     if not url:
-         return "", "Please enter a URL"

      try:
-         if not url.startswith(("http://", "https://")):
-             url = "https://" + url
-
-         # Generate a unique filename for this crawl
-         output_file = token_hex(6)
-         jsonl_path = f"{output_file}.jsonl"
-
-         try:
-             if not safe_crawl(url, jsonl_path):
-                 return "", "Crawl failed or timed out"
-             crawl_df = pd.read_json(jsonl_path, lines=True)
-         finally:
-             if os.path.exists(jsonl_path):
-                 os.remove(jsonl_path)
-
          if crawl_df.empty:
-             return "", "Crawl produced no data for the URL."
-
-         # Use default values if the expected columns are missing or empty
-         title = "Untitled"
-         meta_desc = ""
-         if 'title' in crawl_df.columns and not pd.isna(crawl_df['title'].iloc[0]):
-             title = crawl_df['title'].iloc[0]
-         if 'meta_desc' in crawl_df.columns and not pd.isna(crawl_df['meta_desc'].iloc[0]):
-             meta_desc = crawl_df['meta_desc'].iloc[0]
-
-         all_links = []
-         if link_types and "All links" not in link_types:
-             for link_type in link_types:
-                 type_match = re.findall(r"header|footer|nav", link_type)
-                 if type_match:
-                     link_content = explode_link_df(crawl_df, type_match[0])
-                     if link_content:
-                         all_links.append(link_content)
-                         all_links.append('\n\n')
-         else:
-             # Process all links using advertools
-             link_df = adv.crawlytics.links(crawl_df)
-             for link, text in link_df[['link', 'text']].values:
-                 if text and text.strip():
-                     text = re.sub(r"\n+", " ", text.strip(), re.DOTALL)
-                     text = re.sub(r"\s{3,}", " ", text)
-                     all_links.append("\n".join(['## ' + text, f"[{text}]({link})"]))

-         links_text = "\n\n".join(all_links)
-         final_content = f"# {title}\n\n> {meta_desc}\n\n{links_text}"
-         return final_content, f"Successfully crawled website. Found {len(all_links)} sections."
-
      except Exception as e:
-         logger.error(f"Error processing URL {url}: {str(e)}")
          return "", f"Error: {str(e)}"
-
- def process_file(file):
-     """Convert an uploaded file into Markdown using MarkItDown"""
      try:
          result = md_converter.convert(file.name)
-         return result.text_content, "File processed successfully."
      except Exception as e:
-         logger.error(f"Error processing file {file.name}: {str(e)}")
          return "", f"Error processing file: {str(e)}"

  # Custom CSS for styling
@@ -145,52 +177,50 @@ theme = gr.themes.Soft(
      )
  )

- with gr.Blocks(theme=theme, css=css) as iface:
-     gr.Markdown("# Generate an `llms.txt` file for GenAI Search and Agents")

-     with gr.Tab("Website URL Processing"):
          url_input = gr.Textbox(
-             label="Enter the home page of a website:",
-             placeholder="example: https://example.com",
-             lines=1,
          )
          link_types = gr.Dropdown(
-             label="Select types of links to extract (leave empty to get all links)",
-             choices=["<header> links", "<nav> links", "<footer> links", "All links"],
              multiselect=True,
-             value=["All links"]
          )
-         generate_btn = gr.Button("Submit", variant="primary", elem_classes=["primary-btn"])
-         output = gr.Textbox(
-             label="Generated llms.txt Content",
              lines=20,
-             show_copy_button=True,
-             container=True,
          )
-         status = gr.Textbox(label="Status", interactive=False)

-         generate_btn.click(
-             fn=process_url,
              inputs=[url_input, link_types],
-             outputs=[output, status],
          )

-     with gr.Tab("File to Markdown Converter"):
-         file_input = gr.File(label="Upload a file (e.g., PDF, DOCX, PPTX, etc.)")
-         convert_btn = gr.Button("Convert to Markdown", variant="primary", elem_classes=["primary-btn"])
          file_output = gr.Textbox(
-             label="Converted Markdown (llms.txt content)",
              lines=20,
-             show_copy_button=True,
-             container=True,
          )
-         file_status = gr.Textbox(label="Status", interactive=False)

-         convert_btn.click(
-             fn=process_file,
              inputs=[file_input],
-             outputs=[file_output, file_status],
          )
-
  if __name__ == "__main__":
      iface.launch()
 
  from secrets import token_hex
  import logging
  import os
+ from markitdown import MarkItDown
+ from typing import Tuple, List, Optional
+ import validators

  # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
  logger = logging.getLogger(__name__)

+ # Initialize MarkItDown
  md_converter = MarkItDown()

+ def validate_url(url: str) -> Tuple[bool, str]:
+     """Validate URL format and accessibility."""
+     if not url:
+         return False, "URL is required"
+
+     if not url.startswith(('http://', 'https://')):
+         url = 'https://' + url
+
+     if not validators.url(url):
+         return False, "Invalid URL format"
+
+     return True, url
+
+ def safe_crawl(url: str, output_file: str) -> bool:
+     """Safely perform a web crawl with timeout and error handling."""
      try:
+         adv.crawl(
+             url,
+             output_file,
+             follow_links=False,
+             custom_settings={
+                 'CLOSESPIDER_TIMEOUT': 30,
+                 'ROBOTSTXT_OBEY': True,
+                 'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
+                 'USER_AGENT': 'Mozilla/5.0 (compatible; LLMContentBot/1.0)',
+                 'DOWNLOAD_TIMEOUT': 10
+             }
+         )
          return True
      except Exception as e:
+         logger.error(f"Crawl error for {url}: {str(e)}")
          return False

+ def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
+     """Process links based on selected types."""
      try:
          all_links = []

+         if "All links" in link_types or not link_types:
+             link_df = adv.crawlytics.links(crawl_df)
+             for link, text in link_df[['link', 'text']].dropna().values:
+                 if text := text.strip():
+                     text = re.sub(r'[\n\s]+', ' ', text)
+                     all_links.append(f"## {text}\n[{text}]({link})")
+         else:
+             for link_type in link_types:
+                 type_match = re.findall(r"header|footer|nav", link_type.lower())
+                 if type_match:
+                     col_prefix = type_match[0]
+                     urls = crawl_df[f'{col_prefix}_links_url'].iloc[0]
+                     texts = crawl_df[f'{col_prefix}_links_text'].iloc[0]
+
+                     if urls and texts:
+                         urls = urls.split('@@')
+                         texts = texts.split('@@')
+
+                         for url, text in zip(urls, texts):
+                             if text := text.strip():
+                                 text = re.sub(r'[\n\s]+', ' ', text)
+                                 all_links.append(f"## {text}\n[{text}]({url})")

          return "\n\n".join(all_links)
      except Exception as e:
+         logger.error(f"Link processing error: {str(e)}")
          return ""

+ def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
+     """Process website URL and generate markdown content."""
+     valid, result = validate_url(url)
+     if not valid:
+         return "", result
+
+     url = result
+     output_file = f"crawl_{token_hex(6)}.jsonl"

      try:
+         if not safe_crawl(url, output_file):
+             return "", "Crawl failed or timed out"
+
+         crawl_df = pd.read_json(output_file, lines=True)
          if crawl_df.empty:
+             return "", "No data found for the URL"
+
+         # Extract title and description
+         title = crawl_df['title'].iloc[0] if 'title' in crawl_df.columns else "Untitled"
+         meta_desc = crawl_df['meta_desc'].iloc[0] if 'meta_desc' in crawl_df.columns else ""
+
+         # Process links
+         links_content = process_links(crawl_df, link_types)
+
+         # Generate final markdown
+         content = f"# {title}\n\n"
+         if meta_desc:
+             content += f"> {meta_desc}\n\n"
+         content += links_content
+
+         return content, f"Successfully processed {url}"

      except Exception as e:
+         logger.error(f"Error processing {url}: {str(e)}")
          return "", f"Error: {str(e)}"
+     finally:
+         if os.path.exists(output_file):
+             os.remove(output_file)
+
+ def process_file(file: gr.File) -> Tuple[str, str]:
+     """Convert uploaded file to markdown."""
+     if not file:
+         return "", "No file uploaded"
+
+     supported_extensions = {'.pdf', '.docx', '.pptx', '.xlsx', '.html', '.txt'}
+     file_ext = os.path.splitext(file.name)[1].lower()
+
+     if file_ext not in supported_extensions:
+         return "", f"Unsupported file type: {file_ext}"
+
      try:
          result = md_converter.convert(file.name)
+         return result.text_content, "File processed successfully"
      except Exception as e:
+         logger.error(f"File processing error: {str(e)}")
          return "", f"Error processing file: {str(e)}"

  # Custom CSS for styling
 
      )
  )

+ # Create interface
+ with gr.Blocks(theme=gr.themes.Soft(), css=css) as iface:
+     gr.Markdown("# LLMs.txt Generator")

+     with gr.Tab("Website URL"):
          url_input = gr.Textbox(
+             label="Website URL",
+             placeholder="example.com"
          )
          link_types = gr.Dropdown(
+             choices=["All links", "<header> links", "<nav> links", "<footer> links"],
              multiselect=True,
+             value=["All links"],
+             label="Link Types to Extract"
          )
+         url_button = gr.Button("Process URL", variant="primary")
+         url_output = gr.Textbox(
+             label="Generated Content",
              lines=20,
+             show_copy_button=True
          )
+         url_status = gr.Textbox(label="Status")

+         url_button.click(
+             process_url,
              inputs=[url_input, link_types],
+             outputs=[url_output, url_status]
          )

+     with gr.Tab("File Converter"):
+         file_input = gr.File(label="Upload Document")
+         file_button = gr.Button("Convert to Markdown", variant="primary")
          file_output = gr.Textbox(
+             label="Converted Content",
              lines=20,
+             show_copy_button=True
          )
+         file_status = gr.Textbox(label="Status")

+         file_button.click(
+             process_file,
              inputs=[file_input],
+             outputs=[file_output, file_status]
          )
+
  if __name__ == "__main__":
      iface.launch()
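
A quick way to exercise the new helpers outside the Gradio UI (a minimal sketch, not part of the commit; it assumes app.py and its dependencies, including advertools, pandas, gradio, markitdown and validators, are importable, and that the test URL is reachable):

# smoke_test.py (hypothetical) -- run from the same directory as app.py
from app import validate_url, process_url

ok, normalized = validate_url("example.com")
print(ok, normalized)            # expected: True https://example.com

content, status = process_url("example.com", ["All links"])
print(status)                    # e.g. "Successfully processed https://example.com"
print(content[:300])             # beginning of the generated llms.txt content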