Nymbo committed on
Commit 7c1a6bf · verified · Parent: ed31cb7

lots of web search improvements

Files changed (1)
  1. app.py +184 -42
app.py CHANGED
@@ -13,19 +13,23 @@ import json
13
  import sys
14
  import os
15
  import random
 
 
16
  from io import StringIO
17
  from typing import List, Dict, Tuple, Annotated
 
 
18
 
19
  import gradio as gr
20
  import requests
 
21
  from bs4 import BeautifulSoup
22
- from markdownify import markdownify as md
23
- from readability import Document
24
  from urllib.parse import urljoin, urldefrag, urlparse
25
- from duckduckgo_search import DDGS
26
  from PIL import Image
27
  from huggingface_hub import InferenceClient
28
- import time
29
 
30
  # Optional imports for Kokoro TTS (loaded lazily)
31
  import numpy as np
@@ -40,6 +44,94 @@ except Exception: # pragma: no cover - optional dependency
40
  KPipeline = None # type: ignore
41
 
42
43
  # ==============================
44
  # Fetch: HTTP + extraction utils
45
  # ==============================
@@ -47,14 +139,18 @@ except Exception: # pragma: no cover - optional dependency
47
  def _http_get(url: str) -> requests.Response:
48
  """
49
  Download the page politely with a short timeout and realistic headers.
 
50
  (Layman's terms: grab the web page like a normal browser would, but quickly.)
51
  """
52
  headers = {
53
- "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
54
  "Accept-Language": "en-US,en;q=0.9",
55
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
56
  }
57
- return requests.get(url, headers=headers, timeout=15)
58
 
59
 
60
  def _normalize_whitespace(text: str) -> str:
@@ -160,8 +256,8 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
160
  # Parse simplified HTML
161
  s = BeautifulSoup(readable_html, "lxml")
162
 
163
- # Remove noisy tags
164
- for sel in ["script", "style", "noscript", "iframe", "svg"]:
165
  for tag in s.select(sel):
166
  tag.decompose()
167
 
@@ -357,17 +453,30 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
357
  try:
358
  resp = _http_get(url)
359
  resp.raise_for_status()
360
  except requests.exceptions.RequestException as e:
361
- return f"An error occurred: {e}"
 
 
362
 
363
  final_url = str(resp.url)
364
  ctype = resp.headers.get("Content-Type", "")
365
  if "html" not in ctype.lower():
366
- return f"Unsupported content type for extraction: {ctype or 'unknown'}"
367
 
368
- # Decode to text
369
- resp.encoding = resp.encoding or resp.apparent_encoding
370
- html = resp.text
371
 
372
  # Full-page soup for metadata (and potential Markdown conversion)
373
  full_soup = BeautifulSoup(html, "lxml")
@@ -418,10 +527,11 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
418
  max_snippet_chars: Annotated[int, "Character cap applied to each snippet when included."] = 80,
419
  dedupe_domains: Annotated[bool, "If true, only the first result from each domain is kept."] = True,
420
  title_chars: Annotated[int, "Character cap applied to titles."] = 80,
 
421
  ) -> str:
422
  """
423
- Run a DuckDuckGo search and return ultra-compact JSONL with short keys to
424
- minimize tokens.
425
 
426
  Args:
427
  query: The search query (supports operators like site:, quotes, OR).
@@ -430,24 +540,36 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
430
  max_snippet_chars: Character cap applied to each snippet when included.
431
  dedupe_domains: If true, only the first result from each domain is kept.
432
  title_chars: Character cap applied to titles.
 
433
 
434
  Returns:
435
- str: Newline-delimited JSON (JSONL). Each line has:
436
- {"t": "title", "u": "url"[, "s": "snippet"]}
437
  """
438
  if not query or not query.strip():
439
- return ""
440
 
441
  try:
442
  with DDGS() as ddgs:
443
  raw = ddgs.text(query, max_results=max_results)
444
  except Exception as e:
445
- return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))
446
 
447
  seen_domains = set()
448
- lines: List[str] = []
449
 
450
- for r in raw or []:
451
  title = _shorten((r.get("title") or "").strip(), title_chars)
452
  url = (r.get("href") or r.get("link") or "").strip()
453
  body = (r.get("body") or r.get("snippet") or "").strip()
@@ -461,16 +583,35 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
461
  continue
462
  seen_domains.add(dom)
463
 
464
- obj = {"t": title or _domain_of(url), "u": url}
465
-
466
- if include_snippets and body:
467
- obj["s"] = _shorten(body, max_snippet_chars)
468
-
469
- # Emit most compact JSON possible (no spaces)
470
- lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
471
-
472
- # Join as JSONL (each result on its own line)
473
- return "\n".join(lines)
474
 
475
 
476
  # ======================================
@@ -749,7 +890,7 @@ fetch_interface = gr.Interface(
749
  "'Full-page Markdown (Content Scraper mode)' option to return the page "
750
  "converted to Markdown."
751
  ),
752
- allow_flagging="never",
753
  )
754
 
755
  # --- Concise DDG tab (JSONL with short keys, minimal tokens) ---
@@ -762,18 +903,19 @@ concise_interface = gr.Interface(
762
  gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
763
  gr.Checkbox(value=True, label="Dedupe by domain"),
764
  gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
 
765
  ],
766
- outputs=gr.Textbox(label="Results (JSONL)", interactive=False),
767
  title="DuckDuckGo Search",
768
  description=(
769
- "<div style=\"text-align:center\">Very concise web search to avoid unnecessary context. Emits JSONL with short keys (t,u[,s]). Defaults avoid snippets and duplicate domains.</div>"
770
  ),
771
  api_description=(
772
- "Run a DuckDuckGo search and return newline-delimited JSON with short keys: "
773
- "t=title, u=url, optional s=snippet. Options control result count, "
774
- "snippet inclusion and length, domain deduping, and title length."
775
  ),
776
- allow_flagging="never",
777
  submit_btn="Search",
778
  )
779
 
@@ -795,7 +937,7 @@ code_interface = gr.Interface(
795
  "Returns:\n"
796
  "- string: Combined stdout produced by the code, or the exception text if execution failed."
797
  ),
798
- allow_flagging="never",
799
  )
800
 
801
  CSS_STYLES = """
@@ -860,7 +1002,7 @@ kokoro_interface = gr.Interface(
860
  "Can generate audio of unlimited length by processing all text segments. "
861
  "Return the generated media to the user in this format `![Alt text](URL)`"
862
  ),
863
- allow_flagging="never",
864
  )
865
 
866
  # ==========================
@@ -986,7 +1128,7 @@ image_generation_interface = gr.Interface(
986
  "sampler (str, label only), seed (int, -1=random), width/height (int, 64–1216). Returns a PIL.Image. "
987
  "Return the generated media to the user in this format `![Alt text](URL)`"
988
  ),
989
- allow_flagging="never",
990
  )
991
 
992
  # ==========================
@@ -1163,7 +1305,7 @@ video_generation_interface = gr.Interface(
1163
  "Parameters: prompt (str), model_id (str), negative_prompt (str), steps (int), cfg_scale (float), seed (int), "
1164
  "width/height (int), fps (int), duration (float). Return the generated media to the user in this format `![Alt text](URL)`"
1165
  ),
1166
- allow_flagging="never",
1167
  )
1168
 
1169
  # Build tabbed app; disable Image/Video tools if no HF token is present
 
13
  import sys
14
  import os
15
  import random
16
+ import asyncio
17
+ import time
18
  from io import StringIO
19
  from typing import List, Dict, Tuple, Annotated
20
+ from datetime import datetime, timedelta
21
+ from dataclasses import dataclass
22
 
23
  import gradio as gr
24
  import requests
25
+ import httpx
26
  from bs4 import BeautifulSoup
27
+ from markdownify import markdownify as md # type: ignore
28
+ from readability import Document # type: ignore
29
  from urllib.parse import urljoin, urldefrag, urlparse
30
+ from ddgs import DDGS
31
  from PIL import Image
32
  from huggingface_hub import InferenceClient
 
33
 
34
  # Optional imports for Kokoro TTS (loaded lazily)
35
  import numpy as np
 
44
  KPipeline = None # type: ignore
45
 
46
 
47
+ # ==============================
48
+ # Rate Limiting and HTTP Utils
49
+ # ==============================
50
+
51
+ @dataclass
52
+ class SearchResult:
53
+ title: str
54
+ link: str
55
+ snippet: str
56
+ position: int
57
+
58
+
59
+ class RateLimiter:
60
+ """Rate limiter to prevent being blocked by services"""
61
+ def __init__(self, requests_per_minute: int = 30):
62
+ self.requests_per_minute = requests_per_minute
63
+ self.requests = []
64
+
65
+ async def acquire(self):
66
+ now = datetime.now()
67
+ # Remove requests older than 1 minute
68
+ self.requests = [
69
+ req for req in self.requests if now - req < timedelta(minutes=1)
70
+ ]
71
+
72
+ if len(self.requests) >= self.requests_per_minute:
73
+ # Wait until we can make another request
74
+ wait_time = 60 - (now - self.requests[0]).total_seconds()
75
+ if wait_time > 0:
76
+ await asyncio.sleep(wait_time)
77
+
78
+ self.requests.append(now)
79
+
80
+
81
+ class ImprovedWebFetcher:
82
+ """Improved web fetcher with rate limiting and async support"""
83
+
84
+ def __init__(self):
85
+ self.rate_limiter = RateLimiter(requests_per_minute=20)
86
+ self.headers = {
87
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
88
+ "Accept-Language": "en-US,en;q=0.9",
89
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
90
+ }
91
+
92
+ async def fetch_async(self, url: str) -> Tuple[str, str, str]:
93
+ """
94
+ Fetch URL with rate limiting and proper error handling
95
+ Returns: (html_content, final_url, error_message)
96
+ """
97
+ await self.rate_limiter.acquire()
98
+
99
+ try:
100
+ async with httpx.AsyncClient() as client:
101
+ response = await client.get(
102
+ url,
103
+ headers=self.headers,
104
+ follow_redirects=True,
105
+ timeout=30.0,
106
+ )
107
+ response.raise_for_status()
108
+ return response.text, str(response.url), ""
109
+
110
+ except httpx.TimeoutException:
111
+ return "", "", f"Request timed out for URL: {url}"
112
+ except httpx.HTTPError as e:
113
+ return "", "", f"HTTP error occurred: {str(e)}"
114
+ except Exception as e:
115
+ return "", "", f"Unexpected error: {str(e)}"
116
+
117
+ # Global instances
118
+ _web_fetcher = ImprovedWebFetcher()
119
+ _search_rate_limiter = RateLimiter(requests_per_minute=30)
120
+
121
+ # Simple sync rate limiting for backwards compatibility
122
+ _last_request_time = 0
123
+ _min_request_interval = 2 # seconds between requests
124
+
125
+ def _apply_rate_limit():
126
+ """Simple synchronous rate limiting"""
127
+ global _last_request_time
128
+ current_time = time.time()
129
+ elapsed = current_time - _last_request_time
130
+ if elapsed < _min_request_interval:
131
+ time.sleep(_min_request_interval - elapsed)
132
+ _last_request_time = time.time()
133
+
134
+
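Side note, not part of the commit: a minimal sketch of how the new async utilities above could be exercised, assuming ImprovedWebFetcher is importable from app.py (hypothetical import path) and keeps the signature shown in this diff:

import asyncio
from app import ImprovedWebFetcher  # hypothetical import; adjust to the Space's module layout

async def main() -> None:
    fetcher = ImprovedWebFetcher()  # uses RateLimiter(requests_per_minute=20) internally, per the class above
    html, final_url, error = await fetcher.fetch_async("https://example.com")
    if error:
        print(f"Fetch failed: {error}")
    else:
        print(f"Fetched {len(html)} characters from {final_url}")

asyncio.run(main())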
135
  # ==============================
136
  # Fetch: HTTP + extraction utils
137
  # ==============================
 
139
  def _http_get(url: str) -> requests.Response:
140
  """
141
  Download the page politely with a short timeout and realistic headers.
142
+ Enhanced with better error handling, headers from ddg-search patterns, and rate limiting.
143
  (Layman's terms: grab the web page like a normal browser would, but quickly.)
144
  """
145
+ # Apply rate limiting to avoid being blocked
146
+ _apply_rate_limit()
147
+
148
  headers = {
149
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
150
  "Accept-Language": "en-US,en;q=0.9",
151
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
152
  }
153
+ return requests.get(url, headers=headers, timeout=30, allow_redirects=True)
154
 
155
 
156
  def _normalize_whitespace(text: str) -> str:
 
256
  # Parse simplified HTML
257
  s = BeautifulSoup(readable_html, "lxml")
258
 
259
+ # Remove noisy tags (improved from ddg-search patterns)
260
+ for sel in ["script", "style", "noscript", "iframe", "svg", "nav", "header", "footer", "aside"]:
261
  for tag in s.select(sel):
262
  tag.decompose()
263
 
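Side note, not part of the commit: a self-contained sketch of the tag-stripping step above on toy HTML (html.parser is used so it runs without lxml installed; app.py itself parses with lxml):

from bs4 import BeautifulSoup

toy_html = "<html><body><nav>menu</nav><p>Article text.</p><footer>site footer</footer></body></html>"
s = BeautifulSoup(toy_html, "html.parser")
for sel in ["script", "style", "noscript", "iframe", "svg", "nav", "header", "footer", "aside"]:
    for tag in s.select(sel):
        tag.decompose()  # drop boilerplate elements in place
print(s.get_text(" ", strip=True))  # prints: Article text.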
 
453
  try:
454
  resp = _http_get(url)
455
  resp.raise_for_status()
456
+ except requests.exceptions.Timeout:
457
+ return f"Error: Request timed out while fetching {url}. Please try again or check if the website is accessible."
458
+ except requests.exceptions.ConnectionError:
459
+ return f"Error: Could not connect to {url}. Please check the URL and your internet connection."
460
+ except requests.exceptions.HTTPError as e:
461
+ return f"Error: HTTP {e.response.status_code} - {e.response.reason} when accessing {url}"
462
  except requests.exceptions.RequestException as e:
463
+ return f"Error: Failed to fetch webpage ({str(e)}). Please check the URL and try again."
464
+ except Exception as e:
465
+ return f"Error: An unexpected error occurred while fetching the webpage ({str(e)})"
466
 
467
  final_url = str(resp.url)
468
  ctype = resp.headers.get("Content-Type", "")
469
  if "html" not in ctype.lower():
470
+ return f"Unsupported content type for extraction: {ctype or 'unknown'}. This tool only works with HTML pages."
471
 
472
+ # Decode to text with better encoding detection
473
+ if not resp.encoding:
474
+ resp.encoding = resp.apparent_encoding or 'utf-8'
475
+ try:
476
+ html = resp.text
477
+ except UnicodeDecodeError:
478
+ # Fallback encoding handling
479
+ html = resp.content.decode('utf-8', errors='replace')
480
 
481
  # Full-page soup for metadata (and potential Markdown conversion)
482
  full_soup = BeautifulSoup(html, "lxml")
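Side note, not part of the commit: the encoding fallback above is a standard requests pattern; a standalone sketch of the same idea (example.com is a placeholder URL):

import requests

resp = requests.get("https://example.com", timeout=30)
if not resp.encoding:
    # Let requests guess the charset from the body, falling back to UTF-8
    resp.encoding = resp.apparent_encoding or "utf-8"
try:
    html = resp.text
except UnicodeDecodeError:
    # Last resort: decode the raw bytes, replacing undecodable sequences
    html = resp.content.decode("utf-8", errors="replace")
print(html[:80])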
 
527
  max_snippet_chars: Annotated[int, "Character cap applied to each snippet when included."] = 80,
528
  dedupe_domains: Annotated[bool, "If true, only the first result from each domain is kept."] = True,
529
  title_chars: Annotated[int, "Character cap applied to titles."] = 80,
530
+ output_format: Annotated[str, "Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text."] = "jsonl",
531
  ) -> str:
532
  """
533
+ Run a DuckDuckGo search and return results in ultra-compact JSONL format
534
+ or readable format optimized for LLM consumption.
535
 
536
  Args:
537
  query: The search query (supports operators like site:, quotes, OR).
 
540
  max_snippet_chars: Character cap applied to each snippet when included.
541
  dedupe_domains: If true, only the first result from each domain is kept.
542
  title_chars: Character cap applied to titles.
543
+ output_format: Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text.
544
 
545
  Returns:
546
+ str: Either JSONL with {"t": "title", "u": "url"[, "s": "snippet"]} per line
547
+ or readable text format optimized for LLM processing.
548
  """
549
  if not query or not query.strip():
550
+ return "No search query provided." if output_format == "readable" else ""
551
+
552
+ # Apply rate limiting to avoid being blocked by DuckDuckGo
553
+ _apply_rate_limit()
554
 
555
  try:
556
  with DDGS() as ddgs:
557
  raw = ddgs.text(query, max_results=max_results)
558
  except Exception as e:
559
+ error_msg = f"Search failed: {str(e)[:120]}"
560
+ if output_format == "readable":
561
+ return f"Error: {error_msg}. This could be due to DuckDuckGo's bot detection or network issues. Please try rephrasing your search or try again in a few minutes."
562
+ return json.dumps({"error": error_msg}, ensure_ascii=False, separators=(",", ":"))
563
+
564
+ if not raw:
565
+ if output_format == "readable":
566
+ return f"No results found for query: {query}. This could be due to DuckDuckGo's bot detection or the query returned no matches. Please try rephrasing your search or try again in a few minutes."
567
+ return ""
568
 
569
  seen_domains = set()
570
+ results = []
571
 
572
+ for i, r in enumerate(raw or []):
573
  title = _shorten((r.get("title") or "").strip(), title_chars)
574
  url = (r.get("href") or r.get("link") or "").strip()
575
  body = (r.get("body") or r.get("snippet") or "").strip()
 
583
  continue
584
  seen_domains.add(dom)
585
 
586
+ if output_format == "readable":
587
+ results.append({
588
+ "position": len(results) + 1,
589
+ "title": title or _domain_of(url),
590
+ "url": url,
591
+ "snippet": _shorten(body, max_snippet_chars) if include_snippets and body else ""
592
+ })
593
+ else:
594
+ obj = {"t": title or _domain_of(url), "u": url}
595
+ if include_snippets and body:
596
+ obj["s"] = _shorten(body, max_snippet_chars)
597
+ results.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
598
+
599
+ if output_format == "readable":
600
+ if not results:
601
+ return f"No results found for query: {query}"
602
+
603
+ output = [f"Found {len(results)} search results for: {query}\n"]
604
+ for result in results:
605
+ output.append(f"{result['position']}. {result['title']}")
606
+ output.append(f" URL: {result['url']}")
607
+ if result['snippet']:
608
+ output.append(f" Summary: {result['snippet']}")
609
+ output.append("") # Empty line between results
610
+
611
+ return "\n".join(output).rstrip()
612
+ else:
613
+ # Return JSONL format (original behavior)
614
+ return "\n".join(results)
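Side note, not part of the commit: an illustrative call sketch for the two output modes, assuming Search_DuckDuckGo is importable from app.py (hypothetical) and keeps the signature above:

from app import Search_DuckDuckGo  # hypothetical import; adjust to the Space's module layout

# Compact JSONL: one {"t": title, "u": url[, "s": snippet]} object per line
print(Search_DuckDuckGo("python asyncio tutorial", max_results=3, output_format="jsonl"))

# Readable mode: a "Found N search results for: ..." header, then numbered title/URL/Summary blocks
print(Search_DuckDuckGo("python asyncio tutorial", max_results=3, include_snippets=True, output_format="readable"))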
615
 
616
 
617
  # ======================================
 
890
  "'Full-page Markdown (Content Scraper mode)' option to return the page "
891
  "converted to Markdown."
892
  ),
893
+ flagging_mode="never",
894
  )
895
 
896
  # --- Concise DDG tab (JSONL with short keys, minimal tokens) ---
 
903
  gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
904
  gr.Checkbox(value=True, label="Dedupe by domain"),
905
  gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
906
+ gr.Dropdown(label="Output Format", choices=["jsonl", "readable"], value="jsonl", info="JSONL for compact output, Readable for LLM-friendly format"),
907
  ],
908
+ outputs=gr.Textbox(label="Search Results", interactive=False),
909
  title="DuckDuckGo Search",
910
  description=(
911
+ "<div style=\"text-align:center\">Enhanced web search with improved error handling and LLM-friendly output options. Choose JSONL for compact results or Readable format for better LLM processing.</div>"
912
  ),
913
  api_description=(
914
+ "Run a DuckDuckGo search with enhanced error handling and multiple output formats. "
915
+ "Returns either compact JSONL (t=title, u=url, optional s=snippet) or "
916
+ "readable text format optimized for LLM consumption with better error messages."
917
  ),
918
+ flagging_mode="never",
919
  submit_btn="Search",
920
  )
921
 
 
937
  "Returns:\n"
938
  "- string: Combined stdout produced by the code, or the exception text if execution failed."
939
  ),
940
+ flagging_mode="never",
941
  )
942
 
943
  CSS_STYLES = """
 
1002
  "Can generate audio of unlimited length by processing all text segments. "
1003
  "Return the generated media to the user in this format `![Alt text](URL)`"
1004
  ),
1005
+ flagging_mode="never",
1006
  )
1007
 
1008
  # ==========================
 
1128
  "sampler (str, label only), seed (int, -1=random), width/height (int, 64–1216). Returns a PIL.Image. "
1129
  "Return the generated media to the user in this format `![Alt text](URL)`"
1130
  ),
1131
+ flagging_mode="never",
1132
  )
1133
 
1134
  # ==========================
 
1305
  "Parameters: prompt (str), model_id (str), negative_prompt (str), steps (int), cfg_scale (float), seed (int), "
1306
  "width/height (int), fps (int), duration (float). Return the generated media to the user in this format `![Alt text](URL)`"
1307
  ),
1308
+ flagging_mode="never",
1309
  )
1310
 
1311
  # Build tabbed app; disable Image/Video tools if no HF token is present
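Side note, not part of the commit: the allow_flagging → flagging_mode swaps throughout this diff follow Gradio's rename of that gr.Interface argument; a minimal sketch with a hypothetical echo function:

import gradio as gr

def echo(text: str) -> str:
    return text

demo = gr.Interface(
    fn=echo,
    inputs=gr.Textbox(label="Input"),
    outputs=gr.Textbox(label="Output"),
    flagging_mode="never",  # newer spelling of allow_flagging="never"
)

if __name__ == "__main__":
    demo.launch()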