Nymbo committed on
Commit 7c1a6bf · verified · Parent: ed31cb7

lots of web search improvements

Files changed (1)
  1. app.py +184 -42
app.py CHANGED
@@ -13,19 +13,23 @@ import json
13
  import sys
14
  import os
15
  import random
 
 
16
  from io import StringIO
17
  from typing import List, Dict, Tuple, Annotated
 
 
18
 
19
  import gradio as gr
20
  import requests
 
21
  from bs4 import BeautifulSoup
22
- from markdownify import markdownify as md
23
- from readability import Document
24
  from urllib.parse import urljoin, urldefrag, urlparse
25
- from duckduckgo_search import DDGS
26
  from PIL import Image
27
  from huggingface_hub import InferenceClient
28
- import time
29
 
30
  # Optional imports for Kokoro TTS (loaded lazily)
31
  import numpy as np
@@ -40,6 +44,94 @@ except Exception: # pragma: no cover - optional dependency
40
  KPipeline = None # type: ignore
41
 
42
43
  # ==============================
44
  # Fetch: HTTP + extraction utils
45
  # ==============================
@@ -47,14 +139,18 @@ except Exception: # pragma: no cover - optional dependency
47
  def _http_get(url: str) -> requests.Response:
48
  """
49
  Download the page politely with a short timeout and realistic headers.
 
50
  (Layman's terms: grab the web page like a normal browser would, but quickly.)
51
  """
52
  headers = {
53
- "User-Agent": "Mozilla/5.0 (compatible; WebMCP/1.0; +https://example.com)",
54
  "Accept-Language": "en-US,en;q=0.9",
55
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
56
  }
57
- return requests.get(url, headers=headers, timeout=15)
58
 
59
 
60
  def _normalize_whitespace(text: str) -> str:
@@ -160,8 +256,8 @@ def _extract_main_text(html: str) -> Tuple[str, BeautifulSoup]:
160
  # Parse simplified HTML
161
  s = BeautifulSoup(readable_html, "lxml")
162
 
163
- # Remove noisy tags
164
- for sel in ["script", "style", "noscript", "iframe", "svg"]:
165
  for tag in s.select(sel):
166
  tag.decompose()
167
 
@@ -357,17 +453,30 @@ def Fetch_Webpage( # <-- MCP tool #1 (Fetch)
357
  try:
358
  resp = _http_get(url)
359
  resp.raise_for_status()
360
  except requests.exceptions.RequestException as e:
361
- return f"An error occurred: {e}"
 
 
362
 
363
  final_url = str(resp.url)
364
  ctype = resp.headers.get("Content-Type", "")
365
  if "html" not in ctype.lower():
366
- return f"Unsupported content type for extraction: {ctype or 'unknown'}"
367
 
368
- # Decode to text
369
- resp.encoding = resp.encoding or resp.apparent_encoding
370
- html = resp.text
371
 
372
  # Full-page soup for metadata (and potential Markdown conversion)
373
  full_soup = BeautifulSoup(html, "lxml")
@@ -418,10 +527,11 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
418
  max_snippet_chars: Annotated[int, "Character cap applied to each snippet when included."] = 80,
419
  dedupe_domains: Annotated[bool, "If true, only the first result from each domain is kept."] = True,
420
  title_chars: Annotated[int, "Character cap applied to titles."] = 80,
 
421
  ) -> str:
422
  """
423
- Run a DuckDuckGo search and return ultra-compact JSONL with short keys to
424
- minimize tokens.
425
 
426
  Args:
427
  query: The search query (supports operators like site:, quotes, OR).
@@ -430,24 +540,36 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
430
  max_snippet_chars: Character cap applied to each snippet when included.
431
  dedupe_domains: If true, only the first result from each domain is kept.
432
  title_chars: Character cap applied to titles.
 
433
 
434
  Returns:
435
- str: Newline-delimited JSON (JSONL). Each line has:
436
- {"t": "title", "u": "url"[, "s": "snippet"]}
437
  """
438
  if not query or not query.strip():
439
- return ""
440
 
441
  try:
442
  with DDGS() as ddgs:
443
  raw = ddgs.text(query, max_results=max_results)
444
  except Exception as e:
445
- return json.dumps({"error": str(e)[:120]}, ensure_ascii=False, separators=(",", ":"))
446
 
447
  seen_domains = set()
448
- lines: List[str] = []
449
 
450
- for r in raw or []:
451
  title = _shorten((r.get("title") or "").strip(), title_chars)
452
  url = (r.get("href") or r.get("link") or "").strip()
453
  body = (r.get("body") or r.get("snippet") or "").strip()
@@ -461,16 +583,35 @@ def Search_DuckDuckGo( # <-- MCP tool #2 (DDG Search)
461
  continue
462
  seen_domains.add(dom)
463
 
464
- obj = {"t": title or _domain_of(url), "u": url}
465
-
466
- if include_snippets and body:
467
- obj["s"] = _shorten(body, max_snippet_chars)
468
-
469
- # Emit most compact JSON possible (no spaces)
470
- lines.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
471
-
472
- # Join as JSONL (each result on its own line)
473
- return "\n".join(lines)
474
 
475
 
476
  # ======================================
@@ -749,7 +890,7 @@ fetch_interface = gr.Interface(
749
  "'Full-page Markdown (Content Scraper mode)' option to return the page "
750
  "converted to Markdown."
751
  ),
752
- allow_flagging="never",
753
  )
754
 
755
  # --- Concise DDG tab (JSONL with short keys, minimal tokens) ---
@@ -762,18 +903,19 @@ concise_interface = gr.Interface(
762
  gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
763
  gr.Checkbox(value=True, label="Dedupe by domain"),
764
  gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
 
765
  ],
766
- outputs=gr.Textbox(label="Results (JSONL)", interactive=False),
767
  title="DuckDuckGo Search",
768
  description=(
769
- "<div style=\"text-align:center\">Very concise web search to avoid unnecessary context. Emits JSONL with short keys (t,u[,s]). Defaults avoid snippets and duplicate domains.</div>"
770
  ),
771
  api_description=(
772
- "Run a DuckDuckGo search and return newline-delimited JSON with short keys: "
773
- "t=title, u=url, optional s=snippet. Options control result count, "
774
- "snippet inclusion and length, domain deduping, and title length."
775
  ),
776
- allow_flagging="never",
777
  submit_btn="Search",
778
  )
779
 
@@ -795,7 +937,7 @@ code_interface = gr.Interface(
795
  "Returns:\n"
796
  "- string: Combined stdout produced by the code, or the exception text if execution failed."
797
  ),
798
- allow_flagging="never",
799
  )
800
 
801
  CSS_STYLES = """
@@ -860,7 +1002,7 @@ kokoro_interface = gr.Interface(
860
  "Can generate audio of unlimited length by processing all text segments. "
861
  "Return the generated media to the user in this format `![Alt text](URL)`"
862
  ),
863
- allow_flagging="never",
864
  )
865
 
866
  # ==========================
@@ -986,7 +1128,7 @@ image_generation_interface = gr.Interface(
986
  "sampler (str, label only), seed (int, -1=random), width/height (int, 64–1216). Returns a PIL.Image. "
987
  "Return the generated media to the user in this format `![Alt text](URL)`"
988
  ),
989
- allow_flagging="never",
990
  )
991
 
992
  # ==========================
@@ -1163,7 +1305,7 @@ video_generation_interface = gr.Interface(
1163
  "Parameters: prompt (str), model_id (str), negative_prompt (str), steps (int), cfg_scale (float), seed (int), "
1164
  "width/height (int), fps (int), duration (float). Return the generated media to the user in this format `![Alt text](URL)`"
1165
  ),
1166
- allow_flagging="never",
1167
  )
1168
 
1169
  # Build tabbed app; disable Image/Video tools if no HF token is present
 
13
  import sys
14
  import os
15
  import random
16
+ import asyncio
17
+ import time
18
  from io import StringIO
19
  from typing import List, Dict, Tuple, Annotated
20
+ from datetime import datetime, timedelta
21
+ from dataclasses import dataclass
22
 
23
  import gradio as gr
24
  import requests
25
+ import httpx
26
  from bs4 import BeautifulSoup
27
+ from markdownify import markdownify as md # type: ignore
28
+ from readability import Document # type: ignore
29
  from urllib.parse import urljoin, urldefrag, urlparse
30
+ from ddgs import DDGS
31
  from PIL import Image
32
  from huggingface_hub import InferenceClient
 
33
 
34
  # Optional imports for Kokoro TTS (loaded lazily)
35
  import numpy as np
 
44
  KPipeline = None # type: ignore
45
 
46
 
47
+ # ==============================
48
+ # Rate Limiting and HTTP Utils
49
+ # ==============================
50
+
51
+ @dataclass
52
+ class SearchResult:
53
+ title: str
54
+ link: str
55
+ snippet: str
56
+ position: int
57
+
58
+
59
+ class RateLimiter:
60
+ """Rate limiter to prevent being blocked by services"""
61
+ def __init__(self, requests_per_minute: int = 30):
62
+ self.requests_per_minute = requests_per_minute
63
+ self.requests = []
64
+
65
+ async def acquire(self):
66
+ now = datetime.now()
67
+ # Remove requests older than 1 minute
68
+ self.requests = [
69
+ req for req in self.requests if now - req < timedelta(minutes=1)
70
+ ]
71
+
72
+ if len(self.requests) >= self.requests_per_minute:
73
+ # Wait until we can make another request
74
+ wait_time = 60 - (now - self.requests[0]).total_seconds()
75
+ if wait_time > 0:
76
+ await asyncio.sleep(wait_time)
77
+
78
+ self.requests.append(now)
79
+
80
+
81
+ class ImprovedWebFetcher:
82
+ """Improved web fetcher with rate limiting and async support"""
83
+
84
+ def __init__(self):
85
+ self.rate_limiter = RateLimiter(requests_per_minute=20)
86
+ self.headers = {
87
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
88
+ "Accept-Language": "en-US,en;q=0.9",
89
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
90
+ }
91
+
92
+ async def fetch_async(self, url: str) -> Tuple[str, str, str]:
93
+ """
94
+ Fetch URL with rate limiting and proper error handling
95
+ Returns: (html_content, final_url, error_message)
96
+ """
97
+ await self.rate_limiter.acquire()
98
+
99
+ try:
100
+ async with httpx.AsyncClient() as client:
101
+ response = await client.get(
102
+ url,
103
+ headers=self.headers,
104
+ follow_redirects=True,
105
+ timeout=30.0,
106
+ )
107
+ response.raise_for_status()
108
+ return response.text, str(response.url), ""
109
+
110
+ except httpx.TimeoutException:
111
+ return "", "", f"Request timed out for URL: {url}"
112
+ except httpx.HTTPError as e:
113
+ return "", "", f"HTTP error occurred: {str(e)}"
114
+ except Exception as e:
115
+ return "", "", f"Unexpected error: {str(e)}"
116
+
117
+ # Global instances
118
+ _web_fetcher = ImprovedWebFetcher()
119
+ _search_rate_limiter = RateLimiter(requests_per_minute=30)
120
+
121
+ # Simple sync rate limiting for backwards compatibility
122
+ _last_request_time = 0
123
+ _min_request_interval = 2 # seconds between requests
124
+
125
+ def _apply_rate_limit():
126
+ """Simple synchronous rate limiting"""
127
+ global _last_request_time
128
+ current_time = time.time()
129
+ elapsed = current_time - _last_request_time
130
+ if elapsed < _min_request_interval:
131
+ time.sleep(_min_request_interval - elapsed)
132
+ _last_request_time = time.time()
133
+
134
+
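Side note, not part of the commit: a minimal sketch of how the new async utilities above could be exercised, assuming ImprovedWebFetcher is importable from app.py (hypothetical import path) and keeps the signature shown in this diff:

import asyncio
from app import ImprovedWebFetcher  # hypothetical import; adjust to the Space's module layout

async def main() -> None:
    fetcher = ImprovedWebFetcher()  # uses RateLimiter(requests_per_minute=20) internally, per the class above
    html, final_url, error = await fetcher.fetch_async("https://example.com")
    if error:
        print(f"Fetch failed: {error}")
    else:
        print(f"Fetched {len(html)} characters from {final_url}")

asyncio.run(main())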
135
  # ==============================
136
  # Fetch: HTTP + extraction utils
137
  # ==============================
 
139
  def _http_get(url: str) -> requests.Response:
140
  """
141
  Download the page politely with a short timeout and realistic headers.
142
+ Enhanced with better error handling, headers from ddg-search patterns, and rate limiting.
143
  (Layman's terms: grab the web page like a normal browser would, but quickly.)
144
  """
145
+ # Apply rate limiting to avoid being blocked
146
+ _apply_rate_limit()
147
+
148
  headers = {
149
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
150
  "Accept-Language": "en-US,en;q=0.9",
151
  "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
152
  }
153
+ return requests.get(url, headers=headers, timeout=30, allow_redirects=True)
154
 
155
 
156
  def _normalize_whitespace(text: str) -> str:
 
256
  # Parse simplified HTML
257
  s = BeautifulSoup(readable_html, "lxml")
258
 
259
+ # Remove noisy tags (improved from ddg-search patterns)
260
+ for sel in ["script", "style", "noscript", "iframe", "svg", "nav", "header", "footer", "aside"]:
261
  for tag in s.select(sel):
262
  tag.decompose()
263
 
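Side note, not part of the commit: a self-contained sketch of the tag-stripping step above on toy HTML (html.parser is used so it runs without lxml installed; app.py itself parses with lxml):

from bs4 import BeautifulSoup

toy_html = "<html><body><nav>menu</nav><p>Article text.</p><footer>site footer</footer></body></html>"
s = BeautifulSoup(toy_html, "html.parser")
for sel in ["script", "style", "noscript", "iframe", "svg", "nav", "header", "footer", "aside"]:
    for tag in s.select(sel):
        tag.decompose()  # drop boilerplate elements in place
print(s.get_text(" ", strip=True))  # prints: Article text.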
 
453
  try:
454
  resp = _http_get(url)
455
  resp.raise_for_status()
456
+ except requests.exceptions.Timeout:
457
+ return f"Error: Request timed out while fetching {url}. Please try again or check if the website is accessible."
458
+ except requests.exceptions.ConnectionError:
459
+ return f"Error: Could not connect to {url}. Please check the URL and your internet connection."
460
+ except requests.exceptions.HTTPError as e:
461
+ return f"Error: HTTP {e.response.status_code} - {e.response.reason} when accessing {url}"
462
  except requests.exceptions.RequestException as e:
463
+ return f"Error: Failed to fetch webpage ({str(e)}). Please check the URL and try again."
464
+ except Exception as e:
465
+ return f"Error: An unexpected error occurred while fetching the webpage ({str(e)})"
466
 
467
  final_url = str(resp.url)
468
  ctype = resp.headers.get("Content-Type", "")
469
  if "html" not in ctype.lower():
470
+ return f"Unsupported content type for extraction: {ctype or 'unknown'}. This tool only works with HTML pages."
471
 
472
+ # Decode to text with better encoding detection
473
+ if not resp.encoding:
474
+ resp.encoding = resp.apparent_encoding or 'utf-8'
475
+ try:
476
+ html = resp.text
477
+ except UnicodeDecodeError:
478
+ # Fallback encoding handling
479
+ html = resp.content.decode('utf-8', errors='replace')
480
 
481
  # Full-page soup for metadata (and potential Markdown conversion)
482
  full_soup = BeautifulSoup(html, "lxml")
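Side note, not part of the commit: the encoding fallback above is a standard requests pattern; a standalone sketch of the same idea (example.com is a placeholder URL):

import requests

resp = requests.get("https://example.com", timeout=30)
if not resp.encoding:
    # Let requests guess the charset from the body, falling back to UTF-8
    resp.encoding = resp.apparent_encoding or "utf-8"
try:
    html = resp.text
except UnicodeDecodeError:
    # Last resort: decode the raw bytes, replacing undecodable sequences
    html = resp.content.decode("utf-8", errors="replace")
print(html[:80])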
 
527
  max_snippet_chars: Annotated[int, "Character cap applied to each snippet when included."] = 80,
528
  dedupe_domains: Annotated[bool, "If true, only the first result from each domain is kept."] = True,
529
  title_chars: Annotated[int, "Character cap applied to titles."] = 80,
530
+ output_format: Annotated[str, "Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text."] = "jsonl",
531
  ) -> str:
532
  """
533
+ Run a DuckDuckGo search and return results in ultra-compact JSONL format
534
+ or readable format optimized for LLM consumption.
535
 
536
  Args:
537
  query: The search query (supports operators like site:, quotes, OR).
 
540
  max_snippet_chars: Character cap applied to each snippet when included.
541
  dedupe_domains: If true, only the first result from each domain is kept.
542
  title_chars: Character cap applied to titles.
543
+ output_format: Output format: 'jsonl' for compact JSON or 'readable' for LLM-friendly text.
544
 
545
  Returns:
546
+ str: Either JSONL with {"t": "title", "u": "url"[, "s": "snippet"]} per line
547
+ or readable text format optimized for LLM processing.
548
  """
549
  if not query or not query.strip():
550
+ return "No search query provided." if output_format == "readable" else ""
551
+
552
+ # Apply rate limiting to avoid being blocked by DuckDuckGo
553
+ _apply_rate_limit()
554
 
555
  try:
556
  with DDGS() as ddgs:
557
  raw = ddgs.text(query, max_results=max_results)
558
  except Exception as e:
559
+ error_msg = f"Search failed: {str(e)[:120]}"
560
+ if output_format == "readable":
561
+ return f"Error: {error_msg}. This could be due to DuckDuckGo's bot detection or network issues. Please try rephrasing your search or try again in a few minutes."
562
+ return json.dumps({"error": error_msg}, ensure_ascii=False, separators=(",", ":"))
563
+
564
+ if not raw:
565
+ if output_format == "readable":
566
+ return f"No results found for query: {query}. This could be due to DuckDuckGo's bot detection or the query returned no matches. Please try rephrasing your search or try again in a few minutes."
567
+ return ""
568
 
569
  seen_domains = set()
570
+ results = []
571
 
572
+ for i, r in enumerate(raw or []):
573
  title = _shorten((r.get("title") or "").strip(), title_chars)
574
  url = (r.get("href") or r.get("link") or "").strip()
575
  body = (r.get("body") or r.get("snippet") or "").strip()
 
583
  continue
584
  seen_domains.add(dom)
585
 
586
+ if output_format == "readable":
587
+ results.append({
588
+ "position": len(results) + 1,
589
+ "title": title or _domain_of(url),
590
+ "url": url,
591
+ "snippet": _shorten(body, max_snippet_chars) if include_snippets and body else ""
592
+ })
593
+ else:
594
+ obj = {"t": title or _domain_of(url), "u": url}
595
+ if include_snippets and body:
596
+ obj["s"] = _shorten(body, max_snippet_chars)
597
+ results.append(json.dumps(obj, ensure_ascii=False, separators=(",", ":")))
598
+
599
+ if output_format == "readable":
600
+ if not results:
601
+ return f"No results found for query: {query}"
602
+
603
+ output = [f"Found {len(results)} search results for: {query}\n"]
604
+ for result in results:
605
+ output.append(f"{result['position']}. {result['title']}")
606
+ output.append(f" URL: {result['url']}")
607
+ if result['snippet']:
608
+ output.append(f" Summary: {result['snippet']}")
609
+ output.append("") # Empty line between results
610
+
611
+ return "\n".join(output).rstrip()
612
+ else:
613
+ # Return JSONL format (original behavior)
614
+ return "\n".join(results)
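Side note, not part of the commit: an illustrative call sketch for the two output modes, assuming Search_DuckDuckGo is importable from app.py (hypothetical) and keeps the signature above:

from app import Search_DuckDuckGo  # hypothetical import; adjust to the Space's module layout

# Compact JSONL: one {"t": title, "u": url[, "s": snippet]} object per line
print(Search_DuckDuckGo("python asyncio tutorial", max_results=3, output_format="jsonl"))

# Readable mode: a "Found N search results for: ..." header, then numbered title/URL/Summary blocks
print(Search_DuckDuckGo("python asyncio tutorial", max_results=3, include_snippets=True, output_format="readable"))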
615
 
616
 
617
  # ======================================
 
890
  "'Full-page Markdown (Content Scraper mode)' option to return the page "
891
  "converted to Markdown."
892
  ),
893
+ flagging_mode="never",
894
  )
895
 
896
  # --- Concise DDG tab (JSONL with short keys, minimal tokens) ---
 
903
  gr.Slider(minimum=20, maximum=200, value=80, step=5, label="Max snippet chars"),
904
  gr.Checkbox(value=True, label="Dedupe by domain"),
905
  gr.Slider(minimum=20, maximum=120, value=80, step=5, label="Max title chars"),
906
+ gr.Dropdown(label="Output Format", choices=["jsonl", "readable"], value="jsonl", info="JSONL for compact output, Readable for LLM-friendly format"),
907
  ],
908
+ outputs=gr.Textbox(label="Search Results", interactive=False),
909
  title="DuckDuckGo Search",
910
  description=(
911
+ "<div style=\"text-align:center\">Enhanced web search with improved error handling and LLM-friendly output options. Choose JSONL for compact results or Readable format for better LLM processing.</div>"
912
  ),
913
  api_description=(
914
+ "Run a DuckDuckGo search with enhanced error handling and multiple output formats. "
915
+ "Returns either compact JSONL (t=title, u=url, optional s=snippet) or "
916
+ "readable text format optimized for LLM consumption with better error messages."
917
  ),
918
+ flagging_mode="never",
919
  submit_btn="Search",
920
  )
921
 
 
937
  "Returns:\n"
938
  "- string: Combined stdout produced by the code, or the exception text if execution failed."
939
  ),
940
+ flagging_mode="never",
941
  )
942
 
943
  CSS_STYLES = """
 
1002
  "Can generate audio of unlimited length by processing all text segments. "
1003
  "Return the generated media to the user in this format `![Alt text](URL)`"
1004
  ),
1005
+ flagging_mode="never",
1006
  )
1007
 
1008
  # ==========================
 
1128
  "sampler (str, label only), seed (int, -1=random), width/height (int, 64–1216). Returns a PIL.Image. "
1129
  "Return the generated media to the user in this format `![Alt text](URL)`"
1130
  ),
1131
+ flagging_mode="never",
1132
  )
1133
 
1134
  # ==========================
 
1305
  "Parameters: prompt (str), model_id (str), negative_prompt (str), steps (int), cfg_scale (float), seed (int), "
1306
  "width/height (int), fps (int), duration (float). Return the generated media to the user in this format `![Alt text](URL)`"
1307
  ),
1308
+ flagging_mode="never",
1309
  )
1310
 
1311
  # Build tabbed app; disable Image/Video tools if no HF token is present
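Side note, not part of the commit: the allow_flagging → flagging_mode swaps throughout this diff follow Gradio's rename of that gr.Interface argument; a minimal sketch with a hypothetical echo function:

import gradio as gr

def echo(text: str) -> str:
    return text

demo = gr.Interface(
    fn=echo,
    inputs=gr.Textbox(label="Input"),
    outputs=gr.Textbox(label="Output"),
    flagging_mode="never",  # newer spelling of allow_flagging="never"
)

if __name__ == "__main__":
    demo.launch()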