Update webscout.py

webscout.py  +171 -0  CHANGED
@@ -1811,3 +1811,174 @@ def fastai(user, model="llama3-70b", system="Answer as concisely as possible."):
     return output
 
 
+from bs4 import BeautifulSoup
+import requests
+from typing import Dict, List, Optional, Union
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from urllib.parse import quote
+from termcolor import colored
+import time
+import random
+
+class GoogleS:
+    """
+    Class to perform Google searches and retrieve results.
+    """
+
+    def __init__(
+        self,
+        headers: Optional[Dict[str, str]] = None,
+        proxy: Optional[str] = None,
+        timeout: Optional[int] = 10,
+        max_workers: int = 20  # Increased max workers for the thread pool
+    ):
+        """Initializes the GoogleS object."""
+        self.proxy = proxy
+        self.headers = headers if headers else {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"
+        }
+        self.headers["Referer"] = "https://www.google.com/"
+        self.client = requests.Session()
+        self.client.headers.update(self.headers)
+        if self.proxy:  # Only configure proxies when one was actually supplied
+            self.client.proxies.update({"http": self.proxy, "https": self.proxy})
+        self.timeout = timeout
+        self._executor = ThreadPoolExecutor(max_workers=max_workers)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.client.close()
+
+    def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
+                 data: Optional[Union[Dict[str, str], bytes]] = None) -> bytes:
+        """
+        Makes an HTTP request and returns the response content.
+        """
+        try:
+            resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
+        except Exception as ex:
+            raise Exception(f"{url} {type(ex).__name__}: {ex}") from ex
+        if resp.status_code == 200:
+            return resp.content
+        raise Exception(f"{resp.url} returned status code {resp.status_code}. {params=} {data=}")
+
+    def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
+        """
+        Extracts visible text from HTML content using the lxml parser.
+        """
+        soup = BeautifulSoup(html_content, "lxml")  # Use lxml parser
+        # Drop tags that never carry useful body text.
+        for tag in soup(["script", "style", "header", "footer", "nav"]):
+            tag.extract()
+        visible_text = soup.get_text(strip=True)
+        if max_characters:
+            visible_text = visible_text[:max_characters]
+        return visible_text
+
+    def search(
+        self,
+        query: str,
+        region: str = "us-en",
+        language: str = "en",
+        safe: str = "off",
+        time_period: Optional[str] = None,
+        max_results: int = 10,
+        extract_text: bool = False,
+        max_text_length: Optional[int] = 100,
+    ) -> List[Dict[str, Union[str, int]]]:
+        """
+        Performs a Google search and returns the results.
+
+        Args:
+            query (str): The search query.
+            region (str, optional): The region to search in (e.g., "us-en"). Defaults to "us-en".
+            language (str, optional): The language of the search results (e.g., "en"). Defaults to "en".
+            safe (str, optional): Safe search setting ("off", "active"). Defaults to "off".
+            time_period (Optional[str], optional): Time period filter (e.g., "h" for past hour,
+                "d" for past day). Defaults to None.
+            max_results (int, optional): The maximum number of results to retrieve. Defaults to 10.
+            extract_text (bool, optional): Whether to extract text from the linked web pages.
+                Defaults to False.
+            max_text_length (Optional[int], optional): The maximum length of the extracted text
+                (in characters). Defaults to 100.
+
+        Returns:
+            List[Dict[str, Union[str, int]]]: A list of dictionaries, each representing a search result, containing:
+                - 'title': The title of the result.
+                - 'href': The URL of the result.
+                - 'abstract': The description snippet of the result.
+                - 'index': The index of the result in the list.
+                - 'type': The type of result (currently always "web").
+                - 'visible_text': The extracted text from the web page (if `extract_text` is True).
+        """
+        assert query, "Query cannot be empty."
+
+        results = []
+        futures = []
+        start = 0
+
+        # Submit one request per results page (10 results each) up front.
+        # Looping on `start` rather than `len(results)` avoids an infinite
+        # loop: `results` is only populated after the requests complete.
+        while start < max_results:
+            params = {
+                "q": query,
+                "num": 10,
+                "hl": language,
+                "start": start,
+                "safe": safe,
+                "gl": region,
+            }
+            if time_period:
+                params["tbs"] = f"qdr:{time_period}"
+
+            futures.append(self._executor.submit(
+                self._get_url, "GET", "https://www.google.com/search", params=params))
+            start += 10
+
+        for future in as_completed(futures):
+            try:
+                resp_content = future.result()
+                soup = BeautifulSoup(resp_content, "lxml")  # Use lxml parser
+                result_blocks = soup.find_all("div", class_="g")
+
+                if not result_blocks:
+                    continue  # This page was empty; other pages may still have results
+
+                # Extract the link, title, and snippet from each result block.
+                for result_block in result_blocks:
+                    link = result_block.find("a", href=True)
+                    title = result_block.find("h3")
+                    description_box = result_block.find(
+                        "div", {"style": "-webkit-line-clamp:2"}
+                    )
+
+                    if link and title and description_box:
+                        results.append({
+                            "title": title.text,
+                            "href": link["href"],
+                            "abstract": description_box.text,
+                            "index": len(results),
+                            "type": "web",
+                            "visible_text": "",  # Filled in below when extract_text is True
+                        })
+
+                    if len(results) >= max_results:
+                        break  # Stop once we have enough results
+            except Exception as e:
+                print(f"Error: {e}")
+
+        # Parallelize text extraction if requested, once all pages are parsed.
+        if extract_text:
+            with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
+                # Fetch and extract inside the worker so downloads run in
+                # parallel, and map each future back to its result dict:
+                # as_completed yields in completion order, not submission
+                # order, so indexing results by enumeration would mismatch.
+                future_to_result = {
+                    text_extractor.submit(
+                        lambda href: self._extract_text_from_webpage(
+                            self._get_url("GET", href),
+                            max_characters=max_text_length,
+                        ),
+                        result["href"],
+                    ): result
+                    for result in results
+                    if "href" in result
+                }
+                for future in as_completed(future_to_result):
+                    try:
+                        future_to_result[future]["visible_text"] = future.result()
+                    except Exception as e:
+                        print(f"Error extracting text: {e}")
+
+        return results[:max_results]
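
For reference, a minimal usage sketch of the new class follows. It is not part of the commit, and it assumes that GoogleS is importable from webscout at top level and that Google returns parseable result pages; automated clients are often served consent or captcha pages instead, in which case the list may come back empty.

from webscout import GoogleS

# The context manager ensures the underlying requests.Session is closed.
with GoogleS(timeout=10) as google:
    # extract_text=True additionally downloads each hit and stores a text
    # snippet in 'visible_text'; max_text_length caps the snippet size.
    for hit in google.search("python web scraping", max_results=5,
                             extract_text=True, max_text_length=200):
        print(hit["index"], hit["title"], hit["href"])
        print(hit["visible_text"])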