Spaces:
Sleeping
Sleeping
| # application/utils/web_search.py | |
| from duckduckgo_search import DDGS # Simpler import | |
| from bs4 import BeautifulSoup | |
| import requests | |
| import re | |
class WebScarper:
    """Search DuckDuckGo for a query and scrape the first reachable result page.

    NOTE(review): the class name is a typo for "WebScraper"; it is kept
    unchanged for backward compatibility with existing callers.
    """

    # Cap on the amount of page text returned, so downstream consumers
    # receive a bounded payload.
    MAX_TEXT_LENGTH = 4000

    # Browser-like headers reduce the chance of being blocked by sites
    # that reject the default `requests` User-Agent. Built once instead
    # of on every fetch_url() call.
    REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    }

    def __init__(self):
        # DuckDuckGo search client used by get_urls().
        self.ddgs = DDGS()

    def get_urls(self, query):
        """Return up to 3 result URLs for *query*; empty list if no results."""
        results = self.ddgs.text(query, max_results=3)
        return [result['href'] for result in results] if results else []

    def fetch_url(self, url):
        """Fetch *url* and return the response body as text.

        Returns None on any request failure (connection error, timeout,
        or a 4xx/5xx status), logging the error to stdout.
        """
        try:
            response = requests.get(url, headers=self.REQUEST_HEADERS, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL {url}: {e}")
            return None

    def get_text(self, data, max_chars=MAX_TEXT_LENGTH):
        """Extract visible text from HTML *data*, collapse runs of
        whitespace to single spaces, and truncate to *max_chars*.

        *max_chars* defaults to the previous hard-coded 4000 limit, so
        existing callers are unaffected.
        """
        soup = BeautifulSoup(data, 'html.parser')
        text = soup.get_text()
        cleaned_text = re.sub(r'\s+', ' ', text).strip()
        # Slicing never raises on short strings, so no length check needed.
        return cleaned_text[:max_chars]

    def scarpe(self, query):
        """Search for *query* and return the cleaned text of the first
        result page that fetches successfully, or None if the search
        yields nothing or every fetch fails.

        NOTE(review): the name is a typo for "scrape"; kept for backward
        compatibility — new callers should prefer the `scrape` alias.
        """
        urls = self.get_urls(query)
        if not urls:
            return None
        for url in urls:
            data = self.fetch_url(url)
            if data:
                return self.get_text(data)
        return None

    # Correctly-spelled, backward-compatible alias for scarpe().
    scrape = scarpe