"""Tools for GAIA question answering agent."""

import time
import logging
from urllib.parse import urlencode

import requests
from smolagents import tool
from googlesearch import search
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException

from functions.tool_helper_functions import (
    libretext_book_parser,
    libretext_chapter_parser,
    save_libretext_book_as_markdown,
    WikipediaFetcher
)

# Get logger for this module
logger = logging.getLogger(__name__)


def google_search(query: str) -> dict:
    """
    Perform a Google search and return the top 10 results.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'url': str, 'description': str}, ...}
    """

    # Run the query
    results = list(search(query, num_results=10, advanced=True))

    # Parse and format the results
    parsed_results = {}

    for i, result in enumerate(results):
        parsed_results[i] = {
            'title': result.title,
            'url': result.url,
            'description': result.description
        }

    return parsed_results
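
# Illustrative usage (not part of the original module; assumes network access and the
# googlesearch package imported above). Each value follows the
# {'title', 'url', 'description'} shape described in the docstring:
#
#     hits = google_search('LibreTexts introductory chemistry')
#     for hit in hits.values():
#         print(hit['title'], '->', hit['url'])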


def wikipedia_search(query: str) -> dict:
    """
    Perform a search for wikipedia pages and return the top 5 results.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'description': str}, ...}
    """

    repo_url = 'https://github.com/gperdrizet/unit-four-final-project'
    language_code = 'en'
    number_of_results = 5

    headers = {
        'User-Agent': f'HuggingFace Agents course final project ({repo_url})'
    }

    base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
    endpoint = '/search/page'
    url = base_url + language_code + endpoint
    parameters = {'q': query, 'limit': number_of_results}

    response = requests.get(url, headers=headers, params=parameters, timeout=15)

    # Return an error dict (matching the declared return type) on a failed request
    if response.status_code != 200:
        return {'error': f'Unable to retrieve page. Status code {response.status_code}'}

    results = response.json().get('pages', [])

    parsed_results = {}

    for i, result in enumerate(results):
        parsed_results[i] = {
            'title': result.get('title', None),
            'description': result.get('description', None)
        }

    return parsed_results


def get_wikipedia_page(query: str) -> str:
    """
    Get the content of a Wikipedia page as HTML. Use this tool when trying to
    retrieve information from a Wikipedia page or article.

    Args:
        query (str): The title of the Wikipedia page.

    Returns:
        str: The HTML content of the Wikipedia page.
    """

    fetcher = WikipediaFetcher()
    html_result = fetcher.fetch(query.replace(' ', '_'))
    content = html_result['content']

    # Truncate the page at the 'Further reading' heading, if present
    content = content.split(
        '<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>'
    )[0]

    # Truncate the page at the 'References' heading, if present
    content = content.split(
        '<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>'
    )[0]

    return content
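
# Illustrative usage (not part of the original module): the returned HTML can be
# flattened to plain text with BeautifulSoup, which is already imported above. The
# page title below is only an example.
#
#     html = get_wikipedia_page('Photosynthesis')
#     text = BeautifulSoup(html, 'html.parser').get_text(separator='\n', strip=True)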


def libretext_book_search(query: str) -> dict:
    """
    Search for LibreTexts books using Selenium to handle JavaScript-rendered content.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'url': str, 'description': str}, ...}
    """

    # Configure Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    driver = None

    try:
        # Initialize the Chrome driver
        driver = webdriver.Chrome(options=chrome_options)

        # Construct search URL
        search_url = 'https://chem.libretexts.org/Special:Search'

        params = {
            'qid': '',
            'fpid': '230',
            'fpth': '',
            'query': query
        }

        # Build URL with parameters, URL-encoding values so multi-word
        # queries produce a valid address
        param_string = urlencode(params)
        full_url = f"{search_url}?{param_string}"

        logger.info('Selenium search URL: %s', full_url)

        # Navigate to the search page
        driver.get(full_url)

        # Wait for the search results to load
        # Wait for either search results or an indication that search is complete
        wait = WebDriverWait(driver, 15)

        try:
            # Wait for the search results container to be present and have content
            # or for a specific search result element to appear
            _ = wait.until(
                EC.presence_of_element_located((By.ID, "mt-search-spblls"))
            )

            # Give additional time for JavaScript to populate results
            time.sleep(3)

            # Get the page source after JavaScript execution
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Look for search results using multiple possible selectors
            search_info_divs = soup.find_all('div', class_='mt-search-information')

            # If no results with that class, try other common search result patterns
            if not search_info_divs:
                # Try alternative selectors that might be used for search results
                search_info_divs = soup.find_all('div', class_='search-result')

            if not search_info_divs:
                search_info_divs = soup.find_all('div', class_='result')

            if not search_info_divs:
                # Look for any divs within the search results container
                results_container = soup.find('div', id='mt-search-spblls')

                if results_container:
                    search_info_divs = results_container.find_all('div', recursive=False)

            logger.info('Found %d potential search result divs', len(search_info_divs))

            # Parse the search results
            parsed_results = {}
            result_count = 0

            for div in search_info_divs:
                # Try to extract title and URL from various possible structures
                title = None
                url = None
                summary = None

                # Look for title in anchor tags
                title_link = div.find('a')

                if title_link:
                    title = title_link.get_text(strip=True)
                    url = title_link.get('href', '')

                    # Make URL absolute if it's relative
                    if url and url.startswith('/'):
                        url = 'https://chem.libretexts.org' + url

                # Look for description/summary text
                # Try multiple approaches to find descriptive text
                text_elements = div.find_all(['p', 'span', 'div'])

                for element in text_elements:
                    text = element.get_text(strip=True)

                    # Prefer a reasonably long text element that is not just the title
                    if text and len(text) > 20 and (not title or text != title):
                        summary = text
                        break

                # Only add to results if we have at least a title
                if title and len(title) > 3:  # Ensure title is meaningful
                    parsed_results[result_count] = {
                        'title': title,
                        'url': url or '',
                        'description': summary or ''
                    }

                    logger.debug(
                        'Extracted result %d: title="%s", url="%s"',
                        result_count,
                        title,
                        url
                    )

                    result_count += 1

            logger.info('Successfully extracted %d search results', len(parsed_results))

            return parsed_results

        except TimeoutException:
            logger.error('Timeout waiting for search results to load')
            return {'error': 'Timeout waiting for search results to load'}

    except WebDriverException as e:
        logger.error('WebDriver error: %s', str(e))
        return {'error': f'WebDriver error: {str(e)}'}

    except Exception as e:  # pylint:disable=broad-exception-caught
        logger.error('Unexpected error in Selenium search: %s', str(e))
        return {'error': f'Unexpected error: {str(e)}'}

    finally:
        # Always clean up the driver
        if driver:
            try:
                driver.quit()

            except Exception as e:  # pylint:disable=broad-exception-caught
                logger.warning('Error closing driver: %s', str(e))
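
# Illustrative usage (not part of the original module): a result's 'url' can be fed
# directly into get_libretext_book below to pull the full chapter/section outline.
# The query string is a placeholder.
#
#     hits = libretext_book_search('general chemistry equilibrium')
#     if hits and 'error' not in hits:
#         outline = get_libretext_book(hits[0]['url'])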


def get_libretext_book(url: str) -> dict:
    """
    Get the complete content of a LibreTexts book including all chapters and sections.

    Args:
        url (str): The URL of the LibreTexts book page.

    Returns:
        dict: A dictionary containing the complete book structure in the following format.
        {
            'title': 'book title string',
            'chapters': {
                'Chapter title': {
                    'sections': {
                        'Section title': {
                            'Section summary': 'Section summary string',
                            'Section url': 'https://example.com/section-url',
                        },
                        ...
                    }
                },
                ...
            }
        }
    """

    logger.info('Getting complete LibreTexts book: %s', url)

    # First, get the book structure (chapters)
    book_data = libretext_book_parser(url)

    if 'error' in book_data:
        logger.error('Failed to parse book structure: %s', book_data['error'])
        return book_data

    # Extract book title from URL or use a default
    book_title = url.split('/')[-1].replace('%3A', ':').replace('_', ' ')

    if '(' in book_title:
        book_title = book_title.split('(')[0].strip()

    # Initialize the complete book structure
    complete_book = {
        'title': book_title,
        'chapters': {}
    }

    logger.info('Found %d chapters to process', len(book_data))

    # Process each chapter
    for chapter_info in book_data.values():
        chapter_title = chapter_info['title']
        chapter_url = chapter_info['url']

        logger.info('Processing chapter: %s', chapter_title)

        # Get sections for this chapter
        sections_data = libretext_chapter_parser(chapter_url)

        # Initialize chapter structure
        complete_book['chapters'][chapter_title] = {
            'sections': {}
        }

        if 'error' in sections_data:
            logger.warning('Failed to parse sections for chapter "%s": %s',
                           chapter_title, sections_data['error'])

            complete_book['chapters'][chapter_title]['sections']['Error'] = {
                'Section summary': f"Failed to parse sections: {sections_data['error']}",
                'Section url': chapter_url
            }

        else:
            # Process each section
            for section_info in sections_data.values():
                section_title = section_info['title']
                section_url = section_info['url']
                section_description = section_info['description']

                complete_book['chapters'][chapter_title]['sections'][section_title] = {
                    'Section summary': section_description,
                    'Section url': section_url
                }

                logger.debug('Added section: %s', section_title)

            logger.info('Successfully processed %d sections for chapter "%s"',
                        len(sections_data), chapter_title)

    logger.info('Successfully compiled complete book with %d chapters',
                len(complete_book['chapters']))

    save_libretext_book_as_markdown(complete_book, filename=f"{book_title}.md", source_url=url)

    return complete_book
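

if __name__ == '__main__':

    # Minimal smoke-test sketch (not part of the original module): exercises the two
    # lightweight search tools with placeholder queries. Assumes network access; the
    # Selenium-based LibreTexts tools are left out because they need a Chrome install.
    logging.basicConfig(level=logging.INFO)

    print(google_search('GAIA benchmark question answering'))
    print(wikipedia_search('Wikimedia REST API'))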