'''Tools for the GAIA question answering agent.'''
import time
import logging
from urllib.parse import urlencode, unquote
import requests
from smolagents import tool
from googlesearch import search
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException
from functions.tool_helper_functions import (
libretext_book_parser,
libretext_chapter_parser,
save_libretext_book_as_markdown,
WikipediaFetcher
)
# Get logger for this module
logger = logging.getLogger(__name__)
@tool
def google_search(query: str) -> dict:
"""
Perform a Google search and return the top 10 results.
Args:
query (str): The search query.
Returns:
dict: A dictionary containing the search results in the following format.
{0: {'title': str, 'url': str, 'description': str}, ...}
"""
# Run the query
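    # advanced=True makes googlesearch yield SearchResult objects exposing
    # title, url and description attributes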
results = list(search(query, num_results=10, advanced=True))
# Parse and format the results
parsed_results = {}
for i, result in enumerate(results):
parsed_results[i] = {
'title': result.title,
'url': result.url,
'description': result.description
}
return parsed_results
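
# Example sketch (requires network access; the query and results are
# illustrative only):
#   hits = google_search('Mercedes Sosa studio albums')
#   hits[0]  # -> {'title': ..., 'url': ..., 'description': ...}
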
@tool
def wikipedia_search(query: str) -> dict:
"""
    Perform a search for Wikipedia pages and return the top 5 results.
Args:
query (str): The search query.
Returns:
dict: A dictionary containing the search results in the following format.
{0: {'title': str, 'description': str}, ...}
"""
repo_url = 'https://github.com/gperdrizet/unit-four-final-project'
language_code = 'en'
number_of_results = 5
headers = {
'User-Agent': f'HuggingFace Agents course final project ({repo_url})'
}
base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
endpoint = '/search/page'
url = base_url + language_code + endpoint
parameters = {'q': query, 'limit': number_of_results}
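    # Query the Wikimedia Core REST API search endpoint; Wikimedia asks
    # clients to identify themselves with a descriptive User-Agent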
response = requests.get(url, headers=headers, params=parameters, timeout=15)
    if response.status_code != 200:
        return {'error': f'Unable to retrieve search results. Status code {response.status_code}'}
    results = response.json().get('pages', [])
    parsed_results = {}
for i, result in enumerate(results):
parsed_results[i] = {
'title': result.get('title', None),
'description': result.get('description', None)
}
return parsed_results
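
# Example sketch (network access required; values are illustrative):
#   wikipedia_search('Mercedes Sosa')
#   # -> {0: {'title': 'Mercedes Sosa', 'description': '...'}, ...}
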
@tool
def get_wikipedia_page(query: str) -> str:
"""
Get the content of a Wikipedia page as HTML. Use this tool when trying to
retrieve information from a Wikipedia page or article.
Args:
query (str): The title of the Wikipedia page.
Returns:
str: The HTML content of the Wikipedia page.
"""
fetcher = WikipediaFetcher()
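    # Wikipedia page titles use underscores in place of spaces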
html_result = fetcher.fetch(query.replace(' ', '_'))
content = html_result['content']
    # Truncate trailing boilerplate: keep only the content before the
    # "Further reading" and "References" sections
    content = content.split('\nFurther reading\n')[0]
    content = content.split('References\n')[0]
return content
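
# Example sketch (network access required; the title is illustrative):
#   html = get_wikipedia_page('Python (programming language)')
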
@tool
def libretext_book_search(query: str) -> dict:
"""
Search for LibreTexts books using Selenium to handle JavaScript-rendered content.
Args:
query (str): The search query.
Returns:
dict: A dictionary containing the search results in the following format.
{0: {'title': str, 'url': str, 'description': str}, ...}
"""
# Configure Chrome options for headless mode
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
driver = None
try:
# Initialize the Chrome driver
driver = webdriver.Chrome(options=chrome_options)
# Construct search URL
search_url = 'https://chem.libretexts.org/Special:Search'
params = {
'qid': '',
'fpid': '230',
'fpth': '',
'query': query
}
        # Build URL with parameters, URL-encoding so spaces and special
        # characters in the query survive
        param_string = urlencode(params)
        full_url = f"{search_url}?{param_string}"
logger.info('Selenium search URL: %s', full_url)
# Navigate to the search page
driver.get(full_url)
        # Wait for the JavaScript-rendered search results to load
wait = WebDriverWait(driver, 15)
try:
# Wait for the search results container to be present and have content
# or for a specific search result element to appear
_ = wait.until(
EC.presence_of_element_located((By.ID, "mt-search-spblls"))
)
# Give additional time for JavaScript to populate results
time.sleep(3)
# Get the page source after JavaScript execution
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
# Look for search results using multiple possible selectors
search_info_divs = soup.find_all('div', class_='mt-search-information')
# If no results with that class, try other common search result patterns
if not search_info_divs:
# Try alternative selectors that might be used for search results
search_info_divs = soup.find_all('div', class_='search-result')
if not search_info_divs:
search_info_divs = soup.find_all('div', class_='result')
if not search_info_divs:
# Look for any divs within the search results container
results_container = soup.find('div', id='mt-search-spblls')
if results_container:
search_info_divs = results_container.find_all('div', recursive=False)
logger.info('Found %d potential search result divs', len(search_info_divs))
# Parse the search results
parsed_results = {}
result_count = 0
for div in search_info_divs:
# Try to extract title and URL from various possible structures
title = None
url = None
summary = None
# Look for title in anchor tags
title_link = div.find('a')
if title_link:
title = title_link.get_text(strip=True)
url = title_link.get('href', '')
# Make URL absolute if it's relative
if url and url.startswith('/'):
url = 'https://chem.libretexts.org' + url
# Look for description/summary text
# Try multiple approaches to find descriptive text
text_elements = div.find_all(['p', 'span', 'div'])
for element in text_elements:
text = element.get_text(strip=True)
                    if text and len(text) > 20 and text != title:
summary = text
break
# Only add to results if we have at least a title
if title and len(title) > 3: # Ensure title is meaningful
parsed_results[result_count] = {
'title': title,
'url': url or '',
'description': summary or ''
}
logger.debug(
'Extracted result %d: title="%s", url="%s"',
result_count,
title,
url
)
result_count += 1
logger.info('Successfully extracted %d search results', len(parsed_results))
return parsed_results
except TimeoutException:
logger.error('Timeout waiting for search results to load')
return {'error': 'Timeout waiting for search results to load'}
except WebDriverException as e:
logger.error('WebDriver error: %s', str(e))
return {'error': f'WebDriver error: {str(e)}'}
except Exception as e: # pylint:disable=broad-exception-caught
logger.error('Unexpected error in Selenium search: %s', str(e))
return {'error': f'Unexpected error: {str(e)}'}
finally:
# Always clean up the driver
if driver:
try:
driver.quit()
except Exception as e: # pylint:disable=broad-exception-caught
logger.warning('Error closing driver: %s', str(e))
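
# Example sketch (requires a local Chrome/chromedriver and network access;
# the query is illustrative):
#   results = libretext_book_search('introductory chemistry')
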
@tool
def get_libretext_book(url: str) -> dict:
"""
Get the complete content of a LibreTexts book including all chapters and sections.
Args:
url (str): The URL of the LibreTexts book page.
Returns:
dict: A dictionary containing the complete book structure in the following format.
{
'title': 'book title string',
'chapters': {
'Chapter title': {
'sections': {
'Section title': {
'Section summary': 'Section summary string',
'Section url': 'https://example.com/section-url',
},
...
}
},
...
}
}
"""
logger.info('Getting complete LibreTexts book: %s', url)
# First, get the book structure (chapters)
book_data = libretext_book_parser(url)
if 'error' in book_data:
logger.error('Failed to parse book structure: %s', book_data['error'])
return book_data
    # Extract a human-readable book title from the last URL path segment
    book_title = unquote(url.rstrip('/').split('/')[-1]).replace('_', ' ')
if '(' in book_title:
book_title = book_title.split('(')[0].strip()
# Initialize the complete book structure
complete_book = {
'title': book_title,
'chapters': {}
}
logger.info('Found %d chapters to process', len(book_data))
# Process each chapter
for chapter_info in book_data.values():
chapter_title = chapter_info['title']
chapter_url = chapter_info['url']
logger.info('Processing chapter: %s', chapter_title)
# Get sections for this chapter
sections_data = libretext_chapter_parser(chapter_url)
# Initialize chapter structure
complete_book['chapters'][chapter_title] = {
'sections': {}
}
if 'error' in sections_data:
logger.warning('Failed to parse sections for chapter "%s": %s',
chapter_title, sections_data['error'])
complete_book['chapters'][chapter_title]['sections']['Error'] = {
'Section summary': f"Failed to parse sections: {sections_data['error']}",
'Section url': chapter_url
}
else:
# Process each section
for section_info in sections_data.values():
section_title = section_info['title']
section_url = section_info['url']
section_description = section_info['description']
complete_book['chapters'][chapter_title]['sections'][section_title] = {
'Section summary': section_description,
'Section url': section_url
}
logger.debug('Added section: %s', section_title)
logger.info('Successfully processed %d sections for chapter "%s"',
len(sections_data), chapter_title)
logger.info('Successfully compiled complete book with %d chapters',
len(complete_book['chapters']))
save_libretext_book_as_markdown(complete_book, filename=f"{book_title}.md", source_url=url)
return complete_book
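

if __name__ == '__main__':
    # Minimal manual smoke test: a sketch for local debugging, not part of
    # the agent runtime. Assumes network access (and, for the LibreTexts
    # tools, a local Chrome/chromedriver); queries are illustrative only.
    logging.basicConfig(level=logging.INFO)
    print(wikipedia_search('GAIA benchmark'))
    print(google_search('LibreTexts introductory chemistry'))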