'''Tools for GAIA question answering agent.'''

import time
import logging

import requests
from smolagents import tool
from googlesearch import search
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException

from functions.tool_helper_functions import (
    libretext_book_parser,
    libretext_chapter_parser,
    save_libretext_book_as_markdown,
    WikipediaFetcher
)

# Get logger for this module
logger = logging.getLogger(__name__)


@tool
def google_search(query: str) -> dict:
    """
    Perform a Google search and return the top 10 results.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format:
            {0: {'title': str, 'url': str, 'description': str}, ...}
    """

    # Run the query
    results = list(search(query, num_results=10, advanced=True))

    # Parse and format the results
    parsed_results = {}

    for i, result in enumerate(results):
        parsed_results[i] = {
            'title': result.title,
            'url': result.url,
            'description': result.description
        }

    return parsed_results


@tool
def wikipedia_search(query: str) -> dict:
    """
    Perform a search for Wikipedia pages and return the top 5 results.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format:
            {0: {'title': str, 'description': str}, ...}
    """

    repo_url = 'https://github.com/gperdrizet/unit-four-final-project'
    language_code = 'en'
    number_of_results = 5

    headers = {
        'User-Agent': f'HuggingFace Agents course final project ({repo_url})'
    }

    base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
    endpoint = '/search/page'
    url = base_url + language_code + endpoint
    parameters = {'q': query, 'limit': number_of_results}

    response = requests.get(url, headers=headers, params=parameters, timeout=15)

    # Return errors inside a dict so the declared return type holds
    if response.status_code != 200:
        return {'error': f'Unable to retrieve results. Status code {response.status_code}'}

    results = response.json().get('pages', [])
    parsed_results = {}

    for i, result in enumerate(results):
        parsed_results[i] = {
            'title': result.get('title', None),
            'description': result.get('description', None)
        }

    return parsed_results

@tool
def get_wikipedia_page(query: str) -> str:
    """
    Get the content of a Wikipedia page as HTML. Use this tool when trying to
    retrieve information from a Wikipedia page or article.

    Args:
        query (str): The title of the Wikipedia page.

    Returns:
        str: The HTML content of the Wikipedia page.
    """

    fetcher = WikipediaFetcher()
    html_result = fetcher.fetch(query.replace(' ', '_'))
    content = html_result['content']

    # Truncate the page at the "Further reading" and "References" headings.
    # The exact split strings were garbled in extraction; the heading anchors
    # below are an assumption about the fetched HTML.
    content = content.split('<h2 id="Further_reading">')[0]
    content = content.split('<h2 id="References">')[0]

    return content
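
# A hedged usage note: smolagents' @tool wrapper is assumed to leave each
# function directly callable, so the tools above can be smoke-tested without
# an agent. The query below is illustrative, not a project fixture:
#
#     hits = wikipedia_search('GAIA benchmark')
#     page_html = get_wikipedia_page(hits[0]['title'])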

@tool
def libretext_book_search(query: str) -> dict:
    """
    Search for LibreTexts books using Selenium to handle JavaScript-rendered content.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format:
            {0: {'title': str, 'url': str, 'description': str}, ...}
    """

    # Configure Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    driver = None

    try:
        # Initialize the Chrome driver
        driver = webdriver.Chrome(options=chrome_options)

        # Construct search URL
        search_url = 'https://chem.libretexts.org/Special:Search'
        params = {
            'qid': '',
            'fpid': '230',
            'fpth': '',
            'query': query
        }

        # Build URL with parameters
        param_string = '&'.join([f"{k}={v}" for k, v in params.items()])
        full_url = f"{search_url}?{param_string}"

        logger.info('Selenium search URL: %s', full_url)

        # Navigate to the search page
        driver.get(full_url)

        # Wait for either search results or an indication that the search is complete
        wait = WebDriverWait(driver, 15)

        try:
            # Wait for the search results container to be present
            _ = wait.until(
                EC.presence_of_element_located((By.ID, "mt-search-spblls"))
            )

            # Give additional time for JavaScript to populate results
            time.sleep(3)

            # Get the page source after JavaScript execution
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Look for search results using multiple possible selectors
            search_info_divs = soup.find_all('div', class_='mt-search-information')

            # If no results with that class, try other common search result patterns
            if not search_info_divs:
                search_info_divs = soup.find_all('div', class_='search-result')

            if not search_info_divs:
                search_info_divs = soup.find_all('div', class_='result')

            if not search_info_divs:
                # Look for any divs within the search results container
                results_container = soup.find('div', id='mt-search-spblls')
                if results_container:
                    search_info_divs = results_container.find_all('div', recursive=False)

            logger.info('Found %d potential search result divs', len(search_info_divs))

            # Parse the search results
            parsed_results = {}
            result_count = 0

            for div in search_info_divs:
                title = None
                url = None
                summary = None

                # Look for title and URL in the first anchor tag
                title_link = div.find('a')
                if title_link:
                    title = title_link.get_text(strip=True)
                    url = title_link.get('href', '')

                    # Make URL absolute if it's relative
                    if url and url.startswith('/'):
                        url = 'https://chem.libretexts.org' + url

                # Look for description/summary text: keep the first
                # sufficiently long text that is not just the title
                text_elements = div.find_all(['p', 'span', 'div'])
                for element in text_elements:
                    text = element.get_text(strip=True)
                    if text and len(text) > 20 and text != title:
                        summary = text
                        break

                # Only add to results if we have a meaningful title
                if title and len(title) > 3:
                    parsed_results[result_count] = {
                        'title': title,
                        'url': url or '',
                        'description': summary or ''
                    }

                    logger.debug(
                        'Extracted result %d: title="%s", url="%s"',
                        result_count, title, url
                    )

                    result_count += 1

            logger.info('Successfully extracted %d search results', len(parsed_results))
            return parsed_results

        except TimeoutException:
            logger.error('Timeout waiting for search results to load')
            return {'error': 'Timeout waiting for search results to load'}

    except WebDriverException as e:
        logger.error('WebDriver error: %s', str(e))
        return {'error': f'WebDriver error: {str(e)}'}

    except Exception as e:  # pylint:disable=broad-exception-caught
        logger.error('Unexpected error in Selenium search: %s', str(e))
        return {'error': f'Unexpected error: {str(e)}'}

    finally:
        # Always clean up the driver
        if driver:
            try:
                driver.quit()
            except Exception as e:  # pylint:disable=broad-exception-caught
                logger.warning('Error closing driver: %s', str(e))
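
# A hedged usage sketch: libretext_book_search drives headless Chrome, so a
# Chrome install that Selenium can pair with a driver is required (Selenium
# 4.6+ resolves chromedriver automatically via Selenium Manager). The query
# below is illustrative, not a project fixture:
#
#     results = libretext_book_search('introductory chemistry')
#     if results and 'error' not in results:
#         book = get_libretext_book(results[0]['url'])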
@tool
def get_libretext_book(url: str) -> dict:
    """
    Get the complete content of a LibreTexts book including all chapters and sections.

    Args:
        url (str): The URL of the LibreTexts book page.

    Returns:
        dict: A dictionary containing the complete book structure in the following format:
            {
                'title': 'book title string',
                'chapters': {
                    'Chapter title': {
                        'sections': {
                            'Section title': {
                                'Section summary': 'Section summary string',
                                'Section url': 'https://example.com/section-url',
                            },
                            ...
                        }
                    },
                    ...
                }
            }
    """

    logger.info('Getting complete LibreTexts book: %s', url)

    # First, get the book structure (chapters)
    book_data = libretext_book_parser(url)

    if 'error' in book_data:
        logger.error('Failed to parse book structure: %s', book_data['error'])
        return book_data

    # Extract book title from URL or use a default
    book_title = url.split('/')[-1].replace('%3A', ':').replace('_', ' ')
    if '(' in book_title:
        book_title = book_title.split('(')[0].strip()

    # Initialize the complete book structure
    complete_book = {
        'title': book_title,
        'chapters': {}
    }

    logger.info('Found %d chapters to process', len(book_data))

    # Process each chapter
    for chapter_info in book_data.values():
        chapter_title = chapter_info['title']
        chapter_url = chapter_info['url']

        logger.info('Processing chapter: %s', chapter_title)

        # Get sections for this chapter
        sections_data = libretext_chapter_parser(chapter_url)

        # Initialize chapter structure
        complete_book['chapters'][chapter_title] = {
            'sections': {}
        }

        if 'error' in sections_data:
            logger.warning('Failed to parse sections for chapter "%s": %s',
                           chapter_title, sections_data['error'])

            complete_book['chapters'][chapter_title]['sections']['Error'] = {
                'Section summary': f"Failed to parse sections: {sections_data['error']}",
                'Section url': chapter_url
            }

        else:
            # Process each section
            for section_info in sections_data.values():
                section_title = section_info['title']
                section_url = section_info['url']
                section_description = section_info['description']

                complete_book['chapters'][chapter_title]['sections'][section_title] = {
                    'Section summary': section_description,
                    'Section url': section_url
                }

                logger.debug('Added section: %s', section_title)

            logger.info('Successfully processed %d sections for chapter "%s"',
                        len(sections_data), chapter_title)

    logger.info('Successfully compiled complete book with %d chapters',
                len(complete_book['chapters']))

    save_libretext_book_as_markdown(complete_book, filename=f"{book_title}.md", source_url=url)

    return complete_book
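
# ---------------------------------------------------------------------------
# Minimal wiring sketch, run only when this module is executed directly.
# Assumptions: smolagents' CodeAgent and InferenceClientModel APIs, and the
# example question, are illustrative and not part of the project's agent setup.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from smolagents import CodeAgent, InferenceClientModel

    # Hand every tool in this module to a code-writing agent
    agent = CodeAgent(
        tools=[
            google_search,
            wikipedia_search,
            get_wikipedia_page,
            libretext_book_search,
            get_libretext_book,
        ],
        model=InferenceClientModel(),
    )

    print(agent.run('When was the GAIA benchmark first released?'))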