gperdrizet committed
Commit a5e5840 · verified · 1 Parent(s): 7e55583

Added custom Selenium-based web search functions to look for academic textbooks via LibreTexts.

Files changed (5)
  1. app.py +5 -4
  2. functions/agent.py +6 -2
  3. functions/tools.py +194 -6
  4. requirements.txt +1 -0
  5. tests/test_tools.py +49 -3
app.py CHANGED
@@ -255,7 +255,6 @@ with gr.Blocks() as demo:
     )
 
 if __name__ == "__main__":
-    logger.info("\n" + "-"*30 + " App Starting " + "-"*30)
 
     # Check for SPACE_HOST and SPACE_ID at startup for information
     space_host_startup = os.getenv("SPACE_HOST")
@@ -270,14 +269,16 @@ if __name__ == "__main__":
     if space_id_startup: # Print repo URLs if SPACE_ID is found
         logger.info("✅ SPACE_ID found: %s", space_id_startup)
         logger.info("   Repo URL: https://huggingface.co/spaces/%s", space_id_startup)
-        logger.info("   Repo Tree URL: https://huggingface.co/spaces/%s/tree/main", space_id_startup)
+        logger.info(
+            "   Repo Tree URL: https://huggingface.co/spaces/%s/tree/main",
+            space_id_startup
+        )
+
     else:
         logger.info(
            "ℹ️ SPACE_ID environment variable not found (running locally?). " \
            "Repo URL cannot be determined."
        )
 
-    logger.info("-" + "-"*(60 + len(" App Starting ")) + "\n")
-
    logger.info("Launching Gradio Interface for Basic Agent Evaluation...")
    demo.launch(debug=True, share=False)
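The wrapped logging call above keeps lazy %-style formatting: the space ID is passed as an argument instead of being interpolated with an f-string, so the message is only built when the record is actually emitted. A minimal, self-contained illustration of the same pattern (the space ID value below is a placeholder, not taken from app.py):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    space_id = 'some-user/some-space'  # placeholder value for illustration

    # The %s argument is only interpolated if this record is actually emitted
    logger.info(
        'Repo Tree URL: https://huggingface.co/spaces/%s/tree/main',
        space_id
    )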
functions/agent.py CHANGED
@@ -82,7 +82,10 @@ def step_memory_cap(memory_step: ActionStep, agent: CodeAgent) -> None:
     new_messages = [agent.memory.steps[-1].model_input_messages[0]]
     new_messages.append({
         'role': MessageRole.USER,
-        'content': [{'type': 'text', 'text': f'Here is a summary of your investigation so far: {summary}'}]
+        'content': [{
+            'type': 'text',
+            'text': f'Here is a summary of your investigation so far: {summary}'
+        }]
     })
     agent.memory.steps = [agent.memory.steps[0]]
     agent.memory.steps[0].model_input_messages = new_messages
@@ -107,7 +110,8 @@ def summarize_old_messages(messages: dict) -> dict:
     messages = [
         {
             'role': 'system',
-            'content': f'Summarize the following interaction between an AI agent and a user. Return the summary formatted as text, not as JSON: {json.dumps(messages)}'
+            'content': ('Summarize the following interaction between an AI agent and a user. ' +
+                        f'Return the summary formatted as text, not as JSON: {json.dumps(messages)}')
         }
     ]
 
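The reformatted append in step_memory_cap builds the same chat-message payload as before, just spread over several lines. For reference, a standalone sketch of that structure with a placeholder summary string (the MessageRole import path is assumed from the smolagents package):

    from smolagents.models import MessageRole  # assumed import location

    summary = 'Searched Google and Wikipedia; best candidate answer so far is X.'  # placeholder

    summary_message = {
        'role': MessageRole.USER,
        'content': [{
            'type': 'text',
            'text': f'Here is a summary of your investigation so far: {summary}'
        }]
    }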
 
functions/tools.py CHANGED
@@ -1,5 +1,6 @@
 '''Tools for GAIA question answering agent.'''
 
+import time
 import logging
 import bleach
 import requests
@@ -7,6 +8,12 @@ from bleach.css_sanitizer import CSSSanitizer
 from smolagents import tool
 from googlesearch import search
 from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.common.exceptions import TimeoutException, WebDriverException
 
 # Get logger for this module
 logger = logging.getLogger(__name__)
@@ -26,7 +33,7 @@ def google_search(query: str) -> dict:
     """
 
     # Run the query
-    results = list(search(query, num_results=5, advanced=True))
+    results = list(search(query, num_results=10, advanced=True))
 
     # Parse and format the results
     parsed_results = {}
@@ -55,10 +62,12 @@ def wikipedia_search(query: str) -> dict:
     {0: {'title': str, 'description': str}, ...}
     """
 
+    repo_url = 'https://github.com/gperdrizet/unit-four-final-project'
+
     language_code = 'en'
     number_of_results = 5
     headers = {
-        'User-Agent': 'HuggingFace Agents course final project (https://github.com/gperdrizet/unit-four-final-project)'
+        'User-Agent': f'HuggingFace Agents course final project ({repo_url})'
     }
 
     base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
@@ -101,8 +110,14 @@ def get_wikipedia_page(query: str) -> str:
     html_result = fetcher.fetch(query.replace(' ', '_'))
 
     content = html_result['content']
-    content = content.split('<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>')[0]
-    content = content.split('<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>')[0]
+
+    content = content.split(
+        '<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>'
+    )[0]
+
+    content = content.split(
+        '<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>'
+    )[0]
 
     return content
 
@@ -345,10 +360,10 @@ class WikipediaFetcher:
         soup = BeautifulSoup(html, "lxml")
 
         for selector in selectors:
-            [tag.decompose() for tag in soup.select(selector)]
+            _ = [tag.decompose() for tag in soup.select(selector)]
 
         for clss in classes:
-            [tag.decompose() for tag in soup.find_all(attrs={"class": clss})]
+            _ = [tag.decompose() for tag in soup.find_all(attrs={"class": clss})]
 
         for clss, new_classes in add_classes.items():
             for tag in soup.find_all(attrs={"class": clss}):
@@ -365,3 +380,176 @@ class WikipediaFetcher:
         html = "".join(str(tag) for tag in soup.contents)
 
         return html
+
+
+@tool
+def libretext_book_parser(url: str) -> str:
+    """
+    Parse the content of a LibreTexts book and return table of contents as JSON.
+
+    Args:
+        url (str): The URL of the LibreTexts book page.
+
+    Returns:
+        dict: A dictionary containing the table of contents in JSON format.
+    """
+
+    logger.debug(url)
+
+    return "LibreTexts book parser is not yet implemented."
+
+@tool
+def libretext_book_search(query: str) -> dict:
+    """
+    Search for LibreTexts books using Selenium to handle JavaScript-rendered content.
+
+    Args:
+        query (str): The search query.
+
+    Returns:
+        dict: A dictionary containing the search results in the following format.
+        {0: {'title': str, 'url': str, 'description': str}, ...}
+    """
+
+    # Configure Chrome options for headless mode
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--window-size=1920,1080")
+    chrome_options.add_argument(
+        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
+        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    )
+
+    driver = None
+    try:
+        # Initialize the Chrome driver
+        driver = webdriver.Chrome(options=chrome_options)
+
+        # Construct search URL
+        search_url = 'https://chem.libretexts.org/Special:Search'
+        params = {
+            'qid': '',
+            'fpid': '230',
+            'fpth': '',
+            'query': query
+        }
+
+        # Build URL with parameters
+        param_string = '&'.join([f"{k}={v}" for k, v in params.items()])
+        full_url = f"{search_url}?{param_string}"
+
+        logger.info('Selenium search URL: %s', full_url)
+
+        # Navigate to the search page
+        driver.get(full_url)
+
+        # Wait for the search results to load
+        # Wait for either search results or an indication that search is complete
+        wait = WebDriverWait(driver, 15)
+
+        try:
+            # Wait for the search results container to be present and have content
+            # or for a specific search result element to appear
+            _ = wait.until(
+                EC.presence_of_element_located((By.ID, "mt-search-spblls"))
+            )
+
+            # Give additional time for JavaScript to populate results
+            time.sleep(3)
+
+            # Get the page source after JavaScript execution
+            page_source = driver.page_source
+            soup = BeautifulSoup(page_source, 'html.parser')
+
+            # Save the rendered HTML for debugging
+            with open('selenium_test.html', 'w', encoding='utf-8') as f:
+                f.write(soup.prettify())
+
+            # Look for search results using multiple possible selectors
+            search_info_divs = soup.find_all('div', class_='mt-search-information')
+
+            # If no results with that class, try other common search result patterns
+            if not search_info_divs:
+                # Try alternative selectors that might be used for search results
+                search_info_divs = soup.find_all('div', class_='search-result')
+                if not search_info_divs:
+                    search_info_divs = soup.find_all('div', class_='result')
+                    if not search_info_divs:
+                        # Look for any divs within the search results container
+                        results_container = soup.find('div', id='mt-search-spblls')
+                        if results_container:
+                            search_info_divs = results_container.find_all('div', recursive=False)
+
+            logger.info('Found %d potential search result divs', len(search_info_divs))
+
+            # Parse the search results
+            parsed_results = {}
+            result_count = 0
+
+            for div in search_info_divs:
+                # Try to extract title and URL from various possible structures
+                title = None
+                url = None
+                summary = None
+
+                # Look for title in anchor tags
+                title_link = div.find('a')
+                if title_link:
+                    title = title_link.get_text(strip=True)
+                    url = title_link.get('href', '')
+
+                    # Make URL absolute if it's relative
+                    if url and url.startswith('/'):
+                        url = 'https://chem.libretexts.org' + url
+
+                # Look for description/summary text
+                # Try multiple approaches to find descriptive text
+                text_elements = div.find_all(['p', 'span', 'div'])
+                for element in text_elements:
+                    text = element.get_text(strip=True)
+                    if text and len(text) > 20 and (not title or text != title):
+                        summary = text
+                        break
+
+                # Only add to results if we have at least a title
+                if title and len(title) > 3: # Ensure title is meaningful
+                    parsed_results[result_count] = {
+                        'title': title,
+                        'url': url or '',
+                        'description': summary or ''
+                    }
+
+                    logger.debug(
+                        'Extracted result %d: title="%s", url="%s"',
+                        result_count,
+                        title,
+                        url
+                    )
+
+                    result_count += 1
+
+            logger.info('Successfully extracted %d search results', len(parsed_results))
+            return parsed_results
+
+        except TimeoutException:
+            logger.error('Timeout waiting for search results to load')
+            return {'error': 'Timeout waiting for search results to load'}
+
+    except WebDriverException as e:
+        logger.error('WebDriver error: %s', str(e))
+        return {'error': f'WebDriver error: {str(e)}'}
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        logger.error('Unexpected error in Selenium search: %s', str(e))
+        return {'error': f'Unexpected error: {str(e)}'}
+
+    finally:
+        # Always clean up the driver
+        if driver:
+            try:
+                driver.quit()
+            except Exception as e: # pylint: disable=broad-exception-caught
+                logger.warning('Error closing driver: %s', str(e))
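Because the @tool-decorated function is directly callable (the new tests below invoke it the same way), it can be exercised outside the agent for a quick check. A usage sketch, assuming a local Chrome/Chromium install that Selenium can drive; the query string mirrors the one used in the test suite:

    from functions.tools import libretext_book_search

    # Returns {0: {'title': ..., 'url': ..., 'description': ...}, ...}
    # or {'error': ...} if the browser or the search page fails.
    results = libretext_book_search('Introductory chemistry ck-12')

    for key, result in results.items():
        if key == 'error':
            print('Search failed:', result)
            break
        print(result['title'])
        print(' ', result['url'])
        print(' ', result['description'][:80])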
requirements.txt CHANGED
@@ -5,6 +5,7 @@ gradio[oauth]
 markdownify
 mwparserfromhell
 requests
+selenium
 smolagents==1.18.0
 tinycss2
 wikipedia-api
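The selenium entry only pulls in the Python bindings; the headless search also needs a Chrome or Chromium binary (and a matching driver) available on the host, which this commit does not install. A quick sanity check that the environment can actually start the browser (a sketch, not part of this commit):

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')

    # Raises WebDriverException if no usable Chrome/Chromium and driver are found
    driver = webdriver.Chrome(options=options)
    print(driver.capabilities.get('browserVersion'))
    driver.quit()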
tests/test_tools.py CHANGED
@@ -4,7 +4,8 @@ import unittest
 from functions.tools import (
     google_search,
     wikipedia_search,
-    get_wikipedia_page
+    get_wikipedia_page,
+    libretext_book_search
 )
 
 
@@ -27,13 +28,13 @@ class TestGoogleSearch(unittest.TestCase):
     def test_result_length(self):
         '''Search results should contain 5 items.'''
 
-        self.assertEqual(len(self.search_results), 5)
+        self.assertEqual(len(self.search_results), 10)
 
 
     def test_result_content(self):
         '''Each search result should contain three elements: title, link, and snippet.'''
 
-        for _, result in self.search_results.items():
+        for result in self.search_results.values():
             self.assertIsInstance(result, dict)
             self.assertIn('title', result)
             self.assertIn('url', result)
@@ -95,3 +96,48 @@ class TestGetWikipediaPage(unittest.TestCase):
         '''Page content should not be empty.'''
 
         self.assertTrue(len(self.page_content) > 0)
+
+
+class TestLibretextBookSearch(unittest.TestCase):
+    '''Tests for the libretext_book_search tool.'''
+
+    def setUp(self):
+        search_query = 'Introductory chemistry ck-12'
+        self.search_results = libretext_book_search(search_query)
+
+    def test_result_type(self):
+        '''Search results should be a dictionary.'''
+        self.assertIsInstance(self.search_results, dict)
+
+    def test_no_error(self):
+        '''Search results should not contain an error.'''
+        self.assertNotIn('error', self.search_results)
+
+    def test_result_content(self):
+        '''Each search result should contain title, url, and description if results found.'''
+        if len(self.search_results) > 0 and 'error' not in self.search_results:
+            for result in self.search_results.values():
+                self.assertIsInstance(result, dict)
+                self.assertIn('title', result)
+                self.assertIn('url', result)
+                self.assertIn('description', result)
+                self.assertIsInstance(result['title'], str)
+                self.assertIsInstance(result['url'], str)
+                self.assertIsInstance(result['description'], str)
+
+    def test_first_result_exists(self):
+        '''If results are found, the first result should have a meaningful title.'''
+        if len(self.search_results) > 0 and 'error' not in self.search_results:
+            first_result = next(iter(self.search_results.values()))
+            self.assertTrue(len(first_result['title']) > 3)
+
+    def test_result_urls_valid(self):
+        '''URLs should be properly formatted if present.'''
+        if len(self.search_results) > 0 and 'error' not in self.search_results:
+            for result in self.search_results.values():
+                if result['url']: # Only test non-empty URLs
+                    self.assertTrue(
+                        result['url'].startswith('http://') or
+                        result['url'].startswith('https://') or
+                        result['url'].startswith('/')
+                    )
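The new TestLibretextBookSearch class drives a real headless browser against the live LibreTexts site, so it is slower and more environment-dependent than the other tests. It can be run on its own with the standard unittest runner, for example: python -m unittest tests.test_tools.TestLibretextBookSearch -v from the repository root, or programmatically (assuming the tests package is importable):

    import unittest

    # Load and run only the Selenium-backed search tests
    suite = unittest.defaultTestLoader.loadTestsFromName(
        'tests.test_tools.TestLibretextBookSearch'
    )
    unittest.TextTestRunner(verbosity=2).run(suite)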