gperdrizet committed · Commit 3517095 · verified · Parent: a5e5840

Added chapter parser function.

Files changed (4):
  1. app.py +1 -2
  2. functions/agent.py +2 -6
  3. functions/tools.py +177 -15
  4. tests/test_tools.py +119 -2
app.py CHANGED
@@ -271,9 +271,8 @@ if __name__ == "__main__":
         logger.info(" Repo URL: https://huggingface.co/spaces/%s", space_id_startup)
         logger.info(
             " Repo Tree URL: https://huggingface.co/spaces/%s/tree/main",
-            space_id_startup
+            space_id_startup
         )
-
     else:
         logger.info(
             "ℹ️ SPACE_ID environment variable not found (running locally?). " \
functions/agent.py CHANGED
@@ -82,10 +82,7 @@ def step_memory_cap(memory_step: ActionStep, agent: CodeAgent) -> None:
     new_messages = [agent.memory.steps[-1].model_input_messages[0]]
     new_messages.append({
         'role': MessageRole.USER,
-        'content': [{
-            'type': 'text',
-            'text': f'Here is a summary of your investigation so far: {summary}'
-        }]
+        'content': [{'type': 'text', 'text': f'Here is a summary of your investigation so far: {summary}'}]
     })
     agent.memory.steps = [agent.memory.steps[0]]
     agent.memory.steps[0].model_input_messages = new_messages
@@ -110,8 +107,7 @@ def summarize_old_messages(messages: dict) -> dict:
     messages = [
         {
             'role': 'system',
-            'content': ('Summarize the following interaction between an AI agent and a user. ' +
-                        f'Return the summary formatted as text, not as JSON: {json.dumps(messages)}')
+            'content': f'Summarize the following interaction between an AI agent and a user. Return the summary formatted as text, not as JSON: {json.dumps(messages)}'
         }
     ]

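Both edits reflow the message payloads without changing their shape: the user message keeps its structured content list (a list of {'type': 'text', ...} parts) on one line, while the system message's content becomes a single plain string. A minimal sketch of the two resulting shapes, with placeholder values standing in for the real summary and message history:

    import json

    summary = 'visited three pages'                    # placeholder
    history = [{'role': 'user', 'content': 'hello'}]   # placeholder

    user_message = {
        'role': 'user',
        'content': [{'type': 'text', 'text': f'Here is a summary of your investigation so far: {summary}'}]
    }
    system_message = {
        'role': 'system',
        'content': 'Summarize the following interaction between an AI agent and a user. '
                   f'Return the summary formatted as text, not as JSON: {json.dumps(history)}'
    }
    print(user_message['role'], system_message['role'])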
functions/tools.py CHANGED
@@ -114,7 +114,7 @@ def get_wikipedia_page(query: str) -> str:
     content = content.split(
         '<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>'
     )[0]
-
+
     content = content.split(
         '<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>'
     )[0]
@@ -383,7 +383,7 @@ class WikipediaFetcher:


 @tool
-def libretext_book_parser(url: str) -> str:
+def libretext_book_parser(url: str) -> dict:
     """
     Parse the content of a LibreTexts book and return table of contents as JSON.

@@ -391,12 +391,88 @@ def libretext_book_parser(url: str) -> str:
         url (str): The URL of the LibreTexts book page.

     Returns:
-        dict: A dictionary containing the table of contents in JSON format.
+        dict: A dictionary containing the table of contents in the following format.
+        {0: {'title': str, 'url': str, 'description': str}, ...}
     """

-    logger.debug(url)
+    logger.info('Parsing LibreTexts book: %s', url)
+
+    # Set up headers to mimic a real browser
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' +
+            '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+    }
+
+    try:
+        # Fetch the book page
+        response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()
+
+        # Parse the HTML content
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Save the HTML for debugging if needed
+        with open('book_parser_debug.html', 'w', encoding='utf-8') as f:
+            f.write(soup.prettify())
+
+        # Look for the table of contents structure
+        # LibreTexts books typically use li elements with class 'mt-sortable-listing'
+        chapter_listings = soup.find_all('li', class_='mt-sortable-listing')
+
+        logger.info('Found %d potential chapter listings', len(chapter_listings))
+
+        parsed_chapters = {}
+        chapter_count = 0
+
+        for listing in chapter_listings:
+            # Extract the link element
+            link = listing.find('a', class_='mt-sortable-listing-link')
+
+            if link:
+                # Extract title from the span with class 'mt-sortable-listing-title'
+                title_span = link.find('span', class_='mt-sortable-listing-title')
+                title = title_span.get_text(strip=True) if title_span else ''
+
+                # Extract URL from href attribute
+                chapter_url = link.get('href', '')
+
+                # Extract description from the title attribute of the link
+                description = link.get('title', '')
+
+                # Clean up description - remove the title prefix if it appears
+                if description and title and description.startswith(title):
+                    description = description[len(title):].strip()
+                    if description.startswith(':'):
+                        description = description[1:].strip()
+
+                # Only add meaningful chapters (skip empty titles or very short ones)
+                if title and len(title) > 2:
+                    parsed_chapters[chapter_count] = {
+                        'title': title,
+                        'url': chapter_url,
+                        'description': description
+                    }
+
+                    logger.debug('Extracted chapter %d: title="%s", url="%s"',
+                                 chapter_count, title, chapter_url)
+                    chapter_count += 1
+
+        logger.info('Successfully extracted %d chapters from book', len(parsed_chapters))
+        print(parsed_chapters)
+        return parsed_chapters

-    return "LibreTexts book parser is not yet implemented."
+    except requests.exceptions.RequestException as e:
+        logger.error('Request error while fetching book page: %s', str(e))
+        return {'error': f'Request error: {str(e)}'}
+
+    except Exception as e: # pylint:disable=broad-exception-caught
+        logger.error('Unexpected error in book parser: %s', str(e))
+        return {'error': f'Unexpected error: {str(e)}'}

 @tool
 def libretext_book_search(query: str) -> dict:
@@ -418,10 +494,8 @@ def libretext_book_search(query: str) -> dict:
     chrome_options.add_argument("--disable-dev-shm-usage")
     chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
-    chrome_options.add_argument(
-        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
-        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-    )
+    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
+        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

     driver = None
     try:
@@ -464,10 +538,6 @@ def libretext_book_search(query: str) -> dict:
         page_source = driver.page_source
         soup = BeautifulSoup(page_source, 'html.parser')

-        # Save the rendered HTML for debugging
-        with open('selenium_test.html', 'w', encoding='utf-8') as f:
-            f.write(soup.prettify())
-
         # Look for search results using multiple possible selectors
         search_info_divs = soup.find_all('div', class_='mt-search-information')

@@ -542,7 +612,7 @@ def libretext_book_search(query: str) -> dict:
         logger.error('WebDriver error: %s', str(e))
         return {'error': f'WebDriver error: {str(e)}'}

-    except Exception as e: # pylint: disable=broad-exception-caught
+    except Exception as e: # pylint:disable=broad-exception-caught
         logger.error('Unexpected error in Selenium search: %s', str(e))
         return {'error': f'Unexpected error: {str(e)}'}

@@ -551,5 +621,97 @@ def libretext_book_search(query: str) -> dict:
         if driver:
             try:
                 driver.quit()
-            except Exception as e: # pylint: disable=broad-exception-caught
+            except Exception as e: # pylint:disable=broad-exception-caught
                 logger.warning('Error closing driver: %s', str(e))
+
+@tool
+def libretext_chapter_parser(url: str) -> dict:
+    """
+    Parse the content of a LibreTexts chapter and return section headings as JSON.
+
+    Args:
+        url (str): The URL of the LibreTexts chapter page.
+
+    Returns:
+        dict: A dictionary containing the section headings in the following format.
+        {0: {'title': str, 'url': str, 'description': str}, ...}
+    """
+
+    logger.info('Parsing LibreTexts chapter: %s', url)
+
+    # Set up headers to mimic a real browser
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' +
+            '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+    }
+
+    try:
+        # Fetch the chapter page
+        response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()
+
+        # Parse the HTML content
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Look for the section structure
+        # LibreTexts chapters typically use li elements with class 'mt-list-topics'
+        section_listings = soup.find_all('li', class_='mt-list-topics')
+
+        logger.info('Found %d potential section listings', len(section_listings))
+
+        parsed_sections = {}
+        section_count = 0
+
+        for listing in section_listings:
+            # Look for the detailed listing structure
+            dl_element = listing.find('dl', class_='mt-listing-detailed')
+
+            if dl_element:
+                # Extract title and URL from the dt element
+                dt_element = dl_element.find('dt', class_='mt-listing-detailed-title')
+                dd_element = dl_element.find('dd', class_='mt-listing-detailed-overview')
+
+                if dt_element:
+                    # Find the anchor tag within the dt element
+                    link = dt_element.find('a')
+
+                    if link:
+                        # Extract title from the link text
+                        title = link.get_text(strip=True)
+
+                        # Extract URL from href attribute
+                        section_url = link.get('href', '')
+
+                        # Extract description from the dd element
+                        description = ''
+                        if dd_element:
+                            description = dd_element.get_text(strip=True)
+
+                        # Only add meaningful sections (skip empty titles or very short ones)
+                        if title and len(title) > 2:
+                            parsed_sections[section_count] = {
+                                'title': title,
+                                'url': section_url,
+                                'description': description
+                            }
+
+                            logger.debug('Extracted section %d: title="%s", url="%s"',
+                                         section_count, title, section_url)
+                            section_count += 1
+
+        logger.info('Successfully extracted %d sections from chapter', len(parsed_sections))
+        print(parsed_sections)
+        return parsed_sections
+
+    except requests.exceptions.RequestException as e:
+        logger.error('Request error while fetching chapter page: %s', str(e))
+        return {'error': f'Request error: {str(e)}'}
+
+    except Exception as e: # pylint:disable=broad-exception-caught
+        logger.error('Unexpected error in chapter parser: %s', str(e))
+        return {'error': f'Unexpected error: {str(e)}'}
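The two parsers are designed to be chained: libretext_book_parser returns a chapter-level table of contents, and each chapter's URL can then be handed to the new libretext_chapter_parser for its section headings. A rough usage sketch (the book URL is the one used in the tests; relative hrefs may need resolving against the site root first):

    chapters = libretext_book_parser(
        'https://chem.libretexts.org/Bookshelves/Introductory_Chemistry/Introductory_Chemistry_(CK-12)'
    )

    if 'error' not in chapters:
        for index, chapter in chapters.items():
            print(index, chapter['title'])
            sections = libretext_chapter_parser(chapter['url'])
            if 'error' not in sections:
                for section in sections.values():
                    print('  -', section['title'])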
tests/test_tools.py CHANGED
@@ -5,7 +5,9 @@ from functions.tools import (
     google_search,
     wikipedia_search,
     get_wikipedia_page,
-    libretext_book_search
+    libretext_book_search,
+    libretext_book_parser,
+    libretext_chapter_parser
 )


@@ -137,7 +139,122 @@ class TestLibretextBookSearch(unittest.TestCase):
         for result in self.search_results.values():
             if result['url']:  # Only test non-empty URLs
                 self.assertTrue(
-                    result['url'].startswith('http://') or
+                    result['url'].startswith('http://') or
                     result['url'].startswith('https://') or
                     result['url'].startswith('/')
                 )
+
+
+class TestLibretextBookParser(unittest.TestCase):
+    '''Tests for the libretext_book_parser tool.'''
+
+    def setUp(self):
+        # Use a known LibreTexts book URL for testing
+        book_url = 'https://chem.libretexts.org/Bookshelves/Introductory_Chemistry/Introductory_Chemistry_(CK-12)'
+        self.parse_results = libretext_book_parser(book_url)
+
+    def test_result_type(self):
+        '''Parse results should be a dictionary.'''
+        self.assertIsInstance(self.parse_results, dict)
+
+    def test_no_error(self):
+        '''Parse results should not contain an error.'''
+        self.assertNotIn('error', self.parse_results)
+
+    def test_result_content(self):
+        '''Each chapter should contain title, url, and description if chapters found.'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for chapter in self.parse_results.values():
+                self.assertIsInstance(chapter, dict)
+                self.assertIn('title', chapter)
+                self.assertIn('url', chapter)
+                self.assertIn('description', chapter)
+                self.assertIsInstance(chapter['title'], str)
+                self.assertIsInstance(chapter['url'], str)
+                self.assertIsInstance(chapter['description'], str)
+
+    def test_chapters_found(self):
+        '''Should find multiple chapters in a typical LibreTexts book.'''
+        if 'error' not in self.parse_results:
+            self.assertGreater(len(self.parse_results), 5)  # Expect at least several chapters
+
+    def test_chapter_titles_meaningful(self):
+        '''Chapter titles should be meaningful (not empty or too short).'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for chapter in self.parse_results.values():
+                self.assertTrue(len(chapter['title']) > 2)
+
+    def test_chapter_urls_valid(self):
+        '''Chapter URLs should be properly formatted.'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for chapter in self.parse_results.values():
+                if chapter['url']:  # Only test non-empty URLs
+                    self.assertTrue(
+                        chapter['url'].startswith('http://') or
+                        chapter['url'].startswith('https://') or
+                        chapter['url'].startswith('/')
+                    )
+
+
+class TestLibretextChapterParser(unittest.TestCase):
+    '''Tests for the libretext_chapter_parser tool.'''
+
+    def setUp(self):
+        # Use a known LibreTexts chapter URL for testing
+        chapter_url = 'https://chem.libretexts.org/Bookshelves/Introductory_Chemistry/Introductory_Chemistry_(CK-12)/01%3A_Introduction_to_Chemistry'
+        self.parse_results = libretext_chapter_parser(chapter_url)
+
+    def test_result_type(self):
+        '''Parse results should be a dictionary.'''
+        self.assertIsInstance(self.parse_results, dict)
+
+    def test_no_error(self):
+        '''Parse results should not contain an error.'''
+        self.assertNotIn('error', self.parse_results)
+
+    def test_result_content(self):
+        '''Each section should contain title, url, and description if sections found.'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for section in self.parse_results.values():
+                self.assertIsInstance(section, dict)
+                self.assertIn('title', section)
+                self.assertIn('url', section)
+                self.assertIn('description', section)
+                self.assertIsInstance(section['title'], str)
+                self.assertIsInstance(section['url'], str)
+                self.assertIsInstance(section['description'], str)
+
+    def test_sections_found(self):
+        '''Should find multiple sections in a typical LibreTexts chapter.'''
+        if 'error' not in self.parse_results:
+            self.assertGreater(len(self.parse_results), 2)  # Expect at least a few sections
+
+    def test_section_titles_meaningful(self):
+        '''Section titles should be meaningful (not empty or too short).'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for section in self.parse_results.values():
+                self.assertTrue(len(section['title']) > 2)
+
+    def test_section_urls_valid(self):
+        '''Section URLs should be properly formatted.'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for section in self.parse_results.values():
+                if section['url']:  # Only test non-empty URLs
+                    self.assertTrue(
+                        section['url'].startswith('http://') or
+                        section['url'].startswith('https://') or
+                        section['url'].startswith('/')
+                    )
+
+    def test_sections_have_descriptions(self):
+        '''Most sections should have meaningful descriptions.'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            sections_with_descriptions = sum(
+                1 for section in self.parse_results.values()
+                if section['description'] and len(section['description']) > 10
+            )
+            # At least half the sections should have descriptions
+            self.assertGreater(sections_with_descriptions, len(self.parse_results) // 2)
+
+if __name__ == '__main__':
+    unittest.main()
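Note that the new test classes hit the live chem.libretexts.org site in setUp, so they need network access and can be slow. A quick way to exercise just one of them (assuming the repository root as the working directory):

    import unittest

    # Loads and runs only the book-parser tests; this performs real HTTP requests.
    suite = unittest.defaultTestLoader.loadTestsFromName(
        'tests.test_tools.TestLibretextBookParser'
    )
    unittest.TextTestRunner(verbosity=2).run(suite)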