gperdrizet committed · Commit 3517095 · verified · Parent: a5e5840

Added chapter parser function.

Files changed (4):
  1. app.py +1 -2
  2. functions/agent.py +2 -6
  3. functions/tools.py +177 -15
  4. tests/test_tools.py +119 -2
app.py CHANGED
@@ -271,9 +271,8 @@ if __name__ == "__main__":
         logger.info(" Repo URL: https://huggingface.co/spaces/%s", space_id_startup)
         logger.info(
             " Repo Tree URL: https://huggingface.co/spaces/%s/tree/main",
-            space_id_startup
+            space_id_startup
         )
-
     else:
         logger.info(
             "ℹ️ SPACE_ID environment variable not found (running locally?). " \
functions/agent.py CHANGED
@@ -82,10 +82,7 @@ def step_memory_cap(memory_step: ActionStep, agent: CodeAgent) -> None:
     new_messages = [agent.memory.steps[-1].model_input_messages[0]]
     new_messages.append({
         'role': MessageRole.USER,
-        'content': [{
-            'type': 'text',
-            'text': f'Here is a summary of your investigation so far: {summary}'
-        }]
+        'content': [{'type': 'text', 'text': f'Here is a summary of your investigation so far: {summary}'}]
     })
     agent.memory.steps = [agent.memory.steps[0]]
     agent.memory.steps[0].model_input_messages = new_messages
@@ -110,8 +107,7 @@ def summarize_old_messages(messages: dict) -> dict:
     messages = [
         {
             'role': 'system',
-            'content': ('Summarize the following interaction between an AI agent and a user. ' +
-                        f'Return the summary formatted as text, not as JSON: {json.dumps(messages)}')
+            'content': f'Summarize the following interaction between an AI agent and a user. Return the summary formatted as text, not as JSON: {json.dumps(messages)}'
         }
     ]

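Both edits reflow the message payloads without changing their shape: the user message keeps its structured content list (a list of {'type': 'text', ...} parts) on one line, while the system message's content becomes a single plain string. A minimal sketch of the two resulting shapes, with placeholder values standing in for the real summary and message history:

    import json

    summary = 'visited three pages'                    # placeholder
    history = [{'role': 'user', 'content': 'hello'}]   # placeholder

    user_message = {
        'role': 'user',
        'content': [{'type': 'text', 'text': f'Here is a summary of your investigation so far: {summary}'}]
    }
    system_message = {
        'role': 'system',
        'content': 'Summarize the following interaction between an AI agent and a user. '
                   f'Return the summary formatted as text, not as JSON: {json.dumps(history)}'
    }
    print(user_message['role'], system_message['role'])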
functions/tools.py CHANGED
@@ -114,7 +114,7 @@ def get_wikipedia_page(query: str) -> str:
     content = content.split(
         '<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>'
     )[0]
-
+
     content = content.split(
         '<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>'
     )[0]
@@ -383,7 +383,7 @@ class WikipediaFetcher:


 @tool
-def libretext_book_parser(url: str) -> str:
+def libretext_book_parser(url: str) -> dict:
     """
     Parse the content of a LibreTexts book and return table of contents as JSON.

@@ -391,12 +391,88 @@ def libretext_book_parser(url: str) -> str:
         url (str): The URL of the LibreTexts book page.

     Returns:
-        dict: A dictionary containing the table of contents in JSON format.
+        dict: A dictionary containing the table of contents in the following format.
+        {0: {'title': str, 'url': str, 'description': str}, ...}
     """

-    logger.debug(url)
+    logger.info('Parsing LibreTexts book: %s', url)
+
+    # Set up headers to mimic a real browser
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' +
+            '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+    }
+
+    try:
+        # Fetch the book page
+        response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()
+
+        # Parse the HTML content
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Save the HTML for debugging if needed
+        with open('book_parser_debug.html', 'w', encoding='utf-8') as f:
+            f.write(soup.prettify())
+
+        # Look for the table of contents structure
+        # LibreTexts books typically use li elements with class 'mt-sortable-listing'
+        chapter_listings = soup.find_all('li', class_='mt-sortable-listing')
+
+        logger.info('Found %d potential chapter listings', len(chapter_listings))
+
+        parsed_chapters = {}
+        chapter_count = 0
+
+        for listing in chapter_listings:
+            # Extract the link element
+            link = listing.find('a', class_='mt-sortable-listing-link')
+
+            if link:
+                # Extract title from the span with class 'mt-sortable-listing-title'
+                title_span = link.find('span', class_='mt-sortable-listing-title')
+                title = title_span.get_text(strip=True) if title_span else ''
+
+                # Extract URL from href attribute
+                chapter_url = link.get('href', '')
+
+                # Extract description from the title attribute of the link
+                description = link.get('title', '')
+
+                # Clean up description - remove the title prefix if it appears
+                if description and title and description.startswith(title):
+                    description = description[len(title):].strip()
+                    if description.startswith(':'):
+                        description = description[1:].strip()
+
+                # Only add meaningful chapters (skip empty titles or very short ones)
+                if title and len(title) > 2:
+                    parsed_chapters[chapter_count] = {
+                        'title': title,
+                        'url': chapter_url,
+                        'description': description
+                    }
+
+                    logger.debug('Extracted chapter %d: title="%s", url="%s"',
+                                 chapter_count, title, chapter_url)
+                    chapter_count += 1
+
+        logger.info('Successfully extracted %d chapters from book', len(parsed_chapters))
+        print(parsed_chapters)
+        return parsed_chapters

-    return "LibreTexts book parser is not yet implemented."
+    except requests.exceptions.RequestException as e:
+        logger.error('Request error while fetching book page: %s', str(e))
+        return {'error': f'Request error: {str(e)}'}
+
+    except Exception as e: # pylint:disable=broad-exception-caught
+        logger.error('Unexpected error in book parser: %s', str(e))
+        return {'error': f'Unexpected error: {str(e)}'}

 @tool
 def libretext_book_search(query: str) -> dict:
@@ -418,10 +494,8 @@ def libretext_book_search(query: str) -> dict:
     chrome_options.add_argument("--disable-dev-shm-usage")
     chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
-    chrome_options.add_argument(
-        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
-        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
-    )
+    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
+        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

     driver = None
     try:
@@ -464,10 +538,6 @@ def libretext_book_search(query: str) -> dict:
         page_source = driver.page_source
         soup = BeautifulSoup(page_source, 'html.parser')

-        # Save the rendered HTML for debugging
-        with open('selenium_test.html', 'w', encoding='utf-8') as f:
-            f.write(soup.prettify())
-
         # Look for search results using multiple possible selectors
         search_info_divs = soup.find_all('div', class_='mt-search-information')

@@ -542,7 +612,7 @@ def libretext_book_search(query: str) -> dict:
         logger.error('WebDriver error: %s', str(e))
         return {'error': f'WebDriver error: {str(e)}'}

-    except Exception as e: # pylint: disable=broad-exception-caught
+    except Exception as e: # pylint:disable=broad-exception-caught
         logger.error('Unexpected error in Selenium search: %s', str(e))
         return {'error': f'Unexpected error: {str(e)}'}

@@ -551,5 +621,97 @@ def libretext_book_search(query: str) -> dict:
         if driver:
             try:
                 driver.quit()
-            except Exception as e: # pylint: disable=broad-exception-caught
+            except Exception as e: # pylint:disable=broad-exception-caught
                 logger.warning('Error closing driver: %s', str(e))
+
+@tool
+def libretext_chapter_parser(url: str) -> dict:
+    """
+    Parse the content of a LibreTexts chapter and return section headings as JSON.
+
+    Args:
+        url (str): The URL of the LibreTexts chapter page.
+
+    Returns:
+        dict: A dictionary containing the section headings in the following format.
+        {0: {'title': str, 'url': str, 'description': str}, ...}
+    """
+
+    logger.info('Parsing LibreTexts chapter: %s', url)
+
+    # Set up headers to mimic a real browser
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' +
+            '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'en-US,en;q=0.5',
+        'Accept-Encoding': 'gzip, deflate',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+    }
+
+    try:
+        # Fetch the chapter page
+        response = requests.get(url, headers=headers, timeout=15)
+        response.raise_for_status()
+
+        # Parse the HTML content
+        soup = BeautifulSoup(response.content, 'html.parser')
+
+        # Look for the section structure
+        # LibreTexts chapters typically use li elements with class 'mt-list-topics'
+        section_listings = soup.find_all('li', class_='mt-list-topics')
+
+        logger.info('Found %d potential section listings', len(section_listings))
+
+        parsed_sections = {}
+        section_count = 0
+
+        for listing in section_listings:
+            # Look for the detailed listing structure
+            dl_element = listing.find('dl', class_='mt-listing-detailed')
+
+            if dl_element:
+                # Extract title and URL from the dt element
+                dt_element = dl_element.find('dt', class_='mt-listing-detailed-title')
+                dd_element = dl_element.find('dd', class_='mt-listing-detailed-overview')
+
+                if dt_element:
+                    # Find the anchor tag within the dt element
+                    link = dt_element.find('a')
+
+                    if link:
+                        # Extract title from the link text
+                        title = link.get_text(strip=True)
+
+                        # Extract URL from href attribute
+                        section_url = link.get('href', '')
+
+                        # Extract description from the dd element
+                        description = ''
+                        if dd_element:
+                            description = dd_element.get_text(strip=True)
+
+                        # Only add meaningful sections (skip empty titles or very short ones)
+                        if title and len(title) > 2:
+                            parsed_sections[section_count] = {
+                                'title': title,
+                                'url': section_url,
+                                'description': description
+                            }
+
+                            logger.debug('Extracted section %d: title="%s", url="%s"',
+                                         section_count, title, section_url)
+                            section_count += 1
+
+        logger.info('Successfully extracted %d sections from chapter', len(parsed_sections))
+        print(parsed_sections)
+        return parsed_sections
+
+    except requests.exceptions.RequestException as e:
+        logger.error('Request error while fetching chapter page: %s', str(e))
+        return {'error': f'Request error: {str(e)}'}
+
+    except Exception as e: # pylint:disable=broad-exception-caught
+        logger.error('Unexpected error in chapter parser: %s', str(e))
+        return {'error': f'Unexpected error: {str(e)}'}
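The two parsers are designed to be chained: libretext_book_parser returns a chapter-level table of contents, and each chapter's URL can then be handed to the new libretext_chapter_parser for its section headings. A rough usage sketch (the book URL is the one used in the tests; relative hrefs may need resolving against the site root first):

    chapters = libretext_book_parser(
        'https://chem.libretexts.org/Bookshelves/Introductory_Chemistry/Introductory_Chemistry_(CK-12)'
    )

    if 'error' not in chapters:
        for index, chapter in chapters.items():
            print(index, chapter['title'])
            sections = libretext_chapter_parser(chapter['url'])
            if 'error' not in sections:
                for section in sections.values():
                    print('  -', section['title'])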
tests/test_tools.py CHANGED
@@ -5,7 +5,9 @@ from functions.tools import (
     google_search,
     wikipedia_search,
     get_wikipedia_page,
-    libretext_book_search
+    libretext_book_search,
+    libretext_book_parser,
+    libretext_chapter_parser
 )


@@ -137,7 +139,122 @@ class TestLibretextBookSearch(unittest.TestCase):
         for result in self.search_results.values():
             if result['url']:  # Only test non-empty URLs
                 self.assertTrue(
-                    result['url'].startswith('http://') or
+                    result['url'].startswith('http://') or
                     result['url'].startswith('https://') or
                     result['url'].startswith('/')
                 )
+
+
+class TestLibretextBookParser(unittest.TestCase):
+    '''Tests for the libretext_book_parser tool.'''
+
+    def setUp(self):
+        # Use a known LibreTexts book URL for testing
+        book_url = 'https://chem.libretexts.org/Bookshelves/Introductory_Chemistry/Introductory_Chemistry_(CK-12)'
+        self.parse_results = libretext_book_parser(book_url)
+
+    def test_result_type(self):
+        '''Parse results should be a dictionary.'''
+        self.assertIsInstance(self.parse_results, dict)
+
+    def test_no_error(self):
+        '''Parse results should not contain an error.'''
+        self.assertNotIn('error', self.parse_results)
+
+    def test_result_content(self):
+        '''Each chapter should contain title, url, and description if chapters found.'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for chapter in self.parse_results.values():
+                self.assertIsInstance(chapter, dict)
+                self.assertIn('title', chapter)
+                self.assertIn('url', chapter)
+                self.assertIn('description', chapter)
+                self.assertIsInstance(chapter['title'], str)
+                self.assertIsInstance(chapter['url'], str)
+                self.assertIsInstance(chapter['description'], str)
+
+    def test_chapters_found(self):
+        '''Should find multiple chapters in a typical LibreTexts book.'''
+        if 'error' not in self.parse_results:
+            self.assertGreater(len(self.parse_results), 5)  # Expect at least several chapters
+
+    def test_chapter_titles_meaningful(self):
+        '''Chapter titles should be meaningful (not empty or too short).'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for chapter in self.parse_results.values():
+                self.assertTrue(len(chapter['title']) > 2)
+
+    def test_chapter_urls_valid(self):
+        '''Chapter URLs should be properly formatted.'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for chapter in self.parse_results.values():
+                if chapter['url']:  # Only test non-empty URLs
+                    self.assertTrue(
+                        chapter['url'].startswith('http://') or
+                        chapter['url'].startswith('https://') or
+                        chapter['url'].startswith('/')
+                    )
+
+
+class TestLibretextChapterParser(unittest.TestCase):
+    '''Tests for the libretext_chapter_parser tool.'''
+
+    def setUp(self):
+        # Use a known LibreTexts chapter URL for testing
+        chapter_url = 'https://chem.libretexts.org/Bookshelves/Introductory_Chemistry/Introductory_Chemistry_(CK-12)/01%3A_Introduction_to_Chemistry'
+        self.parse_results = libretext_chapter_parser(chapter_url)
+
+    def test_result_type(self):
+        '''Parse results should be a dictionary.'''
+        self.assertIsInstance(self.parse_results, dict)
+
+    def test_no_error(self):
+        '''Parse results should not contain an error.'''
+        self.assertNotIn('error', self.parse_results)
+
+    def test_result_content(self):
+        '''Each section should contain title, url, and description if sections found.'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for section in self.parse_results.values():
+                self.assertIsInstance(section, dict)
+                self.assertIn('title', section)
+                self.assertIn('url', section)
+                self.assertIn('description', section)
+                self.assertIsInstance(section['title'], str)
+                self.assertIsInstance(section['url'], str)
+                self.assertIsInstance(section['description'], str)
+
+    def test_sections_found(self):
+        '''Should find multiple sections in a typical LibreTexts chapter.'''
+        if 'error' not in self.parse_results:
+            self.assertGreater(len(self.parse_results), 2)  # Expect at least a few sections
+
+    def test_section_titles_meaningful(self):
+        '''Section titles should be meaningful (not empty or too short).'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for section in self.parse_results.values():
+                self.assertTrue(len(section['title']) > 2)
+
+    def test_section_urls_valid(self):
+        '''Section URLs should be properly formatted.'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            for section in self.parse_results.values():
+                if section['url']:  # Only test non-empty URLs
+                    self.assertTrue(
+                        section['url'].startswith('http://') or
+                        section['url'].startswith('https://') or
+                        section['url'].startswith('/')
+                    )
+
+    def test_sections_have_descriptions(self):
+        '''Most sections should have meaningful descriptions.'''
+        if len(self.parse_results) > 0 and 'error' not in self.parse_results:
+            sections_with_descriptions = sum(
+                1 for section in self.parse_results.values()
+                if section['description'] and len(section['description']) > 10
+            )
+            # At least half the sections should have descriptions
+            self.assertGreater(sections_with_descriptions, len(self.parse_results) // 2)
+
+if __name__ == '__main__':
+    unittest.main()
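Note that the new test classes hit the live chem.libretexts.org site in setUp, so they need network access and can be slow. A quick way to exercise just one of them (assuming the repository root as the working directory):

    import unittest

    # Loads and runs only the book-parser tests; this performs real HTTP requests.
    suite = unittest.defaultTestLoader.loadTestsFromName(
        'tests.test_tools.TestLibretextBookParser'
    )
    unittest.TextTestRunner(verbosity=2).run(suite)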