gperdrizet committed
Commit 75bb385 · verified · 1 Parent(s): 3517095

Added a function to save chemistry textbook contents as markdown and refactored the tool helper functions.

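For context, a minimal usage sketch of the refactored pieces (module paths as introduced in this commit; the book URL is the one used by the test suite below, and direct calls like this are what the tests do):

# Sketch only: exercises the refactored tool end to end.
from functions.tools import get_libretext_book

book_url = 'https://chem.libretexts.org/Bookshelves/Introductory_Chemistry/Introductory_Chemistry_(CK-12)'

# Parses the table of contents, walks each chapter's sections, writes a markdown
# copy via save_libretext_book_as_markdown(), and returns the nested book dict.
book = get_libretext_book(book_url)
print(book['title'], len(book['chapters']))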
functions/tool_helper_functions.py ADDED
@@ -0,0 +1,285 @@
+ '''Helper functions for GAIA question answering agent tools.'''
+
+ import time
+ import logging
+ import requests
+ from bs4 import BeautifulSoup
+
+ # Get logger for this module
+ logger = logging.getLogger(__name__)
+
+
+ def libretext_book_parser(url: str) -> dict:
+     """
+     Parse the content of a LibreTexts book and return table of contents as JSON.
+
+     Args:
+         url (str): The URL of the LibreTexts book page.
+
+     Returns:
+         dict: A dictionary containing the table of contents in the following format.
+         {0: {'title': str, 'url': str, 'description': str}, ...}
+     """
+
+     logger.info('Parsing LibreTexts book: %s', url)
+
+     # Set up headers to mimic a real browser
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' +
+             '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+         'Accept-Language': 'en-US,en;q=0.5',
+         'Accept-Encoding': 'gzip, deflate',
+         'Connection': 'keep-alive',
+         'Upgrade-Insecure-Requests': '1',
+     }
+
+     try:
+         # Fetch the book page
+         response = requests.get(url, headers=headers, timeout=15)
+         response.raise_for_status()
+
+         # Parse the HTML content
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Look for the table of contents structure
+         # LibreTexts books typically use li elements with class 'mt-sortable-listing'
+         chapter_listings = soup.find_all('li', class_='mt-sortable-listing')
+
+         logger.info('Found %d potential chapter listings', len(chapter_listings))
+
+         parsed_chapters = {}
+         chapter_count = 0
+
+         for listing in chapter_listings:
+             # Extract the link element
+             link = listing.find('a', class_='mt-sortable-listing-link')
+
+             if link:
+                 # Extract title from the span with class 'mt-sortable-listing-title'
+                 title_span = link.find('span', class_='mt-sortable-listing-title')
+                 title = title_span.get_text(strip=True) if title_span else ''
+
+                 # Extract URL from href attribute
+                 chapter_url = link.get('href', '')
+
+                 # Extract description from the title attribute of the link
+                 description = link.get('title', '')
+
+                 # Clean up description - remove the title prefix if it appears
+                 if description and title and description.startswith(title):
+                     description = description[len(title):].strip()
+                     if description.startswith(':'):
+                         description = description[1:].strip()
+
+                 # Only add meaningful chapters (skip empty titles or very short ones)
+                 if title and len(title) > 2:
+                     parsed_chapters[chapter_count] = {
+                         'title': title,
+                         'url': chapter_url,
+                         'description': description
+                     }
+
+                     logger.debug('Extracted chapter %d: title="%s", url="%s"',
+                                  chapter_count, title, chapter_url)
+                     chapter_count += 1
+
+         logger.info('Successfully extracted %d chapters from book', len(parsed_chapters))
+         return parsed_chapters
+
+     except requests.exceptions.RequestException as e:
+         logger.error('Request error while fetching book page: %s', str(e))
+         return {'error': f'Request error: {str(e)}'}
+
+     except Exception as e: # pylint:disable=broad-exception-caught
+         logger.error('Unexpected error in book parser: %s', str(e))
+         return {'error': f'Unexpected error: {str(e)}'}
+
+
+ def libretext_chapter_parser(url: str) -> dict:
+     """
+     Parse the content of a LibreTexts chapter and return section headings as JSON.
+
+     Args:
+         url (str): The URL of the LibreTexts chapter page.
+
+     Returns:
+         dict: A dictionary containing the section headings in the following format.
+         {0: {'title': str, 'url': str, 'description': str}, ...}
+     """
+
+     logger.info('Parsing LibreTexts chapter: %s', url)
+
+     # Set up headers to mimic a real browser
+     headers = {
+         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' +
+             '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+         'Accept-Language': 'en-US,en;q=0.5',
+         'Accept-Encoding': 'gzip, deflate',
+         'Connection': 'keep-alive',
+         'Upgrade-Insecure-Requests': '1',
+     }
+
+     try:
+         # Fetch the chapter page
+         response = requests.get(url, headers=headers, timeout=15)
+         response.raise_for_status()
+
+         # Parse the HTML content
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Look for the section structure
+         # LibreTexts chapters typically use li elements with class 'mt-list-topics'
+         section_listings = soup.find_all('li', class_='mt-list-topics')
+
+         logger.info('Found %d potential section listings', len(section_listings))
+
+         parsed_sections = {}
+         section_count = 0
+
+         for listing in section_listings:
+             # Look for the detailed listing structure
+             dl_element = listing.find('dl', class_='mt-listing-detailed')
+
+             if dl_element:
+                 # Extract title and URL from the dt element
+                 dt_element = dl_element.find('dt', class_='mt-listing-detailed-title')
+                 dd_element = dl_element.find('dd', class_='mt-listing-detailed-overview')
+
+                 if dt_element:
+                     # Find the anchor tag within the dt element
+                     link = dt_element.find('a')
+
+                     if link:
+                         # Extract title from the link text
+                         title = link.get_text(strip=True)
+
+                         # Extract URL from href attribute
+                         section_url = link.get('href', '')
+
+                         # Extract description from the dd element
+                         description = ''
+                         if dd_element:
+                             description = dd_element.get_text(strip=True)
+
+                         # Only add meaningful sections (skip empty titles or very short ones)
+                         if title and len(title) > 2:
+                             parsed_sections[section_count] = {
+                                 'title': title,
+                                 'url': section_url,
+                                 'description': description
+                             }
+
+                             logger.debug('Extracted section %d: title="%s", url="%s"',
+                                          section_count, title, section_url)
+                             section_count += 1
+
+         logger.info('Successfully extracted %d sections from chapter', len(parsed_sections))
+         return parsed_sections
+
+     except requests.exceptions.RequestException as e:
+         logger.error('Request error while fetching chapter page: %s', str(e))
+         return {'error': f'Request error: {str(e)}'}
+
+     except Exception as e: # pylint:disable=broad-exception-caught
+         logger.error('Unexpected error in chapter parser: %s', str(e))
+         return {'error': f'Unexpected error: {str(e)}'}
+
+
+ def save_libretext_book_as_markdown(book_data: dict, filename: str = None, source_url: str = None) -> str:
+     """
+     Save a complete LibreTexts book dictionary as a markdown formatted file.
+
+     Args:
+         book_data (dict): The complete book data dictionary from get_libretext_book().
+         filename (str, optional): The filename to save the markdown. If not provided,
+             will generate based on book title.
+         source_url (str, optional): The original URL of the book for reference in the markdown.
+
+     Returns:
+         str: A message indicating success or failure with the filename used.
+     """
+
+     logger.info('Saving LibreTexts book as markdown')
+
+     if 'error' in book_data:
+         error_msg = f"Cannot save book with error: {book_data['error']}"
+         logger.error(error_msg)
+         return error_msg
+
+     # Generate filename if not provided
+     if filename is None:
+         book_title = book_data.get('title', 'LibreTexts_Book')
+         # Clean up the title for use as filename
+         safe_title = "".join(c for c in book_title if c.isalnum() or c in (' ', '-', '_')).rstrip()
+         safe_title = safe_title.replace(' ', '_')
+         filename = f"{safe_title}.md"
+
+     # Ensure .md extension
+     if not filename.endswith('.md'):
+         filename += '.md'
+
+     try:
+         # Format the book data as markdown
+         markdown_content = []
+
+         # Book title
+         book_title = book_data.get('title', 'LibreTexts Book')
+         markdown_content.append(f"# {book_title}\n")
+         if source_url:
+             markdown_content.append(f"*Extracted from: {source_url}*\n")
+         markdown_content.append(f"*Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n\n")
+
+         # Table of contents
+         chapters = book_data.get('chapters', {})
+         if chapters:
+             markdown_content.append("## Table of Contents\n")
+             for chapter_title in chapters.keys():
+                 # Create anchor link for the chapter
+                 anchor = chapter_title.lower().replace(' ', '-').replace(':', '').replace('(', '').replace(')', '')
+                 markdown_content.append(f"- [{chapter_title}](#{anchor})\n")
+             markdown_content.append("\n---\n\n")
+
+         # Chapter content
+         for chapter_title, chapter_data in chapters.items():
+             # Chapter heading
+             markdown_content.append(f"## {chapter_title}\n\n")
+
+             sections = chapter_data.get('sections', {})
+
+             if not sections:
+                 markdown_content.append("*No sections found for this chapter.*\n\n")
+                 continue
+
+             # Section content
+             for section_title, section_data in sections.items():
+                 # Section heading
+                 markdown_content.append(f"### {section_title}\n\n")
+
+                 # Section URL
+                 section_url = section_data.get('Section url', '')
+                 if section_url:
+                     markdown_content.append(f"**URL:** [{section_url}]({section_url})\n\n")
+
+                 # Section summary
+                 section_summary = section_data.get('Section summary', '')
+                 if section_summary:
+                     markdown_content.append(f"{section_summary}\n\n")
+                 else:
+                     markdown_content.append("*No summary available.*\n\n")
+
+             markdown_content.append("---\n\n")
+
+         # Write to file
+         with open(filename, 'w', encoding='utf-8') as f:
+             f.writelines(markdown_content)
+
+         success_msg = f"Successfully saved LibreTexts book as markdown file: {filename}"
+         logger.info(success_msg)
+         return success_msg
+
+     except Exception as e: # pylint:disable=broad-exception-caught
+         error_msg = f"Error saving markdown file: {str(e)}"
+         logger.error(error_msg)
+         return error_msg
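
For reference, a minimal sketch of calling the new markdown writer on its own; the only requirement is the nested dict shape documented in the get_libretext_book docstring below (the book, chapter, and section values here are placeholders, not real LibreTexts pages):

# Sketch only: placeholder data in the structure save_libretext_book_as_markdown() expects.
from functions.tool_helper_functions import save_libretext_book_as_markdown

sample_book = {
    'title': 'Introductory Chemistry',
    'chapters': {
        '1: Introduction': {
            'sections': {
                '1.1: What is Chemistry?': {
                    'Section summary': 'Placeholder summary text.',
                    'Section url': 'https://chem.libretexts.org/placeholder-section',
                },
            },
        },
    },
}

# With no filename argument, the file name is derived from the title: Introductory_Chemistry.md
print(save_libretext_book_as_markdown(sample_book, source_url='https://chem.libretexts.org/placeholder-book'))
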
functions/tools.py CHANGED
@@ -14,6 +14,11 @@ from selenium.webdriver.support.ui import WebDriverWait
  from selenium.webdriver.support import expected_conditions as EC
  from selenium.webdriver.chrome.options import Options
  from selenium.common.exceptions import TimeoutException, WebDriverException
+ from functions.tool_helper_functions import (
+     libretext_book_parser,
+     libretext_chapter_parser,
+     save_libretext_book_as_markdown
+ )
 
  # Get logger for this module
  logger = logging.getLogger(__name__)
@@ -382,98 +387,6 @@ class WikipediaFetcher:
          return html
 
 
- @tool
- def libretext_book_parser(url: str) -> dict:
-     """
-     Parse the content of a LibreTexts book and return table of contents as JSON.
-
-     Args:
-         url (str): The URL of the LibreTexts book page.
-
-     Returns:
-         dict: A dictionary containing the table of contents in the following format.
-         {0: {'title': str, 'url': str, 'description': str}, ...}
-     """
-
-     logger.info('Parsing LibreTexts book: %s', url)
-
-     # Set up headers to mimic a real browser
-     headers = {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' +
-             '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-         'Accept-Language': 'en-US,en;q=0.5',
-         'Accept-Encoding': 'gzip, deflate',
-         'Connection': 'keep-alive',
-         'Upgrade-Insecure-Requests': '1',
-     }
-
-     try:
-         # Fetch the book page
-         response = requests.get(url, headers=headers, timeout=15)
-         response.raise_for_status()
-
-         # Parse the HTML content
-         soup = BeautifulSoup(response.content, 'html.parser')
-
-         # Save the HTML for debugging if needed
-         with open('book_parser_debug.html', 'w', encoding='utf-8') as f:
-             f.write(soup.prettify())
-
-         # Look for the table of contents structure
-         # LibreTexts books typically use li elements with class 'mt-sortable-listing'
-         chapter_listings = soup.find_all('li', class_='mt-sortable-listing')
-
-         logger.info('Found %d potential chapter listings', len(chapter_listings))
-
-         parsed_chapters = {}
-         chapter_count = 0
-
-         for listing in chapter_listings:
-             # Extract the link element
-             link = listing.find('a', class_='mt-sortable-listing-link')
-
-             if link:
-                 # Extract title from the span with class 'mt-sortable-listing-title'
-                 title_span = link.find('span', class_='mt-sortable-listing-title')
-                 title = title_span.get_text(strip=True) if title_span else ''
-
-                 # Extract URL from href attribute
-                 chapter_url = link.get('href', '')
-
-                 # Extract description from the title attribute of the link
-                 description = link.get('title', '')
-
-                 # Clean up description - remove the title prefix if it appears
-                 if description and title and description.startswith(title):
-                     description = description[len(title):].strip()
-                     if description.startswith(':'):
-                         description = description[1:].strip()
-
-                 # Only add meaningful chapters (skip empty titles or very short ones)
-                 if title and len(title) > 2:
-                     parsed_chapters[chapter_count] = {
-                         'title': title,
-                         'url': chapter_url,
-                         'description': description
-                     }
-
-                     logger.debug('Extracted chapter %d: title="%s", url="%s"',
-                                  chapter_count, title, chapter_url)
-                     chapter_count += 1
-
-         logger.info('Successfully extracted %d chapters from book', len(parsed_chapters))
-         print(parsed_chapters)
-         return parsed_chapters
-
-     except requests.exceptions.RequestException as e:
-         logger.error('Request error while fetching book page: %s', str(e))
-         return {'error': f'Request error: {str(e)}'}
-
-     except Exception as e: # pylint:disable=broad-exception-caught
-         logger.error('Unexpected error in book parser: %s', str(e))
-         return {'error': f'Unexpected error: {str(e)}'}
-
  @tool
  def libretext_book_search(query: str) -> dict:
      """
@@ -624,94 +537,99 @@
      except Exception as e: # pylint:disable=broad-exception-caught
          logger.warning('Error closing driver: %s', str(e))
 
+
  @tool
- def libretext_chapter_parser(url: str) -> dict:
+ def get_libretext_book(url: str) -> dict:
      """
-     Parse the content of a LibreTexts chapter and return section headings as JSON.
+     Get the complete content of a LibreTexts book including all chapters and sections.
 
      Args:
-         url (str): The URL of the LibreTexts chapter page.
+         url (str): The URL of the LibreTexts book page.
 
      Returns:
-         dict: A dictionary containing the section headings in the following format.
-         {0: {'title': str, 'url': str, 'description': str}, ...}
+         dict: A dictionary containing the complete book structure in the following format.
+         {
+             'title': 'book title string',
+             'chapters': {
+                 'Chapter title': {
+                     'sections': {
+                         'Section title': {
+                             'Section summary': 'Section summary string',
+                             'Section url': 'https://example.com/section-url',
+                         },
+                         ...
+                     }
+                 },
+                 ...
+             }
+         }
      """
 
-     logger.info('Parsing LibreTexts chapter: %s', url)
+     logger.info('Getting complete LibreTexts book: %s', url)
 
-     # Set up headers to mimic a real browser
-     headers = {
-         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' +
-             '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
-         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-         'Accept-Language': 'en-US,en;q=0.5',
-         'Accept-Encoding': 'gzip, deflate',
-         'Connection': 'keep-alive',
-         'Upgrade-Insecure-Requests': '1',
-     }
+     # First, get the book structure (chapters)
+     book_data = libretext_book_parser(url)
 
-     try:
-         # Fetch the chapter page
-         response = requests.get(url, headers=headers, timeout=15)
-         response.raise_for_status()
+     if 'error' in book_data:
+         logger.error('Failed to parse book structure: %s', book_data['error'])
+         return book_data
 
-         # Parse the HTML content
-         soup = BeautifulSoup(response.content, 'html.parser')
+     # Extract book title from URL or use a default
+     book_title = url.split('/')[-1].replace('%3A', ':').replace('_', ' ')
+     if '(' in book_title:
+         book_title = book_title.split('(')[0].strip()
 
-         # Look for the section structure
-         # LibreTexts chapters typically use li elements with class 'mt-list-topics'
-         section_listings = soup.find_all('li', class_='mt-list-topics')
+     # Initialize the complete book structure
+     complete_book = {
+         'title': book_title,
+         'chapters': {}
+     }
 
-         logger.info('Found %d potential section listings', len(section_listings))
+     logger.info('Found %d chapters to process', len(book_data))
 
-         parsed_sections = {}
-         section_count = 0
+     # Process each chapter
+     for chapter_info in book_data.values():
+         chapter_title = chapter_info['title']
+         chapter_url = chapter_info['url']
 
-         for listing in section_listings:
-             # Look for the detailed listing structure
-             dl_element = listing.find('dl', class_='mt-listing-detailed')
+         logger.info('Processing chapter: %s', chapter_title)
 
-             if dl_element:
-                 # Extract title and URL from the dt element
-                 dt_element = dl_element.find('dt', class_='mt-listing-detailed-title')
-                 dd_element = dl_element.find('dd', class_='mt-listing-detailed-overview')
+         # Get sections for this chapter
+         sections_data = libretext_chapter_parser(chapter_url)
 
-                 if dt_element:
-                     # Find the anchor tag within the dt element
-                     link = dt_element.find('a')
+         # Initialize chapter structure
+         complete_book['chapters'][chapter_title] = {
+             'sections': {}
+         }
 
-                     if link:
-                         # Extract title from the link text
-                         title = link.get_text(strip=True)
+         if 'error' in sections_data:
+             logger.warning('Failed to parse sections for chapter "%s": %s',
+                            chapter_title, sections_data['error'])
+             complete_book['chapters'][chapter_title]['sections']['Error'] = {
+                 'Section summary': f"Failed to parse sections: {sections_data['error']}",
+                 'Section url': chapter_url
+             }
+         else:
+             # Process each section
+             for section_info in sections_data.values():
+                 section_title = section_info['title']
+                 section_url = section_info['url']
+                 section_description = section_info['description']
 
-                         # Extract URL from href attribute
-                         section_url = link.get('href', '')
+                 complete_book['chapters'][chapter_title]['sections'][section_title] = {
+                     'Section summary': section_description,
+                     'Section url': section_url
+                 }
 
-                         # Extract description from the dd element
-                         description = ''
-                         if dd_element:
-                             description = dd_element.get_text(strip=True)
+                 logger.debug('Added section: %s', section_title)
 
-                         # Only add meaningful sections (skip empty titles or very short ones)
-                         if title and len(title) > 2:
-                             parsed_sections[section_count] = {
-                                 'title': title,
-                                 'url': section_url,
-                                 'description': description
-                             }
+         logger.info('Successfully processed %d sections for chapter "%s"',
+                     len(sections_data), chapter_title)
 
-                             logger.debug('Extracted section %d: title="%s", url="%s"',
-                                          section_count, title, section_url)
-                             section_count += 1
+     logger.info('Successfully compiled complete book with %d chapters',
+                 len(complete_book['chapters']))
 
-         logger.info('Successfully extracted %d sections from chapter', len(parsed_sections))
-         print(parsed_sections)
-         return parsed_sections
+     save_libretext_book_as_markdown(complete_book, filename=f"{book_title}.md", source_url=url)
 
-     except requests.exceptions.RequestException as e:
-         logger.error('Request error while fetching chapter page: %s', str(e))
-         return {'error': f'Request error: {str(e)}'}
+     return complete_book
 
-     except Exception as e: # pylint:disable=broad-exception-caught
-         logger.error('Unexpected error in chapter parser: %s', str(e))
-         return {'error': f'Unexpected error: {str(e)}'}
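
The two parsers that get_libretext_book composes can also be chained by hand; a minimal sketch (return shapes as documented in their docstrings above; the URL is the one used in the tests):

# Sketch only: manual version of what get_libretext_book() automates above.
from functions.tool_helper_functions import libretext_book_parser, libretext_chapter_parser

book_url = 'https://chem.libretexts.org/Bookshelves/Introductory_Chemistry/Introductory_Chemistry_(CK-12)'

chapters = libretext_book_parser(book_url)  # {0: {'title': ..., 'url': ..., 'description': ...}, ...}
if 'error' not in chapters:
    for chapter in chapters.values():
        sections = libretext_chapter_parser(chapter['url'])  # same shape, keyed by integer index
        print(chapter['title'], '->', len(sections), 'sections')
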
tests/test_tools.py CHANGED
@@ -6,6 +6,9 @@ from functions.tools import (
      wikipedia_search,
      get_wikipedia_page,
      libretext_book_search,
+     get_libretext_book
+ )
+ from functions.tool_helper_functions import (
      libretext_book_parser,
      libretext_chapter_parser
  )
@@ -256,5 +259,76 @@ class TestLibretextChapterParser(unittest.TestCase):
          # At least half the sections should have descriptions
          self.assertGreater(sections_with_descriptions, len(self.parse_results) // 2)
 
+
+ class TestGetLibretextBook(unittest.TestCase):
+     '''Tests for the get_libretext_book tool.'''
+
+     def setUp(self):
+         # Use a smaller LibreTexts book for testing to avoid long test times
+         # This is a smaller book that should have fewer chapters
+         book_url = 'https://chem.libretexts.org/Bookshelves/Introductory_Chemistry/Introductory_Chemistry_(CK-12)'
+
+         # For testing, we'll limit to just the first chapter to keep test times reasonable
+         # In a real scenario, you'd process the full book
+         self.book_results = get_libretext_book(book_url)
+
+     def test_result_type(self):
+         '''Book results should be a dictionary.'''
+         self.assertIsInstance(self.book_results, dict)
+
+     def test_no_error(self):
+         '''Book results should not contain an error at the top level.'''
+         self.assertNotIn('error', self.book_results)
+
+     def test_book_structure(self):
+         '''Book should have title and chapters structure.'''
+         if 'error' not in self.book_results:
+             self.assertIn('title', self.book_results)
+             self.assertIn('chapters', self.book_results)
+             self.assertIsInstance(self.book_results['title'], str)
+             self.assertIsInstance(self.book_results['chapters'], dict)
+
+     def test_chapters_exist(self):
+         '''Book should contain at least some chapters.'''
+         if 'error' not in self.book_results and 'chapters' in self.book_results:
+             self.assertGreater(len(self.book_results['chapters']), 0)
+
+     def test_chapter_structure(self):
+         '''Each chapter should have sections structure.'''
+         if ('error' not in self.book_results and
+             'chapters' in self.book_results and
+             len(self.book_results['chapters']) > 0):
+
+             # Test the first chapter
+             first_chapter = next(iter(self.book_results['chapters'].values()))
+             self.assertIn('sections', first_chapter)
+             self.assertIsInstance(first_chapter['sections'], dict)
+
+     def test_section_structure(self):
+         '''Each section should have summary and url.'''
+         if ('error' not in self.book_results and
+             'chapters' in self.book_results and
+             len(self.book_results['chapters']) > 0):
+
+             # Test the first chapter's first section
+             first_chapter = next(iter(self.book_results['chapters'].values()))
+             if 'sections' in first_chapter and len(first_chapter['sections']) > 0:
+                 first_section = next(iter(first_chapter['sections'].values()))
+                 self.assertIn('Section summary', first_section)
+                 self.assertIn('Section url', first_section)
+                 self.assertIsInstance(first_section['Section summary'], str)
+                 self.assertIsInstance(first_section['Section url'], str)
+
+     def test_meaningful_content(self):
+         '''Book should have meaningful title and content.'''
+         if 'error' not in self.book_results:
+             # Title should be meaningful
+             self.assertTrue(len(self.book_results.get('title', '')) > 3)
+
+             # Should have chapters with meaningful names
+             if 'chapters' in self.book_results:
+                 for chapter_title in self.book_results['chapters'].keys():
+                     self.assertTrue(len(chapter_title) > 2)
+
  if __name__ == '__main__':
      unittest.main()