'''Helper functions for GAIA question answering agent tools.'''

import logging
import time
from typing import Optional

import bleach
import requests
from bs4 import BeautifulSoup
from bleach.css_sanitizer import CSSSanitizer

# Get logger for this module
logger = logging.getLogger(__name__)


def libretext_book_parser(url: str) -> dict:
    """
    Parse a LibreTexts book page and return its table of contents as a dictionary.
    
    Args:
        url (str): The URL of the LibreTexts book page.
        
    Returns:
        dict: A dictionary containing the table of contents in the following format.
        {0: {'title': str, 'url': str, 'description': str}, ...}
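        On failure, a dictionary of the form {'error': str} is returned instead.

    Example (illustrative sketch; the URL below is hypothetical):
        toc = libretext_book_parser('https://chem.libretexts.org/Bookshelves/Example_Book')
        if 'error' not in toc:
            for index, chapter in toc.items():
                print(index, chapter['title'], chapter['url'])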
    """

    logger.info('Parsing LibreTexts book: %s', url)

    # Set up headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' +
            '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    try:
        # Fetch the book page
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look for the table of contents structure
        # LibreTexts books typically use li elements with class 'mt-sortable-listing'
        chapter_listings = soup.find_all('li', class_='mt-sortable-listing')

        logger.info('Found %d potential chapter listings', len(chapter_listings))

        parsed_chapters = {}
        chapter_count = 0

        for listing in chapter_listings:

            # Extract the link element
            link = listing.find('a', class_='mt-sortable-listing-link')

            if link:

                # Extract title from the span with class 'mt-sortable-listing-title'
                title_span = link.find('span', class_='mt-sortable-listing-title')
                title = title_span.get_text(strip=True) if title_span else ''

                # Extract URL from href attribute
                chapter_url = link.get('href', '')

                # Extract description from the title attribute of the link
                description = link.get('title', '')

                # Clean up description - remove the title prefix if it appears
                if description and title and description.startswith(title):
                    description = description[len(title):].strip()

                    if description.startswith(':'):
                        description = description[1:].strip()

                # Only add meaningful chapters (skip empty titles or very short ones)
                if title and len(title) > 2:

                    parsed_chapters[chapter_count] = {
                        'title': title,
                        'url': chapter_url,
                        'description': description
                    }

                    logger.debug('Extracted chapter %d: title="%s", url="%s"',
                                 chapter_count, title, chapter_url)
                    chapter_count += 1

        logger.info('Successfully extracted %d chapters from book', len(parsed_chapters))

        return parsed_chapters

    except requests.exceptions.RequestException as e:
        logger.error('Request error while fetching book page: %s', str(e))

        return {'error': f'Request error: {str(e)}'}

    except Exception as e: # pylint:disable=broad-exception-caught
        logger.error('Unexpected error in book parser: %s', str(e))

        return {'error': f'Unexpected error: {str(e)}'}


def libretext_chapter_parser(url: str) -> dict:
    """
    Parse a LibreTexts chapter page and return its section headings as a dictionary.
    
    Args:
        url (str): The URL of the LibreTexts chapter page.
        
    Returns:
        dict: A dictionary containing the section headings in the following format.
        {0: {'title': str, 'url': str, 'description': str}, ...}
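        On failure, a dictionary of the form {'error': str} is returned instead.

    Example (illustrative sketch; the URL below is hypothetical):
        sections = libretext_chapter_parser('https://chem.libretexts.org/Bookshelves/Example_Book/01_Chapter')
        if 'error' not in sections:
            for index, section in sections.items():
                print(index, section['title'], section['url'])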
    """

    logger.info('Parsing LibreTexts chapter: %s', url)

    # Set up headers to mimic a real browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' +
            '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }

    try:
        # Fetch the chapter page
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look for the section structure
        # LibreTexts chapters typically use li elements with class 'mt-list-topics'
        section_listings = soup.find_all('li', class_='mt-list-topics')

        logger.info('Found %d potential section listings', len(section_listings))

        parsed_sections = {}
        section_count = 0

        for listing in section_listings:
            # Look for the detailed listing structure
            dl_element = listing.find('dl', class_='mt-listing-detailed')

            if dl_element:
                # Extract title and URL from the dt element
                dt_element = dl_element.find('dt', class_='mt-listing-detailed-title')
                dd_element = dl_element.find('dd', class_='mt-listing-detailed-overview')

                if dt_element:
                    # Find the anchor tag within the dt element
                    link = dt_element.find('a')

                    if link:
                        # Extract title from the link text
                        title = link.get_text(strip=True)

                        # Extract URL from href attribute
                        section_url = link.get('href', '')

                        # Extract description from the dd element
                        description = ''
                        if dd_element:
                            description = dd_element.get_text(strip=True)

                        # Only add meaningful sections (skip empty titles or very short ones)
                        if title and len(title) > 2:
                            parsed_sections[section_count] = {
                                'title': title,
                                'url': section_url,
                                'description': description
                            }

                            logger.debug('Extracted section %d: title="%s", url="%s"',
                                         section_count, title, section_url)
                            section_count += 1

        logger.info('Successfully extracted %d sections from chapter', len(parsed_sections))
        return parsed_sections

    except requests.exceptions.RequestException as e:
        logger.error('Request error while fetching chapter page: %s', str(e))
        return {'error': f'Request error: {str(e)}'}

    except Exception as e:  # pylint:disable=broad-exception-caught
        logger.error('Unexpected error in chapter parser: %s', str(e))
        return {'error': f'Unexpected error: {str(e)}'}


def save_libretext_book_as_markdown(book_data: dict, filename: Optional[str] = None,
                                     source_url: Optional[str] = None) -> str:
    """
    Save a complete LibreTexts book dictionary as a markdown formatted file.
    
    Args:
        book_data (dict): The complete book data dictionary from get_libretext_book().
        filename (str, optional): The filename to save the markdown to. If not provided,
                                  one will be generated from the book title.
        source_url (str, optional): The original URL of the book for reference in the markdown.
        
    Returns:
        str: A message indicating success or failure with the filename used.
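
    Expected book_data shape (inferred from the keys this function reads; the
    values shown are placeholders):
        {
            'title': 'Book Title',
            'chapters': {
                'Chapter Title': {
                    'sections': {
                        'Section Title': {
                            'Section url': 'https://example.org/section',
                            'Section summary': 'Short summary text.'
                        }
                    }
                }
            }
        }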
    """

    logger.info('Saving LibreTexts book as markdown')

    if 'error' in book_data:
        error_msg = f"Cannot save book with error: {book_data['error']}"
        logger.error(error_msg)
        return error_msg

    # Generate filename if not provided
    if filename is None:
        book_title = book_data.get('title', 'LibreTexts_Book')
        # Clean up the title for use as filename
        safe_title = "".join(c for c in book_title if c.isalnum() or c in (' ', '-', '_')).rstrip()
        safe_title = safe_title.replace(' ', '_')
        filename = f"{safe_title}.md"

    # Ensure .md extension
    if not filename.endswith('.md'):
        filename += '.md'

    try:
        # Format the book data as markdown
        markdown_content = []

        # Book title
        book_title = book_data.get('title', 'LibreTexts Book')
        markdown_content.append(f"# {book_title}\n")
        if source_url:
            markdown_content.append(f"*Extracted from: {source_url}*\n")
        markdown_content.append(f"*Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n\n")

        # Table of contents
        chapters = book_data.get('chapters', {})

        if chapters:
            markdown_content.append("## Table of Contents\n")

            for chapter_title in chapters.keys():

                # Create anchor link for the chapter
                anchor = chapter_title.lower().replace(
                    ' ',
                    '-'
                ).replace(':', '').replace('(', '').replace(')', '')

                markdown_content.append(f"- [{chapter_title}](#{anchor})\n")
            markdown_content.append("\n---\n\n")

        # Chapter content
        for chapter_title, chapter_data in chapters.items():

            # Chapter heading
            markdown_content.append(f"## {chapter_title}\n\n")

            sections = chapter_data.get('sections', {})

            if not sections:

                markdown_content.append("*No sections found for this chapter.*\n\n")
                continue

            # Section content
            for section_title, section_data in sections.items():

                # Section heading
                markdown_content.append(f"### {section_title}\n\n")

                # Section URL
                section_url = section_data.get('Section url', '')

                if section_url:
                    markdown_content.append(f"**URL:** [{section_url}]({section_url})\n\n")

                # Section summary
                section_summary = section_data.get('Section summary', '')

                if section_summary:
                    markdown_content.append(f"{section_summary}\n\n")
                else:
                    markdown_content.append("*No summary available.*\n\n")

                markdown_content.append("---\n\n")

        # Write to file
        with open(filename, 'w', encoding='utf-8') as f:
            f.writelines(markdown_content)

        success_msg = f"Successfully saved LibreTexts book as markdown file: {filename}"
        logger.info(success_msg)

        return success_msg

    except Exception as e:  # pylint:disable=broad-exception-caught
        error_msg = f"Error saving markdown file: {str(e)}"
        logger.error(error_msg)

        return error_msg


class WikipediaFetcher:
    """Gets and cleans up Wikipedia pages."""

    def fetch(self, page_name):
        """
        Passed a Wikipedia page's URL fragment, like
        'Edward_Montagu,_1st_Earl_of_Sandwich', this will fetch the page's
        main contents, tidy the HTML, strip out any elements we don't want
        and return the final HTML string.

        Returns a dict with two elements:
            'success' is either True or, if we couldn't fetch the page, False.
            'content' is the HTML if success==True, or else an error message.
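
        Example (illustrative sketch; 'Samuel_Pepys' is just one possible page name):
            fetcher = WikipediaFetcher()
            result = fetcher.fetch('Samuel_Pepys')
            if result['success']:
                html = result['content']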
        """
        result = self._get_html(page_name)

        if result["success"]:
            result["content"] = self._tidy_html(result["content"])

        return result


    def _get_html(self, page_name):
        """
        Passed the name of a Wikipedia page (eg, 'Samuel_Pepys'), it fetches
        the HTML content (not the entire HTML page) and returns it.

        Returns a dict with two elements:
            'success' is either True or, if we couldn't fetch the page, False.
            'content' is the HTML if success==True, or else an error message.
        """
        error_message = ""

        url = f"https://en.wikipedia.org/wiki/{page_name}"

        try:
            response = requests.get(url, params={"action": "render"}, timeout=5)
        except requests.exceptions.ConnectionError:
            error_message = "Can't connect to domain."
        except requests.exceptions.Timeout:
            error_message = "Connection timed out."
        except requests.exceptions.TooManyRedirects:
            error_message = "Too many redirects."

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            # 4xx or 5xx errors:
            error_message = f"HTTP Error: {response.status_code}"
        except NameError:
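            # requests.get() raised one of the exceptions above, so `response`
            # was never assigned; keep that message, or fall back to a generic one.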
            if error_message == "":
                error_message = "Something unusual went wrong."

        if error_message:
            return {"success": False, "content": error_message}
        else:
            return {"success": True, "content": response.text}


    def _tidy_html(self, html):
        """
        Passed the raw Wikipedia HTML, this returns valid HTML, with all
        disallowed elements stripped out.
        """
        html = self._bleach_html(html)
        html = self._strip_html(html)
        return html


    def _bleach_html(self, html):
        """
        Ensures we have valid HTML; no unclosed or mis-nested tags.
        Removes any tags and attributes we don't want to let through.
        Doesn't remove the contents of any disallowed tags.

        Pass it an HTML string, it'll return the bleached HTML string.
        """

        # Pretty much most elements, but no forms or audio/video.
        allowed_tags = {
            "a",
            "abbr",
            "acronym",
            "address",
            "area",
            "article",
            "b",
            "blockquote",
            "br",
            "caption",
            "cite",
            "code",
            "col",
            "colgroup",
            "dd",
            "del",
            "dfn",
            "div",
            "dl",
            "dt",
            "em",
            "figcaption",
            "figure",
            "footer",
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "header",
            "hgroup",
            "hr",
            "i",
            "img",
            "ins",
            "kbd",
            "li",
            "map",
            "nav",
            "ol",
            "p",
            "pre",
            "q",
            "s",
            "samp",
            "section",
            "small",
            "span",
            "strong",
            "sub",
            "sup",
            "table",
            "tbody",
            "td",
            "tfoot",
            "th",
            "thead",
            "time",
            "tr",
            "ul",
            "var",
            # We allow script and style here, so we can close/un-mis-nest
            # their tags, but they're then removed completely in _strip_html():
            "script",
            "style",
        }

        # These attributes will not be removed from any of the allowed tags.
        allowed_attributes = {
            "*": ["class", "id"],
            "a": ["href", "title"],
            "abbr": ["title"],
            "acronym": ["title"],
            "img": ["alt", "src", "srcset"],
            # Ugh. Don't know why this page doesn't use .tright like others
            # http://127.0.0.1:8000/encyclopedia/5040/
            "table": ["align"],
            "td": ["colspan", "rowspan", "style"],
            "th": ["colspan", "rowspan", "scope"],
        }

        # These CSS properties are allowed within style attributes
        # Added for the family tree on /encyclopedia/5825/
        # Hopefully doesn't make anything else too hideous.
        allowed_css_properties = [
            "background",
            "border",
            "border-bottom",
            "border-collapse",
            "border-left",
            "border-radius",
            "border-right",
            "border-spacing",
            "border-top",
            "height",
            "padding",
            "text-align",
            "width",
        ]

        css_sanitizer = CSSSanitizer(allowed_css_properties=allowed_css_properties)

        a = bleach.clean(
            html,
            tags=allowed_tags,
            attributes=allowed_attributes,
            css_sanitizer=css_sanitizer,
            strip=True,
        )

        return a


    def _strip_html(self, html):
        """
        Takes out any tags, and their contents, that we don't want at all.
        And adds custom classes to existing tags (so we can apply CSS styles
        without having to multiply our CSS).

        Pass it an HTML string, it returns the stripped HTML string.
        """

        # CSS selectors. Strip these and their contents.
        selectors = [
            "div.hatnote",
            "div.navbar.mini",  # Will also match div.mini.navbar
            # Bottom of https://en.wikipedia.org/wiki/Charles_II_of_England :
            "div.topicon",
            "a.mw-headline-anchor",
            "script",
            "style",
        ]

        # Strip any element that has one of these classes.
        classes = [
            # "This article may be expanded with text translated from..."
            # https://en.wikipedia.org/wiki/Afonso_VI_of_Portugal
            "ambox-notice",
            "magnify",
            # eg audio on https://en.wikipedia.org/wiki/Bagpipes
            "mediaContainer",
            "navbox",
            "noprint",
        ]

        # Any element that has a class matching a key will have the classes
        # in the value added.
        add_classes = {
            # Give these tables standard Bootstrap styles.
            "infobox": ["table", "table-bordered"],
            "ambox": ["table", "table-bordered"],
            "wikitable": ["table", "table-bordered"],
        }

        soup = BeautifulSoup(html, "lxml")

        for selector in selectors:
            _ = [tag.decompose() for tag in soup.select(selector)]

        for clss in classes:
            _ = [tag.decompose() for tag in soup.find_all(attrs={"class": clss})]

        for clss, new_classes in add_classes.items():
            for tag in soup.find_all(attrs={"class": clss}):
                tag["class"] = tag.get("class", []) + new_classes

        # Depending on the HTML parser BeautifulSoup used, soup may have
        # surrounding <html><body></body></html> or just <body></body> tags.
        if soup.body:
            soup = soup.body
        elif soup.html:
            soup = soup.html.body

        # Put the content back into a string.
        html = "".join(str(tag) for tag in soup.contents)

        return html