'''Tools for GAIA question answering agent.'''

import time
import logging
from urllib.parse import urlencode, unquote
import requests
from smolagents import tool
from googlesearch import search
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException
from functions.tool_helper_functions import (
    libretext_book_parser,
    libretext_chapter_parser,
    save_libretext_book_as_markdown,
    WikipediaFetcher
)

# Get logger for this module
logger = logging.getLogger(__name__)


@tool
def google_search(query: str) -> dict:
    """
    Perform a Google search and return the top 10 results.
    
    Args:
        query (str): The search query.
        
    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'url': str, 'description': str}, ...}
    """

    # Run the query
    results = list(search(query, num_results=10, advanced=True))

    # Parse and format the results
    parsed_results = {}

    for i, result in enumerate(results):

        parsed_results[i] = {
            'title': result.title,
            'url': result.url,
            'description': result.description
        }

    return parsed_results


@tool
def wikipedia_search(query: str) -> dict:
    """
    Perform a search for wikipedia pages and return the top 5 results.
    
    Args:
        query (str): The search query.
        
    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'description': str}, ...}
    """

    repo_url = 'https://github.com/gperdrizet/unit-four-final-project'

    language_code = 'en'
    number_of_results = 5
    headers = {
        'User-Agent': f'HuggingFace Agents course final project ({repo_url})'
    }

    base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
    endpoint = '/search/page'
    url = base_url + language_code + endpoint
    parameters = {'q': query, 'limit': number_of_results}
    response = requests.get(url, headers=headers, params=parameters, timeout=15)

    if response.status_code != 200:
        return {'error': f'Unable to retrieve search results. Status code {response.status_code}'}

    results = response.json().get('pages', [])
    parsed_results = {}

    for i, result in enumerate(results):

        parsed_results[i] = {
            'title': result.get('title', None),
            'description': result.get('description', None)
        }

    return parsed_results


@tool
def get_wikipedia_page(query: str) -> str:
    """
    Get the content of a Wikipedia page as HTML. Use this tool when trying to
    retrieve information from a Wikipedia page or article.

    Args:
        query (str): The title of the Wikipedia page.
        
    Returns:
        str: The HTML content of the Wikipedia page.
    """

    fetcher = WikipediaFetcher()
    html_result = fetcher.fetch(query.replace(' ', '_'))

    content = html_result['content']
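    # Trim the trailing 'Further reading' and 'References' sections; they are
    # citation lists that add little for question answering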

    content = content.split(
        '<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>'
    )[0]

    content = content.split(
        '<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>'
    )[0]

    return content


@tool
def libretext_book_search(query: str) -> dict:
    """
    Search for LibreTexts books using Selenium to handle JavaScript-rendered content.
    
    Args:
        query (str): The search query.
        
    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'url': str, 'description': str}, ...}
    """

    # Configure Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    driver = None
    try:
        # Initialize the Chrome driver
        driver = webdriver.Chrome(options=chrome_options)

        # Construct search URL
        search_url = 'https://chem.libretexts.org/Special:Search'
        params = {
            'qid': '',
            'fpid': '230',
            'fpth': '',
            'query': query
        }

        # Build URL with parameters, URL-encoding the query so spaces and
        # special characters are handled correctly
        param_string = urlencode(params)
        full_url = f"{search_url}?{param_string}"

        logger.info('Selenium search URL: %s', full_url)

        # Navigate to the search page
        driver.get(full_url)

        # Wait for the search results to load
        # Wait for either search results or an indication that search is complete
        wait = WebDriverWait(driver, 15)

        try:
            # Wait for the search results container to be present and have content
            # or for a specific search result element to appear
            _ = wait.until(
                EC.presence_of_element_located((By.ID, "mt-search-spblls"))
            )

            # Give additional time for JavaScript to populate results
            time.sleep(3)

            # Get the page source after JavaScript execution
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Look for search results using multiple possible selectors
            search_info_divs = soup.find_all('div', class_='mt-search-information')

            # If no results with that class, try other common search result patterns
            if not search_info_divs:
                # Try alternative selectors that might be used for search results
                search_info_divs = soup.find_all('div', class_='search-result')
                if not search_info_divs:
                    search_info_divs = soup.find_all('div', class_='result')
                if not search_info_divs:
                    # Look for any divs within the search results container
                    results_container = soup.find('div', id='mt-search-spblls')
                    if results_container:
                        search_info_divs = results_container.find_all('div', recursive=False)

            logger.info('Found %d potential search result divs', len(search_info_divs))

            # Parse the search results
            parsed_results = {}
            result_count = 0

            for div in search_info_divs:
                # Try to extract title and URL from various possible structures
                title = None
                url = None
                summary = None

                # Look for title in anchor tags
                title_link = div.find('a')
                if title_link:
                    title = title_link.get_text(strip=True)
                    url = title_link.get('href', '')

                    # Make URL absolute if it's relative
                    if url and url.startswith('/'):
                        url = 'https://chem.libretexts.org' + url

                # Look for description/summary text
                # Try multiple approaches to find descriptive text
                text_elements = div.find_all(['p', 'span', 'div'])
                for element in text_elements:
                    text = element.get_text(strip=True)
                    # Skip short fragments and the title text itself
                    if text and len(text) > 20 and text != title:
                        summary = text
                        break

                # Only add to results if we have at least a title
                if title and len(title) > 3:  # Ensure title is meaningful
                    parsed_results[result_count] = {
                        'title': title,
                        'url': url or '',
                        'description': summary or ''
                    }

                    logger.debug(
                        'Extracted result %d: title="%s", url="%s"',
                        result_count,
                        title,
                        url
                    )

                    result_count += 1

            logger.info('Successfully extracted %d search results', len(parsed_results))
            return parsed_results

        except TimeoutException:
            logger.error('Timeout waiting for search results to load')
            return {'error': 'Timeout waiting for search results to load'}

    except WebDriverException as e:
        logger.error('WebDriver error: %s', str(e))
        return {'error': f'WebDriver error: {str(e)}'}

    except Exception as e: # pylint:disable=broad-exception-caught
        logger.error('Unexpected error in Selenium search: %s', str(e))
        return {'error': f'Unexpected error: {str(e)}'}

    finally:
        # Always clean up the driver
        if driver:
            try:
                driver.quit()
            except Exception as e: # pylint:disable=broad-exception-caught
                logger.warning('Error closing driver: %s', str(e))


@tool
def get_libretext_book(url: str) -> dict:
    """
    Get the complete content of a LibreTexts book including all chapters and sections.
    
    Args:
        url (str): The URL of the LibreTexts book page.
        
    Returns:
        dict: A dictionary containing the complete book structure in the following format.
        {
            'title': 'book title string',
            'chapters': {
                'Chapter title': {
                    'sections': {
                        'Section title': {
                            'Section summary': 'Section summary string',
                            'Section url': 'https://example.com/section-url',
                        },
                        ...
                    }
                },
                ...
            }
        }
    """

    logger.info('Getting complete LibreTexts book: %s', url)

    # First, get the book structure (chapters)
    book_data = libretext_book_parser(url)

    if 'error' in book_data:
        logger.error('Failed to parse book structure: %s', book_data['error'])
        return book_data

    # Derive a human-readable book title from the URL slug
    book_title = unquote(url.split('/')[-1]).replace('_', ' ')
    if '(' in book_title:
        book_title = book_title.split('(')[0].strip()

    # Initialize the complete book structure
    complete_book = {
        'title': book_title,
        'chapters': {}
    }

    logger.info('Found %d chapters to process', len(book_data))

    # Process each chapter
    for chapter_info in book_data.values():
        chapter_title = chapter_info['title']
        chapter_url = chapter_info['url']

        logger.info('Processing chapter: %s', chapter_title)

        # Get sections for this chapter
        sections_data = libretext_chapter_parser(chapter_url)

        # Initialize chapter structure
        complete_book['chapters'][chapter_title] = {
            'sections': {}
        }

        if 'error' in sections_data:
            logger.warning('Failed to parse sections for chapter "%s": %s',
                           chapter_title, sections_data['error'])
            complete_book['chapters'][chapter_title]['sections']['Error'] = {
                'Section summary': f"Failed to parse sections: {sections_data['error']}",
                'Section url': chapter_url
            }
        else:
            # Process each section
            for section_info in sections_data.values():
                section_title = section_info['title']
                section_url = section_info['url']
                section_description = section_info['description']

                complete_book['chapters'][chapter_title]['sections'][section_title] = {
                    'Section summary': section_description,
                    'Section url': section_url
                }

                logger.debug('Added section: %s', section_title)

            logger.info('Successfully processed %d sections for chapter "%s"',
                       len(sections_data), chapter_title)

    logger.info('Successfully compiled complete book with %d chapters',
               len(complete_book['chapters']))

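    # Persist the assembled book to disk as Markdown for later reference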
    save_libretext_book_as_markdown(complete_book, filename=f"{book_title}.md", source_url=url)

    return complete_book
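

# ---------------------------------------------------------------------------
# Minimal smoke test: a sketch for exercising the lightweight tools directly,
# not part of the agent itself. Assumes network access; the query strings
# below are arbitrary examples.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Each @tool-decorated function is a callable smolagents Tool instance
    print(google_search('GAIA benchmark question answering'))
    print(wikipedia_search('Large language model'))

    # Strip the returned Wikipedia HTML down to plain text for a quick look
    page_html = get_wikipedia_page('Large language model')
    page_text = BeautifulSoup(page_html, 'html.parser').get_text(' ', strip=True)
    print(page_text[:500])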