gperdrizet committed
Commit a5e5840 · verified · 1 Parent(s): 7e55583

Added custom Selenium-based web search functions to look for academic textbooks via LibreTexts.

Files changed (5)
  1. app.py +5 -4
  2. functions/agent.py +6 -2
  3. functions/tools.py +194 -6
  4. requirements.txt +1 -0
  5. tests/test_tools.py +49 -3
app.py CHANGED
@@ -255,7 +255,6 @@ with gr.Blocks() as demo:
     )
 
 if __name__ == "__main__":
-    logger.info("\n" + "-"*30 + " App Starting " + "-"*30)
 
     # Check for SPACE_HOST and SPACE_ID at startup for information
     space_host_startup = os.getenv("SPACE_HOST")
@@ -270,14 +269,16 @@ if __name__ == "__main__":
     if space_id_startup: # Print repo URLs if SPACE_ID is found
         logger.info("✅ SPACE_ID found: %s", space_id_startup)
         logger.info("   Repo URL: https://huggingface.co/spaces/%s", space_id_startup)
-        logger.info("   Repo Tree URL: https://huggingface.co/spaces/%s/tree/main", space_id_startup)
+        logger.info(
+            "   Repo Tree URL: https://huggingface.co/spaces/%s/tree/main",
+            space_id_startup
+        )
+
     else:
         logger.info(
            "ℹ️ SPACE_ID environment variable not found (running locally?). " \
            "Repo URL cannot be determined."
        )
 
-    logger.info("-" + "-"*(60 + len(" App Starting ")) + "\n")
-
    logger.info("Launching Gradio Interface for Basic Agent Evaluation...")
    demo.launch(debug=True, share=False)
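The wrapped logging call above keeps lazy %-style formatting: the space ID is passed as an argument instead of being interpolated with an f-string, so the message is only built when the record is actually emitted. A minimal, self-contained illustration of the same pattern (the space ID value below is a placeholder, not taken from app.py):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    space_id = 'some-user/some-space'  # placeholder value for illustration

    # The %s argument is only interpolated if this record is actually emitted
    logger.info(
        'Repo Tree URL: https://huggingface.co/spaces/%s/tree/main',
        space_id
    )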
functions/agent.py CHANGED
@@ -82,7 +82,10 @@ def step_memory_cap(memory_step: ActionStep, agent: CodeAgent) -> None:
     new_messages = [agent.memory.steps[-1].model_input_messages[0]]
     new_messages.append({
         'role': MessageRole.USER,
-        'content': [{'type': 'text', 'text': f'Here is a summary of your investigation so far: {summary}'}]
+        'content': [{
+            'type': 'text',
+            'text': f'Here is a summary of your investigation so far: {summary}'
+        }]
     })
     agent.memory.steps = [agent.memory.steps[0]]
     agent.memory.steps[0].model_input_messages = new_messages
@@ -107,7 +110,8 @@ def summarize_old_messages(messages: dict) -> dict:
     messages = [
         {
             'role': 'system',
-            'content': f'Summarize the following interaction between an AI agent and a user. Return the summary formatted as text, not as JSON: {json.dumps(messages)}'
+            'content': ('Summarize the following interaction between an AI agent and a user. ' +
+                        f'Return the summary formatted as text, not as JSON: {json.dumps(messages)}')
         }
     ]
 
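The reformatted append in step_memory_cap builds the same chat-message payload as before, just spread over several lines. For reference, a standalone sketch of that structure with a placeholder summary string (the MessageRole import path is assumed from the smolagents package):

    from smolagents.models import MessageRole  # assumed import location

    summary = 'Searched Google and Wikipedia; best candidate answer so far is X.'  # placeholder

    summary_message = {
        'role': MessageRole.USER,
        'content': [{
            'type': 'text',
            'text': f'Here is a summary of your investigation so far: {summary}'
        }]
    }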
 
functions/tools.py CHANGED
@@ -1,5 +1,6 @@
 '''Tools for GAIA question answering agent.'''
 
+import time
 import logging
 import bleach
 import requests
@@ -7,6 +8,12 @@ from bleach.css_sanitizer import CSSSanitizer
 from smolagents import tool
 from googlesearch import search
 from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.common.exceptions import TimeoutException, WebDriverException
 
 # Get logger for this module
 logger = logging.getLogger(__name__)
@@ -26,7 +33,7 @@ def google_search(query: str) -> dict:
     """
 
     # Run the query
-    results = list(search(query, num_results=5, advanced=True))
+    results = list(search(query, num_results=10, advanced=True))
 
     # Parse and format the results
     parsed_results = {}
@@ -55,10 +62,12 @@ def wikipedia_search(query: str) -> dict:
     {0: {'title': str, 'description': str}, ...}
     """
 
+    repo_url = 'https://github.com/gperdrizet/unit-four-final-project'
+
     language_code = 'en'
     number_of_results = 5
     headers = {
-        'User-Agent': 'HuggingFace Agents course final project (https://github.com/gperdrizet/unit-four-final-project)'
+        'User-Agent': f'HuggingFace Agents course final project ({repo_url})'
     }
 
     base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
@@ -101,8 +110,14 @@ def get_wikipedia_page(query: str) -> str:
     html_result = fetcher.fetch(query.replace(' ', '_'))
 
     content = html_result['content']
-    content = content.split('<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>')[0]
-    content = content.split('<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>')[0]
+
+    content = content.split(
+        '<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>'
+    )[0]
+
+    content = content.split(
+        '<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>'
+    )[0]
 
     return content
 
@@ -345,10 +360,10 @@ class WikipediaFetcher:
         soup = BeautifulSoup(html, "lxml")
 
         for selector in selectors:
-            [tag.decompose() for tag in soup.select(selector)]
+            _ = [tag.decompose() for tag in soup.select(selector)]
 
         for clss in classes:
-            [tag.decompose() for tag in soup.find_all(attrs={"class": clss})]
+            _ = [tag.decompose() for tag in soup.find_all(attrs={"class": clss})]
 
         for clss, new_classes in add_classes.items():
             for tag in soup.find_all(attrs={"class": clss}):
@@ -365,3 +380,176 @@ class WikipediaFetcher:
         html = "".join(str(tag) for tag in soup.contents)
 
         return html
+
+
+@tool
+def libretext_book_parser(url: str) -> str:
+    """
+    Parse the content of a LibreTexts book and return table of contents as JSON.
+
+    Args:
+        url (str): The URL of the LibreTexts book page.
+
+    Returns:
+        dict: A dictionary containing the table of contents in JSON format.
+    """
+
+    logger.debug(url)
+
+    return "LibreTexts book parser is not yet implemented."
+
+@tool
+def libretext_book_search(query: str) -> dict:
+    """
+    Search for LibreTexts books using Selenium to handle JavaScript-rendered content.
+
+    Args:
+        query (str): The search query.
+
+    Returns:
+        dict: A dictionary containing the search results in the following format.
+        {0: {'title': str, 'url': str, 'description': str}, ...}
+    """
+
+    # Configure Chrome options for headless mode
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--window-size=1920,1080")
+    chrome_options.add_argument(
+        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
+        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    )
+
+    driver = None
+    try:
+        # Initialize the Chrome driver
+        driver = webdriver.Chrome(options=chrome_options)
+
+        # Construct search URL
+        search_url = 'https://chem.libretexts.org/Special:Search'
+        params = {
+            'qid': '',
+            'fpid': '230',
+            'fpth': '',
+            'query': query
+        }
+
+        # Build URL with parameters
+        param_string = '&'.join([f"{k}={v}" for k, v in params.items()])
+        full_url = f"{search_url}?{param_string}"
+
+        logger.info('Selenium search URL: %s', full_url)
+
+        # Navigate to the search page
+        driver.get(full_url)
+
+        # Wait for the search results to load
+        # Wait for either search results or an indication that search is complete
+        wait = WebDriverWait(driver, 15)
+
+        try:
+            # Wait for the search results container to be present and have content
+            # or for a specific search result element to appear
+            _ = wait.until(
+                EC.presence_of_element_located((By.ID, "mt-search-spblls"))
+            )
+
+            # Give additional time for JavaScript to populate results
+            time.sleep(3)
+
+            # Get the page source after JavaScript execution
+            page_source = driver.page_source
+            soup = BeautifulSoup(page_source, 'html.parser')
+
+            # Save the rendered HTML for debugging
+            with open('selenium_test.html', 'w', encoding='utf-8') as f:
+                f.write(soup.prettify())
+
+            # Look for search results using multiple possible selectors
+            search_info_divs = soup.find_all('div', class_='mt-search-information')
+
+            # If no results with that class, try other common search result patterns
+            if not search_info_divs:
+                # Try alternative selectors that might be used for search results
+                search_info_divs = soup.find_all('div', class_='search-result')
+                if not search_info_divs:
+                    search_info_divs = soup.find_all('div', class_='result')
+                    if not search_info_divs:
+                        # Look for any divs within the search results container
+                        results_container = soup.find('div', id='mt-search-spblls')
+                        if results_container:
+                            search_info_divs = results_container.find_all('div', recursive=False)
+
+            logger.info('Found %d potential search result divs', len(search_info_divs))
+
+            # Parse the search results
+            parsed_results = {}
+            result_count = 0
+
+            for div in search_info_divs:
+                # Try to extract title and URL from various possible structures
+                title = None
+                url = None
+                summary = None
+
+                # Look for title in anchor tags
+                title_link = div.find('a')
+                if title_link:
+                    title = title_link.get_text(strip=True)
+                    url = title_link.get('href', '')
+
+                    # Make URL absolute if it's relative
+                    if url and url.startswith('/'):
+                        url = 'https://chem.libretexts.org' + url
+
+                # Look for description/summary text
+                # Try multiple approaches to find descriptive text
+                text_elements = div.find_all(['p', 'span', 'div'])
+                for element in text_elements:
+                    text = element.get_text(strip=True)
+                    if text and len(text) > 20 and (not title or text != title):
+                        summary = text
+                        break
+
+                # Only add to results if we have at least a title
+                if title and len(title) > 3: # Ensure title is meaningful
+                    parsed_results[result_count] = {
+                        'title': title,
+                        'url': url or '',
+                        'description': summary or ''
+                    }
+
+                    logger.debug(
+                        'Extracted result %d: title="%s", url="%s"',
+                        result_count,
+                        title,
+                        url
+                    )
+
+                    result_count += 1
+
+            logger.info('Successfully extracted %d search results', len(parsed_results))
+            return parsed_results
+
+        except TimeoutException:
+            logger.error('Timeout waiting for search results to load')
+            return {'error': 'Timeout waiting for search results to load'}
+
+    except WebDriverException as e:
+        logger.error('WebDriver error: %s', str(e))
+        return {'error': f'WebDriver error: {str(e)}'}
+
+    except Exception as e: # pylint: disable=broad-exception-caught
+        logger.error('Unexpected error in Selenium search: %s', str(e))
+        return {'error': f'Unexpected error: {str(e)}'}
+
+    finally:
+        # Always clean up the driver
+        if driver:
+            try:
+                driver.quit()
+            except Exception as e: # pylint: disable=broad-exception-caught
+                logger.warning('Error closing driver: %s', str(e))
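Because the @tool-decorated function is directly callable (the new tests below invoke it the same way), it can be exercised outside the agent for a quick check. A usage sketch, assuming a local Chrome/Chromium install that Selenium can drive; the query string mirrors the one used in the test suite:

    from functions.tools import libretext_book_search

    # Returns {0: {'title': ..., 'url': ..., 'description': ...}, ...}
    # or {'error': ...} if the browser or the search page fails.
    results = libretext_book_search('Introductory chemistry ck-12')

    for key, result in results.items():
        if key == 'error':
            print('Search failed:', result)
            break
        print(result['title'])
        print(' ', result['url'])
        print(' ', result['description'][:80])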
requirements.txt CHANGED
@@ -5,6 +5,7 @@ gradio[oauth]
 markdownify
 mwparserfromhell
 requests
+selenium
 smolagents==1.18.0
 tinycss2
 wikipedia-api
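The selenium entry only pulls in the Python bindings; the headless search also needs a Chrome or Chromium binary (and a matching driver) available on the host, which this commit does not install. A quick sanity check that the environment can actually start the browser (a sketch, not part of this commit):

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')

    # Raises WebDriverException if no usable Chrome/Chromium and driver are found
    driver = webdriver.Chrome(options=options)
    print(driver.capabilities.get('browserVersion'))
    driver.quit()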
tests/test_tools.py CHANGED
@@ -4,7 +4,8 @@ import unittest
 from functions.tools import (
     google_search,
     wikipedia_search,
-    get_wikipedia_page
+    get_wikipedia_page,
+    libretext_book_search
 )
 
 
@@ -27,13 +28,13 @@ class TestGoogleSearch(unittest.TestCase):
     def test_result_length(self):
         '''Search results should contain 5 items.'''
 
-        self.assertEqual(len(self.search_results), 5)
+        self.assertEqual(len(self.search_results), 10)
 
 
     def test_result_content(self):
         '''Each search result should contain three elements: title, link, and snippet.'''
 
-        for _, result in self.search_results.items():
+        for result in self.search_results.values():
             self.assertIsInstance(result, dict)
             self.assertIn('title', result)
             self.assertIn('url', result)
@@ -95,3 +96,48 @@ class TestGetWikipediaPage(unittest.TestCase):
         '''Page content should not be empty.'''
 
         self.assertTrue(len(self.page_content) > 0)
+
+
+class TestLibretextBookSearch(unittest.TestCase):
+    '''Tests for the libretext_book_search tool.'''
+
+    def setUp(self):
+        search_query = 'Introductory chemistry ck-12'
+        self.search_results = libretext_book_search(search_query)
+
+    def test_result_type(self):
+        '''Search results should be a dictionary.'''
+        self.assertIsInstance(self.search_results, dict)
+
+    def test_no_error(self):
+        '''Search results should not contain an error.'''
+        self.assertNotIn('error', self.search_results)
+
+    def test_result_content(self):
+        '''Each search result should contain title, url, and description if results found.'''
+        if len(self.search_results) > 0 and 'error' not in self.search_results:
+            for result in self.search_results.values():
+                self.assertIsInstance(result, dict)
+                self.assertIn('title', result)
+                self.assertIn('url', result)
+                self.assertIn('description', result)
+                self.assertIsInstance(result['title'], str)
+                self.assertIsInstance(result['url'], str)
+                self.assertIsInstance(result['description'], str)
+
+    def test_first_result_exists(self):
+        '''If results are found, the first result should have a meaningful title.'''
+        if len(self.search_results) > 0 and 'error' not in self.search_results:
+            first_result = next(iter(self.search_results.values()))
+            self.assertTrue(len(first_result['title']) > 3)
+
+    def test_result_urls_valid(self):
+        '''URLs should be properly formatted if present.'''
+        if len(self.search_results) > 0 and 'error' not in self.search_results:
+            for result in self.search_results.values():
+                if result['url']: # Only test non-empty URLs
+                    self.assertTrue(
+                        result['url'].startswith('http://') or
+                        result['url'].startswith('https://') or
+                        result['url'].startswith('/')
+                    )
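The new TestLibretextBookSearch class drives a real headless browser against the live LibreTexts site, so it is slower and more environment-dependent than the other tests. It can be run on its own with the standard unittest runner, for example: python -m unittest tests.test_tools.TestLibretextBookSearch -v from the repository root, or programmatically (assuming the tests package is importable):

    import unittest

    # Load and run only the Selenium-backed search tests
    suite = unittest.defaultTestLoader.loadTestsFromName(
        'tests.test_tools.TestLibretextBookSearch'
    )
    unittest.TextTestRunner(verbosity=2).run(suite)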