gperdrizet committed
Commit 8b358c4 · verified · 1 Parent(s): 503c07a

Updated Wikipedia search tools to fetch and parse Wikipedia pages as HTML so that tables and other non-text elements are visible to the agent. Allowed the agent to import BeautifulSoup.

Files changed (6)
  1. app.py +5 -5
  2. configuration.py +6 -1
  3. functions/agent.py +23 -10
  4. functions/tools.py +345 -4
  5. requirements.txt +5 -2
  6. tests/test_tools.py +59 -4
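
The diffs below all serve the change described in the commit message: get_wikipedia_page now returns sanitized page HTML instead of plain text, and the agent is authorized to import bs4, so agent-written code can read wikitables directly. A minimal sketch of that flow, assuming only the tool signatures introduced in functions/tools.py below (the query and the table handling are illustrative, not part of the commit):

# Illustrative sketch of what the CodeAgent can now write itself.
from bs4 import BeautifulSoup  # permitted via additional_authorized_imports=['bs4.*']
from functions.tools import wikipedia_search, get_wikipedia_page

hits = wikipedia_search('Mercedes Sosa')         # {0: {'title': ..., 'description': ...}, ...}
html = get_wikipedia_page(hits[0]['title'])      # sanitized HTML, References/Further reading trimmed

soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', class_='wikitable')   # tables survive the HTML clean-up
rows = []
if table is not None:
    for tr in table.find_all('tr'):
        rows.append([cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])])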
app.py CHANGED
@@ -12,10 +12,10 @@ import pandas as pd
 from functions.agent import create_agent

 # --- Constants ---
-from configuration import DEFAULT_API_URL, INSTRUCTIONS
+from configuration import QUESTIONS, DEFAULT_API_URL, INSTRUCTIONS


-def run_and_submit_all( profile: gr.OAuthProfile | None):
+def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.
@@ -79,7 +79,8 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):

     print(f'Running agent on {len(questions_data)} questions...')

-    for item in questions_data:
+    for question_number in QUESTIONS:
+        item = questions_data[question_number - 1] # Adjust for zero-based index
         task_id = item.get("task_id")
         question_text = item.get("question")

@@ -89,8 +90,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):

         try:
             submitted_answer = agent.run(
-                INSTRUCTIONS,
-                additional_args={'user_prompt': question_text}
+                question_text
             )

             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
configuration.py CHANGED
@@ -5,8 +5,13 @@ Contains API URLs and agent instructions used throughout the application.
 """
 # pylint: disable=line-too-long

+# Which questions to answer
+QUESTIONS = [1]
+
+# GAIA benchmark scoring API
 DEFAULT_API_URL = 'https://agents-course-unit4-scoring.hf.space'

+# Additional instructions for agent. See here: https://huggingface.co/spaces/gaia-benchmark/leaderboard
 INSTRUCTIONS = """
-YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+You are a general AI assistant. I will ask you a question. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 """
functions/agent.py CHANGED
@@ -1,24 +1,37 @@
 '''Agent definition for GAIA question answering system.'''

 # Imports for agent creation
-from smolagents import CodeAgent, InferenceClientModel, VisitWebpageTool, Tool
-from langchain_community.agent_toolkits.load_tools import load_tools
-from functions.tools import google_search
+from smolagents import CodeAgent, InferenceClientModel, VisitWebpageTool
+from functions.tools import (
+    google_search,
+    wikipedia_search,
+    get_wikipedia_page
+)

 def create_agent():
     '''Creates agent for GAIA question answering system.'''

-    wikipedia = Tool.from_langchain(
-        load_tools(["wikipedia"])[0]
-    )
-
     model = InferenceClientModel(
-        "Qwen/Qwen2.5-Coder-32B-Instruct"
+        # max_tokens=8096,
+        # temperature=0.5,
+        model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
+        provider='together'
+        # custom_role_conversions=None
     )

+    tools = [
+        google_search,
+        wikipedia_search,
+        get_wikipedia_page,
+        VisitWebpageTool()
+    ]
+
     agent = CodeAgent(
-        tools=[wikipedia, google_search, VisitWebpageTool()],
-        model=model
+        tools=tools,
+        model=model,
+        max_steps=20,
+        planning_interval=2,
+        additional_authorized_imports=['bs4.*']
     )

     return agent
functions/tools.py CHANGED
@@ -1,10 +1,14 @@
 '''Tools for GAIA question answering agent.'''

+import bleach
+import requests
+from bleach.css_sanitizer import CSSSanitizer
+from bs4 import BeautifulSoup
 from smolagents import tool
 from googlesearch import search

 @tool
-def google_search(query: str) -> str:
+def google_search(query: str) -> dict:
     """
     Perform a Google search and return the top 10 results.

@@ -12,9 +16,346 @@ def google_search(query: str) -> str:
         query (str): The search query.

     Returns:
-        str: The URLs of the top search results, separated by newlines.
+        dict: A dictionary containing the search results in the following format.
+        {0: {'title': str, 'url': str, 'description': str}, ...}
     """

-    results = list(search(query, num_results=10, advanced=True))
+    # Run the query
+    results = list(search(query, num_results=5, advanced=True))

-    return results
+    # Parse and format the results
+    parsed_results = {}
+
+    for i, result in enumerate(results):
+
+        parsed_results[i] = {
+            'title': result.title,
+            'url': result.url,
+            'description': result.description
+        }
+
+    return parsed_results
+
+
+@tool
+def wikipedia_search(query: str) -> dict:
+    """
+    Perform a search for wikipedia pages and return the top 5 results.
+
+    Args:
+        query (str): The search query.
+
+    Returns:
+        dict: A dictionary containing the search results in the following format.
+        {0: {'title': str, 'description': str}, ...}
+    """
+
+    language_code = 'en'
+    number_of_results = 5
+    headers = {
+        'User-Agent': 'HuggingFace Agents course final project (https://github.com/gperdrizet/unit-four-final-project)'
+    }
+
+    base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
+    endpoint = '/search/page'
+    url = base_url + language_code + endpoint
+    parameters = {'q': query, 'limit': number_of_results}
+    response = requests.get(url, headers=headers, params=parameters, timeout=15)
+
+    if response.status_code == 200:
+        results = response.json().get('pages', [])
+        parsed_results = {}
+
+    else:
+        return f"Error: Unable to retrieve page. Status code {response.status_code}"
+
+    for i, result in enumerate(results):
+
+        parsed_results[i] = {
+            'title': result.get('title', None),
+            'description': result.get('description', None)
+        }
+
+    return parsed_results
+
+
+@tool
+def get_wikipedia_page(query: str) -> str:
+    """
+    Get the content of a Wikipedia page as HTML.
+
+    Args:
+        query (str): The title of the Wikipedia page.
+
+    Returns:
+        str: The HTML content of the Wikipedia page.
+    """
+
+    fetcher = WikipediaFetcher()
+    html_result = fetcher.fetch(query.replace(' ', '_'))
+
+    content = html_result['content']
+    content = content.split('<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>')[0]
+    content = content.split('<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>')[0]
+
+    return content
+
+
+class WikipediaFetcher:
+    """Gets and cleans up Wikipedia pages."""
+
+    def fetch(self, page_name):
+        """
+        Passed a Wikipedia page's URL fragment, like
+        'Edward_Montagu,_1st_Earl_of_Sandwich', this will fetch the page's
+        main contents, tidy the HTML, strip out any elements we don't want
+        and return the final HTML string.
+
+        Returns a dict with two elements:
+        'success' is either True or, if we couldn't fetch the page, False.
+        'content' is the HTML if success==True, or else an error message.
+        """
+        result = self._get_html(page_name)
+
+        if result["success"]:
+            result["content"] = self._tidy_html(result["content"])
+
+        return result
+
+
+    def _get_html(self, page_name):
+        """
+        Passed the name of a Wikipedia page (eg, 'Samuel_Pepys'), it fetches
+        the HTML content (not the entire HTML page) and returns it.
+
+        Returns a dict with two elements:
+        'success' is either True or, if we couldn't fetch the page, False.
+        'content' is the HTML if success==True, or else an error message.
+        """
+        error_message = ""
+
+        url = f"https://en.wikipedia.org/wiki/{page_name}"
+
+        try:
+            response = requests.get(url, params={"action": "render"}, timeout=5)
+        except requests.exceptions.ConnectionError:
+            error_message = "Can't connect to domain."
+        except requests.exceptions.Timeout:
+            error_message = "Connection timed out."
+        except requests.exceptions.TooManyRedirects:
+            error_message = "Too many redirects."
+
+        try:
+            response.raise_for_status()
+        except requests.exceptions.HTTPError:
+            # 4xx or 5xx errors:
+            error_message = f"HTTP Error: {response.status_code}"
+        except NameError:
+            if error_message == "":
+                error_message = "Something unusual went wrong."
+
+        if error_message:
+            return {"success": False, "content": error_message}
+        else:
+            return {"success": True, "content": response.text}
+
+
+    def _tidy_html(self, html):
+        """
+        Passed the raw Wikipedia HTML, this returns valid HTML, with all
+        disallowed elements stripped out.
+        """
+        html = self._bleach_html(html)
+        html = self._strip_html(html)
+        return html
+
+
+    def _bleach_html(self, html):
+        """
+        Ensures we have valid HTML; no unclosed or mis-nested tags.
+        Removes any tags and attributes we don't want to let through.
+        Doesn't remove the contents of any disallowed tags.
+
+        Pass it an HTML string, it'll return the bleached HTML string.
+        """
+
+        # Pretty much most elements, but no forms or audio/video.
+        allowed_tags = {
+            "a",
+            "abbr",
+            "acronym",
+            "address",
+            "area",
+            "article",
+            "b",
+            "blockquote",
+            "br",
+            "caption",
+            "cite",
+            "code",
+            "col",
+            "colgroup",
+            "dd",
+            "del",
+            "dfn",
+            "div",
+            "dl",
+            "dt",
+            "em",
+            "figcaption",
+            "figure",
+            "footer",
+            "h1",
+            "h2",
+            "h3",
+            "h4",
+            "h5",
+            "h6",
+            "header",
+            "hgroup",
+            "hr",
+            "i",
+            "img",
+            "ins",
+            "kbd",
+            "li",
+            "map",
+            "nav",
+            "ol",
+            "p",
+            "pre",
+            "q",
+            "s",
+            "samp",
+            "section",
+            "small",
+            "span",
+            "strong",
+            "sub",
+            "sup",
+            "table",
+            "tbody",
+            "td",
+            "tfoot",
+            "th",
+            "thead",
+            "time",
+            "tr",
+            "ul",
+            "var",
+            # We allow script and style here, so we can close/un-mis-nest
+            # its tags, but then it's removed completely in _strip_html():
+            "script",
+            "style",
+        }
+
+        # These attributes will not be removed from any of the allowed tags.
+        allowed_attributes = {
+            "*": ["class", "id"],
+            "a": ["href", "title"],
+            "abbr": ["title"],
+            "acronym": ["title"],
+            "img": ["alt", "src", "srcset"],
+            # Ugh. Don't know why this page doesn't use .tright like others
+            # http://127.0.0.1:8000/encyclopedia/5040/
+            "table": ["align"],
+            "td": ["colspan", "rowspan", "style"],
+            "th": ["colspan", "rowspan", "scope"],
+        }
+
+        # These CSS properties are allowed within style attributes
+        # Added for the family tree on /encyclopedia/5825/
+        # Hopefully doesn't make anything else too hideous.
+        allowed_css_properties = [
+            "background",
+            "border",
+            "border-bottom",
+            "border-collapse",
+            "border-left",
+            "border-radius",
+            "border-right",
+            "border-spacing",
+            "border-top",
+            "height",
+            "padding",
+            "text-align",
+            "width",
+        ]
+
+        css_sanitizer = CSSSanitizer(allowed_css_properties=allowed_css_properties)
+
+        a = bleach.clean(
+            html,
+            tags=allowed_tags,
+            attributes=allowed_attributes,
+            css_sanitizer=css_sanitizer,
+            strip=True,
+        )
+
+        return a
+
+
+    def _strip_html(self, html):
+        """
+        Takes out any tags, and their contents, that we don't want at all.
+        And adds custom classes to existing tags (so we can apply CSS styles
+        without having to multiply our CSS).
+
+        Pass it an HTML string, it returns the stripped HTML string.
+        """
+
+        # CSS selectors. Strip these and their contents.
+        selectors = [
+            "div.hatnote",
+            "div.navbar.mini",  # Will also match div.mini.navbar
+            # Bottom of https://en.wikipedia.org/wiki/Charles_II_of_England :
+            "div.topicon",
+            "a.mw-headline-anchor",
+            "script",
+            "style",
+        ]
+
+        # Strip any element that has one of these classes.
+        classes = [
+            # "This article may be expanded with text translated from..."
+            # https://en.wikipedia.org/wiki/Afonso_VI_of_Portugal
+            "ambox-notice",
+            "magnify",
+            # eg audio on https://en.wikipedia.org/wiki/Bagpipes
+            "mediaContainer",
+            "navbox",
+            "noprint",
+        ]
+
+        # Any element has a class matching a key, it will have the classes
+        # in the value added.
+        add_classes = {
+            # Give these tables standard Bootstrap styles.
+            "infobox": ["table", "table-bordered"],
+            "ambox": ["table", "table-bordered"],
+            "wikitable": ["table", "table-bordered"],
+        }
+
+        soup = BeautifulSoup(html, "lxml")
+
+        for selector in selectors:
+            [tag.decompose() for tag in soup.select(selector)]
+
+        for clss in classes:
+            [tag.decompose() for tag in soup.find_all(attrs={"class": clss})]
+
+        for clss, new_classes in add_classes.items():
+            for tag in soup.find_all(attrs={"class": clss}):
+                tag["class"] = tag.get("class", []) + new_classes
+
+        # Depending on the HTML parser BeautifulSoup used, soup may have
+        # surrounding <html><body></body></html> or just <body></body> tags.
+        if soup.body:
+            soup = soup.body
+        elif soup.html:
+            soup = soup.html.body
+
+        # Put the content back into a string.
+        html = "".join(str(tag) for tag in soup.contents)
+
+        return html
requirements.txt CHANGED
@@ -1,7 +1,10 @@
+bleach
 duckduckgo-search
 googlesearch-python
 gradio[oauth]
-langchain-community
 markdownify
+mwparserfromhell
 requests
-smolagents
+smolagents
+tinycss2
+wikipedia-api
tests/test_tools.py CHANGED
@@ -1,8 +1,11 @@
 '''Unittests for agent tools.'''

 import unittest
-import googlesearch
-from functions.tools import google_search
+from functions.tools import (
+    google_search,
+    wikipedia_search,
+    get_wikipedia_page
+)


 class TestGoogleSearch(unittest.TestCase):
@@ -30,8 +33,6 @@ class TestGoogleSearch(unittest.TestCase):
     def test_result_content(self):
         '''Each search result should contain three elements: title, link, and snippet.'''

-        print(type(self.search_results[1]))
-
         for _, result in self.search_results.items():
             self.assertIsInstance(result, dict)
             self.assertIn('title', result)
@@ -40,3 +41,57 @@ class TestGoogleSearch(unittest.TestCase):
             self.assertIsInstance(result['title'], str)
             self.assertIsInstance(result['url'], str)
             self.assertIsInstance(result['description'], str)
+
+
+class TestWikipediaSearch(unittest.TestCase):
+    '''Tests for the wikipedia search tool.'''
+
+
+    def setUp(self):
+
+        wikipedia_search_query = 'Python programming language'
+        self.search_results = wikipedia_search(wikipedia_search_query)
+
+
+    def test_result_type(self):
+        '''Search results should be a dictionary.'''
+
+        self.assertIsInstance(self.search_results, dict)
+
+
+    def test_result_length(self):
+        '''Search results should contain 5 items.'''
+
+        self.assertEqual(len(self.search_results), 5)
+
+
+    def test_result_content(self):
+        '''Each search result should contain three elements: title, link, and snippet.'''
+
+        for _, result in self.search_results.items():
+            self.assertIsInstance(result, dict)
+            self.assertIn('title', result)
+            self.assertIn('description', result)
+            self.assertIsInstance(result['title'], str)
+            self.assertIsInstance(result['description'], str)
+
+
+class TestGetWikipediaPage(unittest.TestCase):
+    '''Tests for the get_wikipedia_page tool.'''
+
+
+    def setUp(self):
+
+        self.page_content = get_wikipedia_page('Mercedes Sosa')
+
+
+    def test_page_content_type(self):
+        '''Page content should be a string.'''
+
+        self.assertIsInstance(self.page_content, str)
+
+
+    def test_page_content_not_empty(self):
+        '''Page content should not be empty.'''
+
+        self.assertTrue(len(self.page_content) > 0)
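
The new test classes call the live search and Wikipedia tools in setUp, so they need network access. One possible way to run just this module, assuming it is invoked from the repository root so that functions.tools resolves:

# Discover and run tests/test_tools.py with unittest's standard loader.
import unittest

suite = unittest.defaultTestLoader.discover('tests', pattern='test_tools.py')
unittest.TextTestRunner(verbosity=2).run(suite)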