Updated Wikipedia search tools to fetch and parse Wikipedia pages as HTML so that tables and other non-text elements are visible to the agent. Allowed the agent to import BeautifulSoup.
Files changed:
- app.py +5 -5
- configuration.py +6 -1
- functions/agent.py +23 -10
- functions/tools.py +345 -4
- requirements.txt +5 -2
- tests/test_tools.py +59 -4

app.py
CHANGED
@@ -12,10 +12,10 @@ import pandas as pd
 from functions.agent import create_agent
 
 # --- Constants ---
-from configuration import DEFAULT_API_URL, INSTRUCTIONS
+from configuration import QUESTIONS, DEFAULT_API_URL, INSTRUCTIONS
 
 
-def run_and_submit_all( profile: gr.OAuthProfile | None):
+def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.
@@ -79,7 +79,8 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
 
     print(f'Running agent on {len(questions_data)} questions...')
 
-    for item in questions_data:
+    for question_number in QUESTIONS:
+        item = questions_data[question_number - 1] # Adjust for zero-based index
         task_id = item.get("task_id")
         question_text = item.get("question")
 
@@ -89,8 +90,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
 
         try:
            submitted_answer = agent.run(
-                ...
-                additional_args={'user_prompt': question_text}
+                question_text
            )
 
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
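
As a toy illustration (the data values here are invented, not from the repository), with QUESTIONS = [1] the new loop answers only the first fetched question:

QUESTIONS = [1]
questions_data = [
    {'task_id': 'task-a', 'question': 'First question?'},
    {'task_id': 'task-b', 'question': 'Second question?'},
]

for question_number in QUESTIONS:
    item = questions_data[question_number - 1]  # QUESTIONS uses 1-based numbering
    print(item['task_id'], item['question'])    # -> task-a First question?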

configuration.py
CHANGED
@@ -5,8 +5,13 @@ Contains API URLs and agent instructions used throughout the application.
 """
 # pylint: disable=line-too-long
 
+# Which questions to answer
+QUESTIONS = [1]
+
+# GAIA benchmark scoring API
 DEFAULT_API_URL = 'https://agents-course-unit4-scoring.hf.space'
 
+# Additional instructions for agent. See here: https://huggingface.co/spaces/gaia-benchmark/leaderboard
 INSTRUCTIONS = """
-YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+You are a general AI assistant. I will ask you a question. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 """
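
For example, under these rules a numeric answer should be written as '90210' rather than '90,210' or '$90,210', a city name as 'Saint Petersburg' rather than 'St. Petersburg', and a list answer as 'blue, 7, Saint Petersburg' with each element formatted by the same rules.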

functions/agent.py
CHANGED
@@ -1,24 +1,37 @@
 '''Agent definition for GAIA question answering system.'''
 
 # Imports for agent creation
-from smolagents import CodeAgent, InferenceClientModel, VisitWebpageTool
-from ...
-...
+from smolagents import CodeAgent, InferenceClientModel, VisitWebpageTool
+from functions.tools import (
+    google_search,
+    wikipedia_search,
+    get_wikipedia_page
+)
 
 def create_agent():
     '''Creates agent for GAIA question answering system.'''
 
-    wikipedia = Tool.from_langchain(
-        load_tools(["wikipedia"])[0]
-    )
-
     model = InferenceClientModel(
-        ...
+        # max_tokens=8096,
+        # temperature=0.5,
+        model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
+        provider='together'
+        # custom_role_conversions=None
     )
 
+    tools = [
+        google_search,
+        wikipedia_search,
+        get_wikipedia_page,
+        VisitWebpageTool()
+    ]
+
     agent = CodeAgent(
-        tools=...
-        model=model
+        tools=tools,
+        model=model,
+        max_steps=20,
+        planning_interval=2,
+        additional_authorized_imports=['bs4.*']
     )
 
     return agent
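
A minimal usage sketch of the new agent wiring, mirroring the call pattern in app.py above (the question string is just an example):

from functions.agent import create_agent

agent = create_agent()

# app.py passes each GAIA question string straight to agent.run()
answer = agent.run('How many studio albums did Mercedes Sosa release between 2000 and 2009?')
print(answer)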

functions/tools.py
CHANGED
@@ -1,10 +1,14 @@
 '''Tools for GAIA question answering agent.'''
 
+import bleach
+import requests
+from bleach.css_sanitizer import CSSSanitizer
+from bs4 import BeautifulSoup
 from smolagents import tool
 from googlesearch import search
 
 @tool
-def google_search(query: str) -> str:
+def google_search(query: str) -> dict:
     """
     Perform a Google search and return the top 10 results.
 
@@ -12,9 +16,346 @@ def google_search(query: str) -> str:
         query (str): The search query.
 
     Returns:
-        ...
+        dict: A dictionary containing the search results in the following format.
+        {0: {'title': str, 'url': str, 'description': str}, ...}
     """
 
-    ...
+    # Run the query
+    results = list(search(query, num_results=5, advanced=True))
 
-    ...
+    # Parse and format the results
+    parsed_results = {}
+
+    for i, result in enumerate(results):
+
+        parsed_results[i] = {
+            'title': result.title,
+            'url': result.url,
+            'description': result.description
+        }
+
+    return parsed_results
+
+
+@tool
+def wikipedia_search(query: str) -> dict:
+    """
+    Perform a search for wikipedia pages and return the top 5 results.
+
+    Args:
+        query (str): The search query.
+
+    Returns:
+        dict: A dictionary containing the search results in the following format.
+        {0: {'title': str, 'description': str}, ...}
+    """
+
+    language_code = 'en'
+    number_of_results = 5
+    headers = {
+        'User-Agent': 'HuggingFace Agents course final project (https://github.com/gperdrizet/unit-four-final-project)'
+    }
+
+    base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
+    endpoint = '/search/page'
+    url = base_url + language_code + endpoint
+    parameters = {'q': query, 'limit': number_of_results}
+    response = requests.get(url, headers=headers, params=parameters, timeout=15)
+
+    if response.status_code == 200:
+        results = response.json().get('pages', [])
+        parsed_results = {}
+
+    else:
+        return f"Error: Unable to retrieve page. Status code {response.status_code}"
+
+    for i, result in enumerate(results):
+
+        parsed_results[i] = {
+            'title': result.get('title', None),
+            'description': result.get('description', None)
+        }
+
+    return parsed_results
+
+
+@tool
+def get_wikipedia_page(query: str) -> str:
+    """
+    Get the content of a Wikipedia page as HTML.
+
+    Args:
+        query (str): The title of the Wikipedia page.
+
+    Returns:
+        str: The HTML content of the Wikipedia page.
+    """
+
+    fetcher = WikipediaFetcher()
+    html_result = fetcher.fetch(query.replace(' ', '_'))
+
+    content = html_result['content']
+    content = content.split('<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>')[0]
+    content = content.split('<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>')[0]
+
+    return content
+
+
+class WikipediaFetcher:
+    """Gets and cleans up Wikipedia pages."""
+
+    def fetch(self, page_name):
+        """
+        Passed a Wikipedia page's URL fragment, like
+        'Edward_Montagu,_1st_Earl_of_Sandwich', this will fetch the page's
+        main contents, tidy the HTML, strip out any elements we don't want
+        and return the final HTML string.
+
+        Returns a dict with two elements:
+        'success' is either True or, if we couldn't fetch the page, False.
+        'content' is the HTML if success==True, or else an error message.
+        """
+        result = self._get_html(page_name)
+
+        if result["success"]:
+            result["content"] = self._tidy_html(result["content"])
+
+        return result
+
+
+    def _get_html(self, page_name):
+        """
+        Passed the name of a Wikipedia page (eg, 'Samuel_Pepys'), it fetches
+        the HTML content (not the entire HTML page) and returns it.
+
+        Returns a dict with two elements:
+        'success' is either True or, if we couldn't fetch the page, False.
+        'content' is the HTML if success==True, or else an error message.
+        """
+        error_message = ""
+
+        url = f"https://en.wikipedia.org/wiki/{page_name}"
+
+        try:
+            response = requests.get(url, params={"action": "render"}, timeout=5)
+        except requests.exceptions.ConnectionError:
+            error_message = "Can't connect to domain."
+        except requests.exceptions.Timeout:
+            error_message = "Connection timed out."
+        except requests.exceptions.TooManyRedirects:
+            error_message = "Too many redirects."
+
+        try:
+            response.raise_for_status()
+        except requests.exceptions.HTTPError:
+            # 4xx or 5xx errors:
+            error_message = f"HTTP Error: {response.status_code}"
+        except NameError:
+            if error_message == "":
+                error_message = "Something unusual went wrong."
+
+        if error_message:
+            return {"success": False, "content": error_message}
+        else:
+            return {"success": True, "content": response.text}
+
+
+    def _tidy_html(self, html):
+        """
+        Passed the raw Wikipedia HTML, this returns valid HTML, with all
+        disallowed elements stripped out.
+        """
+        html = self._bleach_html(html)
+        html = self._strip_html(html)
+        return html
+
+
+    def _bleach_html(self, html):
+        """
+        Ensures we have valid HTML; no unclosed or mis-nested tags.
+        Removes any tags and attributes we don't want to let through.
+        Doesn't remove the contents of any disallowed tags.
+
+        Pass it an HTML string, it'll return the bleached HTML string.
+        """
+
+        # Pretty much most elements, but no forms or audio/video.
+        allowed_tags = {
+            "a",
+            "abbr",
+            "acronym",
+            "address",
+            "area",
+            "article",
+            "b",
+            "blockquote",
+            "br",
+            "caption",
+            "cite",
+            "code",
+            "col",
+            "colgroup",
+            "dd",
+            "del",
+            "dfn",
+            "div",
+            "dl",
+            "dt",
+            "em",
+            "figcaption",
+            "figure",
+            "footer",
+            "h1",
+            "h2",
+            "h3",
+            "h4",
+            "h5",
+            "h6",
+            "header",
+            "hgroup",
+            "hr",
+            "i",
+            "img",
+            "ins",
+            "kbd",
+            "li",
+            "map",
+            "nav",
+            "ol",
+            "p",
+            "pre",
+            "q",
+            "s",
+            "samp",
+            "section",
+            "small",
+            "span",
+            "strong",
+            "sub",
+            "sup",
+            "table",
+            "tbody",
+            "td",
+            "tfoot",
+            "th",
+            "thead",
+            "time",
+            "tr",
+            "ul",
+            "var",
+            # We allow script and style here, so we can close/un-mis-nest
+            # its tags, but then it's removed completely in _strip_html():
+            "script",
+            "style",
+        }
+
+        # These attributes will not be removed from any of the allowed tags.
+        allowed_attributes = {
+            "*": ["class", "id"],
+            "a": ["href", "title"],
+            "abbr": ["title"],
+            "acronym": ["title"],
+            "img": ["alt", "src", "srcset"],
+            # Ugh. Don't know why this page doesn't use .tright like others
+            # http://127.0.0.1:8000/encyclopedia/5040/
+            "table": ["align"],
+            "td": ["colspan", "rowspan", "style"],
+            "th": ["colspan", "rowspan", "scope"],
+        }
+
+        # These CSS properties are allowed within style attributes
+        # Added for the family tree on /encyclopedia/5825/
+        # Hopefully doesn't make anything else too hideous.
+        allowed_css_properties = [
+            "background",
+            "border",
+            "border-bottom",
+            "border-collapse",
+            "border-left",
+            "border-radius",
+            "border-right",
+            "border-spacing",
+            "border-top",
+            "height",
+            "padding",
+            "text-align",
+            "width",
+        ]
+
+        css_sanitizer = CSSSanitizer(allowed_css_properties=allowed_css_properties)
+
+        a = bleach.clean(
+            html,
+            tags=allowed_tags,
+            attributes=allowed_attributes,
+            css_sanitizer=css_sanitizer,
+            strip=True,
+        )
+
+        return a
+
+
+    def _strip_html(self, html):
+        """
+        Takes out any tags, and their contents, that we don't want at all.
+        And adds custom classes to existing tags (so we can apply CSS styles
+        without having to multiply our CSS).
+
+        Pass it an HTML string, it returns the stripped HTML string.
+        """
+
+        # CSS selectors. Strip these and their contents.
+        selectors = [
+            "div.hatnote",
+            "div.navbar.mini",  # Will also match div.mini.navbar
+            # Bottom of https://en.wikipedia.org/wiki/Charles_II_of_England :
+            "div.topicon",
+            "a.mw-headline-anchor",
+            "script",
+            "style",
+        ]
+
+        # Strip any element that has one of these classes.
+        classes = [
+            # "This article may be expanded with text translated from..."
+            # https://en.wikipedia.org/wiki/Afonso_VI_of_Portugal
+            "ambox-notice",
+            "magnify",
+            # eg audio on https://en.wikipedia.org/wiki/Bagpipes
+            "mediaContainer",
+            "navbox",
+            "noprint",
+        ]
+
+        # Any element has a class matching a key, it will have the classes
+        # in the value added.
+        add_classes = {
+            # Give these tables standard Bootstrap styles.
+            "infobox": ["table", "table-bordered"],
+            "ambox": ["table", "table-bordered"],
+            "wikitable": ["table", "table-bordered"],
+        }
+
+        soup = BeautifulSoup(html, "lxml")
+
+        for selector in selectors:
+            [tag.decompose() for tag in soup.select(selector)]
+
+        for clss in classes:
+            [tag.decompose() for tag in soup.find_all(attrs={"class": clss})]
+
+        for clss, new_classes in add_classes.items():
+            for tag in soup.find_all(attrs={"class": clss}):
+                tag["class"] = tag.get("class", []) + new_classes
+
+        # Depending on the HTML parser BeautifulSoup used, soup may have
+        # surrounding <html><body></body></html> or just <body></body> tags.
+        if soup.body:
+            soup = soup.body
+        elif soup.html:
+            soup = soup.html.body
+
+        # Put the content back into a string.
+        html = "".join(str(tag) for tag in soup.contents)
+
+        return html
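
Because get_wikipedia_page now returns sanitized HTML rather than plain text, tables survive the fetch. A minimal sketch of how the returned HTML can be parsed with BeautifulSoup, which is the same kind of parsing the agent can now do through its authorized bs4 import (the page title and table handling here are illustrative assumptions, not part of the committed code):

from bs4 import BeautifulSoup

from functions.tools import get_wikipedia_page

# Fetch the cleaned page HTML via the new tool
html = get_wikipedia_page('Mercedes Sosa')

# Pull the rows out of the first wikitable, if the page has one
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', class_='wikitable')

if table is not None:
    rows = [
        [cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])]
        for tr in table.find_all('tr')
    ]
    print(rows[:3])  # first few rows, typically the header plus two data rows

Inside the agent, get_wikipedia_page is available as a tool call, so the parsing code it writes would look the same minus the functions.tools import.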

requirements.txt
CHANGED
@@ -1,7 +1,10 @@
+bleach
 duckduckgo-search
 googlesearch-python
 gradio[oauth]
-langchain-community
 markdownify
+mwparserfromhell
 requests
-smolagents
+smolagents
+tinycss2
+wikipedia-api

tests/test_tools.py
CHANGED
@@ -1,8 +1,11 @@
 '''Unittests for agent tools.'''
 
 import unittest
-import ...
-...
+from functions.tools import (
+    google_search,
+    wikipedia_search,
+    get_wikipedia_page
+)
 
 
 class TestGoogleSearch(unittest.TestCase):
@@ -30,8 +33,6 @@ class TestGoogleSearch(unittest.TestCase):
     def test_result_content(self):
         '''Each search result should contain three elements: title, link, and snippet.'''
 
-        print(type(self.search_results[1]))
-
         for _, result in self.search_results.items():
             self.assertIsInstance(result, dict)
             self.assertIn('title', result)
@@ -40,3 +41,57 @@ class TestGoogleSearch(unittest.TestCase):
             self.assertIsInstance(result['title'], str)
             self.assertIsInstance(result['url'], str)
             self.assertIsInstance(result['description'], str)
+
+
+class TestWikipediaSearch(unittest.TestCase):
+    '''Tests for the wikipedia search tool.'''
+
+
+    def setUp(self):
+
+        wikipedia_search_query = 'Python programming language'
+        self.search_results = wikipedia_search(wikipedia_search_query)
+
+
+    def test_result_type(self):
+        '''Search results should be a dictionary.'''
+
+        self.assertIsInstance(self.search_results, dict)
+
+
+    def test_result_length(self):
+        '''Search results should contain 5 items.'''
+
+        self.assertEqual(len(self.search_results), 5)
+
+
+    def test_result_content(self):
+        '''Each search result should contain three elements: title, link, and snippet.'''
+
+        for _, result in self.search_results.items():
+            self.assertIsInstance(result, dict)
+            self.assertIn('title', result)
+            self.assertIn('description', result)
+            self.assertIsInstance(result['title'], str)
+            self.assertIsInstance(result['description'], str)
+
+
+class TestGetWikipediaPage(unittest.TestCase):
+    '''Tests for the get_wikipedia_page tool.'''
+
+
+    def setUp(self):
+
+        self.page_content = get_wikipedia_page('Mercedes Sosa')
+
+
+    def test_page_content_type(self):
+        '''Page content should be a string.'''
+
+        self.assertIsInstance(self.page_content, str)
+
+
+    def test_page_content_not_empty(self):
+        '''Page content should not be empty.'''
+
+        self.assertTrue(len(self.page_content) > 0)
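
Note that these tests exercise the live search and Wikipedia services, so they need network access; they run under the standard unittest runner (for example, something like python -m unittest discover tests from the repository root, depending on how the test package is laid out).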