Switched to a single agent powered by GPT-4.1 and added a step-wait function to avoid hitting the OpenAI API rate limit.
Files changed:
- configuration.py +30 -4
- functions/agent.py +66 -116
- functions/agent_helper_functions.py +142 -0
- functions/tool_helper_functions.py +286 -3
- functions/tools.py +3 -264
- results.csv +4 -4
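
The rate-limit fix described in the commit message works by registering a callback that smolagents invokes after every agent step. The real implementation is in functions/agent_helper_functions.py below; the following is only a minimal, hypothetical sketch of the pattern, assuming the smolagents callback signature used throughout this commit (callback receives the memory step and the agent):

# Sketch only: a per-step pause registered via step_callbacks.
import time

STEP_WAIT = 60  # seconds to pause after each step (the value this commit uses)

def step_wait(memory_step, agent):
    # Sleeping between steps caps the request rate at roughly one
    # model call per STEP_WAIT seconds for the main agent loop.
    time.sleep(STEP_WAIT)

# Registered when building the agent, e.g. CodeAgent(..., step_callbacks=[step_wait])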
configuration.py
CHANGED
@@ -1,8 +1,7 @@
-"""
-Configuration constants for the GAIA agent project.
-Contains API URLs and agent instructions used throughout the application.
-"""
+"""Configuration constants for the GAIA agent project."""
+
+from smolagents import OpenAIServerModel, InferenceClientModel
 
 # pylint: disable=line-too-long
 
 # Which questions to answer
@@ -15,3 +14,30 @@ DEFAULT_API_URL = 'https://agents-course-unit4-scoring.hf.space'
 INSTRUCTIONS = """
 You are a general AI assistant. I will ask you a question. Your final answer should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string. Submit the final answer via the final_answer tool.
 """
+
+# Agent model definitions
+MANAGER_MODEL = InferenceClientModel(
+    "deepseek-ai/DeepSeek-V3",
+    provider="together",
+    max_tokens=64000
+)
+
+WORKER_MODEL = InferenceClientModel(
+    "deepseek-ai/DeepSeek-V3",
+    provider="together",
+    max_tokens=64000
+)
+
+CHECK_MODEL = InferenceClientModel(
+    "deepseek-ai/DeepSeek-V3",
+    provider="together",
+    max_tokens=64000
+)
+
+MODEL = OpenAIServerModel(
+    model_id="gpt-4.1",
+    max_tokens=8000
+)
+
+TOKEN_LIMITER = 5000
+STEP_WAIT = 60
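
For orientation, a smolagents model object like MODEL is called directly with a list of chat messages and returns a message whose text is in .content (the same pattern check_reasoning() uses further down with CHECK_MODEL). A small hedged sketch, assuming OPENAI_API_KEY is set in the environment for OpenAIServerModel:

# Hypothetical usage sketch of the MODEL constant defined above.
from configuration import MODEL

messages = [
    {'role': 'user', 'content': [{'type': 'text', 'text': 'Reply with the word ready.'}]}
]

# The model object is callable; the reply text is on .content
print(MODEL(messages).content)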
functions/agent.py
CHANGED
@@ -1,141 +1,91 @@
 '''Agent definition for GAIA question answering system.'''
 
 # Standard library
-import os
-import json
 import logging
 
-from openai import OpenAI
-
 # Imports for agent creation
-from smolagents import CodeAgent,
+from smolagents import CodeAgent, VisitWebpageTool
+
 from functions.tools import (
     google_search,
     wikipedia_search,
-    get_wikipedia_page
+    get_wikipedia_page,
+    libretext_book_search,
+    get_libretext_book
 )
 
+from functions.agent_helper_functions import step_memory_cap, step_wait
+from configuration import MODEL
+
 # Get logger for this module
 logger = logging.getLogger(__name__)
 
 def create_agent():
     '''Creates agent for GAIA question answering system.'''
 
+    # web_agent = CodeAgent(
+    #     model=WORKER_MODEL,
+    #     tools=[google_search, VisitWebpageTool()],
+    #     additional_authorized_imports=['bs4.*', 'json'],
+    #     step_callbacks=[step_memory_cap],
+    #     name="web_agent",
+    #     verbosity_level=5,
+    #     max_steps=10,
+    #     planning_interval=5,
+    #     description="Web search agent for general queries and retrieving web pages as HTML",
+    # )
+
+    # wikipedia_agent = CodeAgent(
+    #     model=WORKER_MODEL,
+    #     tools=[wikipedia_search, get_wikipedia_page],
+    #     additional_authorized_imports=['bs4.*', 'json'],
+    #     step_callbacks=[step_memory_cap],
+    #     name="wikipedia_agent",
+    #     verbosity_level=5,
+    #     max_steps=10,
+    #     planning_interval=5,
+    #     description="Wikipedia agent to search and retrieve Wikipedia pages as HTML",
+    # )
+
+    # libretext_agent = CodeAgent(
+    #     model=WORKER_MODEL,
+    #     tools=[libretext_book_search, get_libretext_book],
+    #     additional_authorized_imports=['bs4.*', 'json'],
+    #     step_callbacks=[step_memory_cap],
+    #     name="libretext_agent",
+    #     verbosity_level=5,
+    #     max_steps=10,
+    #     planning_interval=5,
+    #     description="LibreText agent to search and retrieve content from academic textbooks books",
+    # )
+
+    # manager_agent = CodeAgent(
+    #     model=MANAGER_MODEL,
+    #     tools=[],
+    #     managed_agents=[web_agent, wikipedia_agent, libretext_agent],
+    #     additional_authorized_imports=['bs4.*', 'json'],
+    #     planning_interval=2,
+    #     verbosity_level=2,
+    #     final_answer_checks=[check_reasoning],
+    #     max_steps=20,
+    # )
 
     agent = CodeAgent(
-        model=
-        tools=
+        model=MODEL,
+        tools=[
+            google_search,
+            VisitWebpageTool(),
+            wikipedia_search,
+            get_wikipedia_page,
+            libretext_book_search,
+            get_libretext_book
+        ],
         additional_authorized_imports=['bs4.*', 'json'],
-        step_callbacks=[step_memory_cap],
+        step_callbacks=[step_memory_cap, step_wait],
         name="GAIA_agent",
         verbosity_level=5,
-        max_steps=
-        planning_interval=
-        description="GAIA agent for question answering"
+        max_steps=20,
+        planning_interval=5
     )
 
     return agent
-
-def step_memory_cap(memory_step: ActionStep, agent: CodeAgent) -> None:
-def summarize_old_messages(messages: dict) -> dict:

(The bodies of these two module-level helpers were deleted from this file; they reappear, lightly revised, in functions/agent_helper_functions.py below, with the hard-coded 50000-token threshold replaced by the TOKEN_LIMITER constant.)
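
A likely way the rest of the app drives this agent, sketched for context rather than taken from this diff (agent.run() is the standard smolagents entry point; the question text is a placeholder):

# Hypothetical driver sketch: build the agent and answer one GAIA question.
from functions.agent import create_agent

agent = create_agent()
answer = agent.run('Example GAIA question goes here')  # placeholder task text
print(answer)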
functions/agent_helper_functions.py
ADDED
@@ -0,0 +1,142 @@

'''Helper functions for the agent(s) in the GAIA question answering system.'''

import os
import time
import json
import logging
from openai import OpenAI
from smolagents import CodeAgent, ActionStep, MessageRole
from configuration import CHECK_MODEL, TOKEN_LIMITER, STEP_WAIT

# Get logger for this module
logger = logging.getLogger(__name__)


def check_reasoning(final_answer:str, agent_memory):
    """Checks the reasoning and plot of the agent's final answer."""

    prompt = (
        f"Here is a user-given task and the agent steps: {agent_memory.get_succinct_steps()}. " +
        "Please check that the reasoning process and answer are correct. " +
        "Do they correctly answer the given task? " +
        "First list reasons why yes/no, then write your final decision: " +
        "PASS in caps lock if it is satisfactory, FAIL if it is not. " +
        f"Final answer: {str(final_answer)}"
    )

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt,
                }
            ],
        }
    ]

    output = CHECK_MODEL(messages).content
    print("Feedback: ", output)

    if "FAIL" in output:
        raise Exception(output) # pylint:disable=broad-exception-raised

    return True


def step_memory_cap(memory_step: ActionStep, agent: CodeAgent) -> None:
    '''Removes old steps from agent memory to keep context length under control.'''

    task_step = agent.memory.steps[0]
    planning_step = agent.memory.steps[1]
    latest_step = agent.memory.steps[-1]

    if len(agent.memory.steps) > 2:
        agent.memory.steps = [task_step, planning_step, latest_step]

    logger.info('Agent memory has %d steps', len(agent.memory.steps))
    logger.info('Latest step is step %d', memory_step.step_number)
    logger.info('Contains: %s messages', len(agent.memory.steps[-1].model_input_messages))
    logger.info('Token usage: %s', agent.memory.steps[-1].token_usage.total_tokens)

    for message in agent.memory.steps[-1].model_input_messages:
        logger.debug(' Role: %s: %s', message['role'], message['content'][:100])

    token_usage = agent.memory.steps[-1].token_usage.total_tokens

    if token_usage > TOKEN_LIMITER:
        logger.info('Token usage is %d, summarizing old messages', token_usage)

        summary = summarize_old_messages(
            agent.memory.steps[-1].model_input_messages[1:]
        )

        if summary is not None:

            new_messages = [agent.memory.steps[-1].model_input_messages[0]]
            new_messages.append({
                'role': MessageRole.USER,
                'content': [{
                    'type': 'text',
                    'text': f'Here is a summary of your investigation so far: {summary}'
                }]
            })
            agent.memory.steps = [agent.memory.steps[0]]
            agent.memory.steps[0].model_input_messages = new_messages

            for message in agent.memory.steps[0].model_input_messages:
                logger.debug(' Role: %s: %s', message['role'], message['content'][:100])


def summarize_old_messages(messages: dict) -> dict:
    '''Summarizes old messages to keep context length under control.'''

    client = OpenAI(api_key=os.environ['MODAL_API_KEY'])

    client.base_url = (
        'https://gperdrizet--vllm-openai-compatible-summarization-serve.modal.run/v1'
    )

    # Default to first avalible model
    model = client.models.list().data[0]
    model_id = model.id

    messages = [
        {
            'role': 'system',
            'content': ('Summarize the following interaction between an AI agent and a user.' +
                        f'Return the summary formatted as text, not as JSON: {json.dumps(messages)}')
        }
    ]

    completion_args = {
        'model': model_id,
        'messages': messages,
    }

    try:
        response = client.chat.completions.create(**completion_args)

    except Exception as e: # pylint: disable=broad-exception-caught
        response = None
        logger.error('Error during Modal API call: %s', e)

    if response is not None:
        summary = response.choices[0].message.content

    else:
        summary = None

    return summary


def step_wait(memory_step: ActionStep, agent: CodeAgent) -> None:
    '''Waits for a while to prevent hitting API rate limits.'''

    logger.info('Waiting for %d seconds to prevent hitting API rate limits', STEP_WAIT)
    logger.info('Current step is %d', memory_step.step_number)
    logger.info('Current agent has %d steps', len(agent.memory.steps))

    time.sleep(STEP_WAIT)

    return True
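
check_reasoning() is not wired into the single agent in this commit, but the commented-out manager_agent in functions/agent.py shows the intended hookup via final_answer_checks. A hedged sketch of that wiring, reusing the same parameter name from the commented-out code:

# Sketch only: register check_reasoning as a final-answer check, so a FAIL
# verdict raises and forces the agent to keep working.
from smolagents import CodeAgent
from functions.agent_helper_functions import check_reasoning
from configuration import MODEL

checked_agent = CodeAgent(
    model=MODEL,
    tools=[],
    final_answer_checks=[check_reasoning],
)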
functions/tool_helper_functions.py
CHANGED
@@ -1,9 +1,11 @@
 '''Helper functions for GAIA question answering agent tools.'''
 
+import requests
 import time
 import logging
-import
+import bleach
 from bs4 import BeautifulSoup
+from bleach.css_sanitizer import CSSSanitizer
 
 # Get logger for this module
 logger = logging.getLogger(__name__)
@@ -52,10 +54,12 @@ def libretext_book_parser(url: str) -> dict:
         chapter_count = 0
 
         for listing in chapter_listings:
+
             # Extract the link element
             link = listing.find('a', class_='mt-sortable-listing-link')
 
             if link:
+
                 # Extract title from the span with class 'mt-sortable-listing-title'
                 title_span = link.find('span', class_='mt-sortable-listing-title')
                 title = title_span.get_text(strip=True) if title_span else ''
@@ -69,11 +73,13 @@ def libretext_book_parser(url: str) -> dict:
                 # Clean up description - remove the title prefix if it appears
                 if description and title and description.startswith(title):
                     description = description[len(title):].strip()
+
                     if description.startswith(':'):
                         description = description[1:].strip()
 
                 # Only add meaningful chapters (skip empty titles or very short ones)
                 if title and len(title) > 2:
+
                     parsed_chapters[chapter_count] = {
                         'title': title,
                         'url': chapter_url,
@@ -85,14 +91,17 @@ def libretext_book_parser(url: str) -> dict:
                     chapter_count += 1
 
         logger.info('Successfully extracted %d chapters from book', len(parsed_chapters))
+
        return parsed_chapters
 
    except requests.exceptions.RequestException as e:
        logger.error('Request error while fetching book page: %s', str(e))
+
        return {'error': f'Request error: {str(e)}'}
 
    except Exception as e: # pylint:disable=broad-exception-caught
        logger.error('Unexpected error in book parser: %s', str(e))
+
        return {'error': f'Unexpected error: {str(e)}'}
@@ -233,40 +242,52 @@ def save_libretext_book_as_markdown(book_data: dict, filename: str = None, sourc
 
        # Table of contents
        chapters = book_data.get('chapters', {})
+
        if chapters:
            markdown_content.append("## Table of Contents\n")
+
            for chapter_title in chapters.keys():
+
                # Create anchor link for the chapter
-                anchor = chapter_title.lower().replace(
+                anchor = chapter_title.lower().replace(
+                    ' ',
+                    '-'
+                ).replace(':', '').replace('(', '').replace(')', '')
+
                markdown_content.append(f"- [{chapter_title}](#{anchor})\n")
            markdown_content.append("\n---\n\n")
 
        # Chapter content
        for chapter_title, chapter_data in chapters.items():
+
            # Chapter heading
            markdown_content.append(f"## {chapter_title}\n\n")
 
            sections = chapter_data.get('sections', {})
 
            if not sections:
+
                markdown_content.append("*No sections found for this chapter.*\n\n")
                continue
 
            # Section content
            for section_title, section_data in sections.items():
+
                # Section heading
                markdown_content.append(f"### {section_title}\n\n")
 
                # Section URL
                section_url = section_data.get('Section url', '')
+
                if section_url:
                    markdown_content.append(f"**URL:** [{section_url}]({section_url})\n\n")
 
                # Section summary
                section_summary = section_data.get('Section summary', '')
+
                if section_summary:
                    markdown_content.append(f"{section_summary}\n\n")
-
+
                    markdown_content.append("*No summary available.*\n\n")
 
                markdown_content.append("---\n\n")
@@ -277,9 +298,271 @@ def save_libretext_book_as_markdown(book_data: dict, filename: str = None, sourc
 
        success_msg = f"Successfully saved LibreTexts book as markdown file: {filename}"
        logger.info(success_msg)
+
        return success_msg
 
    except Exception as e: # pylint:disable=broad-exception-caught
        error_msg = f"Error saving markdown file: {str(e)}"
        logger.error(error_msg)
+
        return error_msg

The remainder of this hunk adds the WikipediaFetcher class, moved here from functions/tools.py:

class WikipediaFetcher:
    """Gets and cleans up Wikipedia pages."""

    def fetch(self, page_name):
        """
        Passed a Wikipedia page's URL fragment, like
        'Edward_Montagu,_1st_Earl_of_Sandwich', this will fetch the page's
        main contents, tidy the HTML, strip out any elements we don't want
        and return the final HTML string.

        Returns a dict with two elements:
        'success' is either True or, if we couldn't fetch the page, False.
        'content' is the HTML if success==True, or else an error message.
        """
        result = self._get_html(page_name)

        if result["success"]:
            result["content"] = self._tidy_html(result["content"])

        return result

    def _get_html(self, page_name):
        """
        Passed the name of a Wikipedia page (eg, 'Samuel_Pepys'), it fetches
        the HTML content (not the entire HTML page) and returns it.

        Returns a dict with two elements:
        'success' is either True or, if we couldn't fetch the page, False.
        'content' is the HTML if success==True, or else an error message.
        """
        error_message = ""

        url = f"https://en.wikipedia.org/wiki/{page_name}"

        try:
            response = requests.get(url, params={"action": "render"}, timeout=5)
        except requests.exceptions.ConnectionError:
            error_message = "Can't connect to domain."
        except requests.exceptions.Timeout:
            error_message = "Connection timed out."
        except requests.exceptions.TooManyRedirects:
            error_message = "Too many redirects."

        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError:
            # 4xx or 5xx errors:
            error_message = f"HTTP Error: {response.status_code}"
        except NameError:
            if error_message == "":
                error_message = "Something unusual went wrong."

        if error_message:
            return {"success": False, "content": error_message}
        else:
            return {"success": True, "content": response.text}

    def _tidy_html(self, html):
        """
        Passed the raw Wikipedia HTML, this returns valid HTML, with all
        disallowed elements stripped out.
        """
        html = self._bleach_html(html)
        html = self._strip_html(html)
        return html

    def _bleach_html(self, html):
        """
        Ensures we have valid HTML; no unclosed or mis-nested tags.
        Removes any tags and attributes we don't want to let through.
        Doesn't remove the contents of any disallowed tags.

        Pass it an HTML string, it'll return the bleached HTML string.
        """

        # Pretty much most elements, but no forms or audio/video.
        allowed_tags = {
            "a", "abbr", "acronym", "address", "area", "article", "b",
            "blockquote", "br", "caption", "cite", "code", "col", "colgroup",
            "dd", "del", "dfn", "div", "dl", "dt", "em", "figcaption",
            "figure", "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header",
            "hgroup", "hr", "i", "img", "ins", "kbd", "li", "map", "nav",
            "ol", "p", "pre", "q", "s", "samp", "section", "small", "span",
            "strong", "sub", "sup", "table", "tbody", "td", "tfoot", "th",
            "thead", "time", "tr", "ul", "var",
            # We allow script and style here, so we can close/un-mis-nest
            # its tags, but then it's removed completely in _strip_html():
            "script", "style",
        }

        # These attributes will not be removed from any of the allowed tags.
        allowed_attributes = {
            "*": ["class", "id"],
            "a": ["href", "title"],
            "abbr": ["title"],
            "acronym": ["title"],
            "img": ["alt", "src", "srcset"],
            # Ugh. Don't know why this page doesn't use .tright like others
            # http://127.0.0.1:8000/encyclopedia/5040/
            "table": ["align"],
            "td": ["colspan", "rowspan", "style"],
            "th": ["colspan", "rowspan", "scope"],
        }

        # These CSS properties are allowed within style attributes
        # Added for the family tree on /encyclopedia/5825/
        # Hopefully doesn't make anything else too hideous.
        allowed_css_properties = [
            "background", "border", "border-bottom", "border-collapse",
            "border-left", "border-radius", "border-right", "border-spacing",
            "border-top", "height", "padding", "text-align", "width",
        ]

        css_sanitizer = CSSSanitizer(allowed_css_properties=allowed_css_properties)

        a = bleach.clean(
            html,
            tags=allowed_tags,
            attributes=allowed_attributes,
            css_sanitizer=css_sanitizer,
            strip=True,
        )

        return a

    def _strip_html(self, html):
        """
        Takes out any tags, and their contents, that we don't want at all.
        And adds custom classes to existing tags (so we can apply CSS styles
        without having to multiply our CSS).

        Pass it an HTML string, it returns the stripped HTML string.
        """

        # CSS selectors. Strip these and their contents.
        selectors = [
            "div.hatnote",
            "div.navbar.mini",  # Will also match div.mini.navbar
            # Bottom of https://en.wikipedia.org/wiki/Charles_II_of_England :
            "div.topicon",
            "a.mw-headline-anchor",
            "script",
            "style",
        ]

        # Strip any element that has one of these classes.
        classes = [
            # "This article may be expanded with text translated from..."
            # https://en.wikipedia.org/wiki/Afonso_VI_of_Portugal
            "ambox-notice",
            "magnify",
            # eg audio on https://en.wikipedia.org/wiki/Bagpipes
            "mediaContainer",
            "navbox",
            "noprint",
        ]

        # Any element has a class matching a key, it will have the classes
        # in the value added.
        add_classes = {
            # Give these tables standard Bootstrap styles.
            "infobox": ["table", "table-bordered"],
            "ambox": ["table", "table-bordered"],
            "wikitable": ["table", "table-bordered"],
        }

        soup = BeautifulSoup(html, "lxml")

        for selector in selectors:
            _ = [tag.decompose() for tag in soup.select(selector)]

        for clss in classes:
            _ = [tag.decompose() for tag in soup.find_all(attrs={"class": clss})]

        for clss, new_classes in add_classes.items():
            for tag in soup.find_all(attrs={"class": clss}):
                tag["class"] = tag.get("class", []) + new_classes

        # Depending on the HTML parser BeautifulSoup used, soup may have
        # surrounding <html><body></body></html> or just <body></body> tags.
        if soup.body:
            soup = soup.body
        elif soup.html:
            soup = soup.html.body

        # Put the content back into a string.
        html = "".join(str(tag) for tag in soup.contents)

        return html
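
For reference, the class is used standalone like this, a small sketch that follows its own docstrings ('Samuel_Pepys' is simply the example page name those docstrings mention):

# Usage sketch for WikipediaFetcher, based on its documented return shape.
from functions.tool_helper_functions import WikipediaFetcher

fetcher = WikipediaFetcher()
result = fetcher.fetch('Samuel_Pepys')  # {'success': bool, 'content': HTML or error message}

if result['success']:
    print(result['content'][:200])  # first 200 characters of the cleaned HTML
else:
    print('Fetch failed:', result['content'])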
functions/tools.py
CHANGED
@@ -2,9 +2,7 @@
 
 import time
 import logging
-import bleach
 import requests
-from bleach.css_sanitizer import CSSSanitizer
 from smolagents import tool
 from googlesearch import search
 from bs4 import BeautifulSoup
@@ -17,7 +15,8 @@ from selenium.common.exceptions import TimeoutException, WebDriverException
 from functions.tool_helper_functions import (
     libretext_book_parser,
     libretext_chapter_parser,
-    save_libretext_book_as_markdown
+    save_libretext_book_as_markdown,
+    WikipediaFetcher
 )
 
 # Get logger for this module
@@ -119,7 +118,7 @@ def get_wikipedia_page(query: str) -> str:
     content = content.split(
         '<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>'
     )[0]
-
+
     content = content.split(
         '<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>'
     )[0]
@@ -127,266 +126,6 @@ def get_wikipedia_page(query: str) -> str:
     return content
 
 
-class WikipediaFetcher:
-    """Gets and cleans up Wikipedia pages."""

(The rest of the WikipediaFetcher class body, identical to the version added to functions/tool_helper_functions.py above, is deleted from this file.)

 
 @tool
 def libretext_book_search(query: str) -> dict:
     """
results.csv
CHANGED
@@ -2,14 +2,14 @@ Task ID,Question,Submitted Answer
 8e867cd7-cff9-4e6c-867a-ff5ddc2550be,How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.,3
 2d83110e-a098-4ebb-9987-066c06fa42d0,".rewsna eht sa ""tfel"" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",right
 4fc2f1ae-8625-45b5-ab34-ad4433bc21f8,Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?,FunkMonk
-cabe07ed-9eca-40ea-8ead-410ef5e83f91,What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?,
+cabe07ed-9eca-40ea-8ead-410ef5e83f91,What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?,No equine veterinarian mentioned in 1.E Exercises
 3cef3a44-215e-4aed-8e3b-b1e3f08063b7,"I'm making a grocery list for my mom, but she's a professor of botany and she's a real stickler when it comes to categorizing things. I need to add different foods to different categories on the grocery list, but if I make a mistake, she won't buy anything inserted in the wrong category. Here's the list I have so far:
 
 milk, eggs, flour, whole bean coffee, Oreos, sweet potatoes, fresh basil, plums, green beans, rice, corn, bell pepper, whole allspice, acorns, broccoli, celery, zucchini, lettuce, peanuts
 
-I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","broccoli, celery, green beans, lettuce, sweet potatoes, zucchini"
+I need to make headings for the fruits and vegetables. Could you please create a list of just the vegetables from my list? If you could do that, then I can figure out how to categorize the rest of the list into the appropriate categories. But remember that my mom is a real stickler, so make sure that no botanical fruits end up on the vegetable list, or she won't get them when she's at the store. Please alphabetize the list of vegetables, and place each item in a comma separated list.","bell pepper, broccoli, celery, corn, green beans, lettuce, sweet potatoes, zucchini"
 305ac316-eef6-4446-960a-92d80d542f82,Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.? Give only the first name.,Wojciech
-3f57289b-8c60-48be-bd80-01f8099ca449,How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?,
+3f57289b-8c60-48be-bd80-01f8099ca449,How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?,525
 cf106601-ab4f-4af9-b045-5295fe67b37d,"What country had the least number of athletes at the 1928 Summer Olympics? If there's a tie for a number of athletes, return the first in alphabetical order. Give the IOC country code as your answer.",CUB
 a0c07678-e491-4bbc-8f0b-07405144218f,"Who are the pitchers with the number before and after Taishō Tamai's number as of July 2023? Give them to me in the form Pitcher Before, Pitcher After, use their last names only, in Roman characters.","Yamasaki, Uehara"
-5a0c1adf-205e-4841-a666-7c3ef95def9d,What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,Claus Peter
+5a0c1adf-205e-4841-a666-7c3ef95def9d,What is the first name of the only Malko Competition recipient from the 20th Century (after 1977) whose nationality on record is a country that no longer exists?,Claus Peter Flor