gperdrizet committed
Commit 8b358c4 · verified · 1 Parent(s): 503c07a

Updated Wikipedia search tools to fetch and parse Wikipedia pages as HTML so that tables and other non-text elements are visible to the agent. Allowed the agent to import BeautifulSoup.

Files changed (6)
  1. app.py +5 -5
  2. configuration.py +6 -1
  3. functions/agent.py +23 -10
  4. functions/tools.py +345 -4
  5. requirements.txt +5 -2
  6. tests/test_tools.py +59 -4
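
The diffs below all serve the change described in the commit message: get_wikipedia_page now returns sanitized page HTML instead of plain text, and the agent is authorized to import bs4, so agent-written code can read wikitables directly. A minimal sketch of that flow, assuming only the tool signatures introduced in functions/tools.py below (the query and the table handling are illustrative, not part of the commit):

# Illustrative sketch of what the CodeAgent can now write itself.
from bs4 import BeautifulSoup  # permitted via additional_authorized_imports=['bs4.*']
from functions.tools import wikipedia_search, get_wikipedia_page

hits = wikipedia_search('Mercedes Sosa')         # {0: {'title': ..., 'description': ...}, ...}
html = get_wikipedia_page(hits[0]['title'])      # sanitized HTML, References/Further reading trimmed

soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', class_='wikitable')   # tables survive the HTML clean-up
rows = []
if table is not None:
    for tr in table.find_all('tr'):
        rows.append([cell.get_text(strip=True) for cell in tr.find_all(['th', 'td'])])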
app.py CHANGED
@@ -12,10 +12,10 @@ import pandas as pd
 from functions.agent import create_agent

 # --- Constants ---
-from configuration import DEFAULT_API_URL, INSTRUCTIONS
+from configuration import QUESTIONS, DEFAULT_API_URL, INSTRUCTIONS


-def run_and_submit_all( profile: gr.OAuthProfile | None):
+def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.
@@ -79,7 +79,8 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):

     print(f'Running agent on {len(questions_data)} questions...')

-    for item in questions_data:
+    for question_number in QUESTIONS:
+        item = questions_data[question_number - 1] # Adjust for zero-based index
         task_id = item.get("task_id")
         question_text = item.get("question")

@@ -89,8 +90,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):

         try:
             submitted_answer = agent.run(
-                INSTRUCTIONS,
-                additional_args={'user_prompt': question_text}
+                question_text
             )

             answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
configuration.py CHANGED
@@ -5,8 +5,13 @@ Contains API URLs and agent instructions used throughout the application.
 """
 # pylint: disable=line-too-long

+# Which questions to answer
+QUESTIONS = [1]
+
+# GAIA benchmark scoring API
 DEFAULT_API_URL = 'https://agents-course-unit4-scoring.hf.space'

+# Additional instructions for agent. See here: https://huggingface.co/spaces/gaia-benchmark/leaderboard
 INSTRUCTIONS = """
-YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+You are a general AI assistant. I will ask you a question. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 """
functions/agent.py CHANGED
@@ -1,24 +1,37 @@
 '''Agent definition for GAIA question answering system.'''

 # Imports for agent creation
-from smolagents import CodeAgent, InferenceClientModel, VisitWebpageTool, Tool
-from langchain_community.agent_toolkits.load_tools import load_tools
-from functions.tools import google_search
+from smolagents import CodeAgent, InferenceClientModel, VisitWebpageTool
+from functions.tools import (
+    google_search,
+    wikipedia_search,
+    get_wikipedia_page
+)

 def create_agent():
     '''Creates agent for GAIA question answering system.'''

-    wikipedia = Tool.from_langchain(
-        load_tools(["wikipedia"])[0]
-    )
-
     model = InferenceClientModel(
-        "Qwen/Qwen2.5-Coder-32B-Instruct"
+        # max_tokens=8096,
+        # temperature=0.5,
+        model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
+        provider='together'
+        # custom_role_conversions=None
     )

+    tools = [
+        google_search,
+        wikipedia_search,
+        get_wikipedia_page,
+        VisitWebpageTool()
+    ]
+
     agent = CodeAgent(
-        tools=[wikipedia, google_search, VisitWebpageTool()],
-        model=model
+        tools=tools,
+        model=model,
+        max_steps=20,
+        planning_interval=2,
+        additional_authorized_imports=['bs4.*']
     )

     return agent
functions/tools.py CHANGED
@@ -1,10 +1,14 @@
 '''Tools for GAIA question answering agent.'''

+import bleach
+import requests
+from bleach.css_sanitizer import CSSSanitizer
+from bs4 import BeautifulSoup
 from smolagents import tool
 from googlesearch import search

 @tool
-def google_search(query: str) -> str:
+def google_search(query: str) -> dict:
     """
     Perform a Google search and return the top 10 results.

@@ -12,9 +16,346 @@ def google_search(query: str) -> str:
         query (str): The search query.

     Returns:
-        str: The URLs of the top search results, separated by newlines.
+        dict: A dictionary containing the search results in the following format.
+        {0: {'title': str, 'url': str, 'description': str}, ...}
     """

-    results = list(search(query, num_results=10, advanced=True))
+    # Run the query
+    results = list(search(query, num_results=5, advanced=True))

-    return results
+    # Parse and format the results
+    parsed_results = {}
+
+    for i, result in enumerate(results):
+
+        parsed_results[i] = {
+            'title': result.title,
+            'url': result.url,
+            'description': result.description
+        }
+
+    return parsed_results
+
+
+@tool
+def wikipedia_search(query: str) -> dict:
+    """
+    Perform a search for wikipedia pages and return the top 5 results.
+
+    Args:
+        query (str): The search query.
+
+    Returns:
+        dict: A dictionary containing the search results in the following format.
+        {0: {'title': str, 'description': str}, ...}
+    """
+
+    language_code = 'en'
+    number_of_results = 5
+    headers = {
+        'User-Agent': 'HuggingFace Agents course final project (https://github.com/gperdrizet/unit-four-final-project)'
+    }
+
+    base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
+    endpoint = '/search/page'
+    url = base_url + language_code + endpoint
+    parameters = {'q': query, 'limit': number_of_results}
+    response = requests.get(url, headers=headers, params=parameters, timeout=15)
+
+    if response.status_code == 200:
+        results = response.json().get('pages', [])
+        parsed_results = {}
+
+    else:
+        return f"Error: Unable to retrieve page. Status code {response.status_code}"
+
+    for i, result in enumerate(results):
+
+        parsed_results[i] = {
+            'title': result.get('title', None),
+            'description': result.get('description', None)
+        }
+
+    return parsed_results
+
+
+@tool
+def get_wikipedia_page(query: str) -> str:
+    """
+    Get the content of a Wikipedia page as HTML.
+
+    Args:
+        query (str): The title of the Wikipedia page.
+
+    Returns:
+        str: The HTML content of the Wikipedia page.
+    """
+
+    fetcher = WikipediaFetcher()
+    html_result = fetcher.fetch(query.replace(' ', '_'))
+
+    content = html_result['content']
+    content = content.split('<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>')[0]
+    content = content.split('<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>')[0]
+
+    return content
+
+
+class WikipediaFetcher:
+    """Gets and cleans up Wikipedia pages."""
+
+    def fetch(self, page_name):
+        """
+        Passed a Wikipedia page's URL fragment, like
+        'Edward_Montagu,_1st_Earl_of_Sandwich', this will fetch the page's
+        main contents, tidy the HTML, strip out any elements we don't want
+        and return the final HTML string.
+
+        Returns a dict with two elements:
+        'success' is either True or, if we couldn't fetch the page, False.
+        'content' is the HTML if success==True, or else an error message.
+        """
+        result = self._get_html(page_name)
+
+        if result["success"]:
+            result["content"] = self._tidy_html(result["content"])
+
+        return result
+
+
+    def _get_html(self, page_name):
+        """
+        Passed the name of a Wikipedia page (eg, 'Samuel_Pepys'), it fetches
+        the HTML content (not the entire HTML page) and returns it.
+
+        Returns a dict with two elements:
+        'success' is either True or, if we couldn't fetch the page, False.
+        'content' is the HTML if success==True, or else an error message.
+        """
+        error_message = ""
+
+        url = f"https://en.wikipedia.org/wiki/{page_name}"
+
+        try:
+            response = requests.get(url, params={"action": "render"}, timeout=5)
+        except requests.exceptions.ConnectionError:
+            error_message = "Can't connect to domain."
+        except requests.exceptions.Timeout:
+            error_message = "Connection timed out."
+        except requests.exceptions.TooManyRedirects:
+            error_message = "Too many redirects."
+
+        try:
+            response.raise_for_status()
+        except requests.exceptions.HTTPError:
+            # 4xx or 5xx errors:
+            error_message = f"HTTP Error: {response.status_code}"
+        except NameError:
+            if error_message == "":
+                error_message = "Something unusual went wrong."
+
+        if error_message:
+            return {"success": False, "content": error_message}
+        else:
+            return {"success": True, "content": response.text}
+
+
+    def _tidy_html(self, html):
+        """
+        Passed the raw Wikipedia HTML, this returns valid HTML, with all
+        disallowed elements stripped out.
+        """
+        html = self._bleach_html(html)
+        html = self._strip_html(html)
+        return html
+
+
+    def _bleach_html(self, html):
+        """
+        Ensures we have valid HTML; no unclosed or mis-nested tags.
+        Removes any tags and attributes we don't want to let through.
+        Doesn't remove the contents of any disallowed tags.
+
+        Pass it an HTML string, it'll return the bleached HTML string.
+        """
+
+        # Pretty much most elements, but no forms or audio/video.
+        allowed_tags = {
+            "a",
+            "abbr",
+            "acronym",
+            "address",
+            "area",
+            "article",
+            "b",
+            "blockquote",
+            "br",
+            "caption",
+            "cite",
+            "code",
+            "col",
+            "colgroup",
+            "dd",
+            "del",
+            "dfn",
+            "div",
+            "dl",
+            "dt",
+            "em",
+            "figcaption",
+            "figure",
+            "footer",
+            "h1",
+            "h2",
+            "h3",
+            "h4",
+            "h5",
+            "h6",
+            "header",
+            "hgroup",
+            "hr",
+            "i",
+            "img",
+            "ins",
+            "kbd",
+            "li",
+            "map",
+            "nav",
+            "ol",
+            "p",
+            "pre",
+            "q",
+            "s",
+            "samp",
+            "section",
+            "small",
+            "span",
+            "strong",
+            "sub",
+            "sup",
+            "table",
+            "tbody",
+            "td",
+            "tfoot",
+            "th",
+            "thead",
+            "time",
+            "tr",
+            "ul",
+            "var",
+            # We allow script and style here, so we can close/un-mis-nest
+            # its tags, but then it's removed completely in _strip_html():
+            "script",
+            "style",
+        }
+
+        # These attributes will not be removed from any of the allowed tags.
+        allowed_attributes = {
+            "*": ["class", "id"],
+            "a": ["href", "title"],
+            "abbr": ["title"],
+            "acronym": ["title"],
+            "img": ["alt", "src", "srcset"],
+            # Ugh. Don't know why this page doesn't use .tright like others
+            # http://127.0.0.1:8000/encyclopedia/5040/
+            "table": ["align"],
+            "td": ["colspan", "rowspan", "style"],
+            "th": ["colspan", "rowspan", "scope"],
+        }
+
+        # These CSS properties are allowed within style attributes
+        # Added for the family tree on /encyclopedia/5825/
+        # Hopefully doesn't make anything else too hideous.
+        allowed_css_properties = [
+            "background",
+            "border",
+            "border-bottom",
+            "border-collapse",
+            "border-left",
+            "border-radius",
+            "border-right",
+            "border-spacing",
+            "border-top",
+            "height",
+            "padding",
+            "text-align",
+            "width",
+        ]
+
+        css_sanitizer = CSSSanitizer(allowed_css_properties=allowed_css_properties)
+
+        a = bleach.clean(
+            html,
+            tags=allowed_tags,
+            attributes=allowed_attributes,
+            css_sanitizer=css_sanitizer,
+            strip=True,
+        )
+
+        return a
+
+
+    def _strip_html(self, html):
+        """
+        Takes out any tags, and their contents, that we don't want at all.
+        And adds custom classes to existing tags (so we can apply CSS styles
+        without having to multiply our CSS).
+
+        Pass it an HTML string, it returns the stripped HTML string.
+        """
+
+        # CSS selectors. Strip these and their contents.
+        selectors = [
+            "div.hatnote",
+            "div.navbar.mini",  # Will also match div.mini.navbar
+            # Bottom of https://en.wikipedia.org/wiki/Charles_II_of_England :
+            "div.topicon",
+            "a.mw-headline-anchor",
+            "script",
+            "style",
+        ]
+
+        # Strip any element that has one of these classes.
+        classes = [
+            # "This article may be expanded with text translated from..."
+            # https://en.wikipedia.org/wiki/Afonso_VI_of_Portugal
+            "ambox-notice",
+            "magnify",
+            # eg audio on https://en.wikipedia.org/wiki/Bagpipes
+            "mediaContainer",
+            "navbox",
+            "noprint",
+        ]
+
+        # Any element has a class matching a key, it will have the classes
+        # in the value added.
+        add_classes = {
+            # Give these tables standard Bootstrap styles.
+            "infobox": ["table", "table-bordered"],
+            "ambox": ["table", "table-bordered"],
+            "wikitable": ["table", "table-bordered"],
+        }
+
+        soup = BeautifulSoup(html, "lxml")
+
+        for selector in selectors:
+            [tag.decompose() for tag in soup.select(selector)]
+
+        for clss in classes:
+            [tag.decompose() for tag in soup.find_all(attrs={"class": clss})]
+
+        for clss, new_classes in add_classes.items():
+            for tag in soup.find_all(attrs={"class": clss}):
+                tag["class"] = tag.get("class", []) + new_classes
+
+        # Depending on the HTML parser BeautifulSoup used, soup may have
+        # surrounding <html><body></body></html> or just <body></body> tags.
+        if soup.body:
+            soup = soup.body
+        elif soup.html:
+            soup = soup.html.body
+
+        # Put the content back into a string.
+        html = "".join(str(tag) for tag in soup.contents)
+
+        return html
requirements.txt CHANGED
@@ -1,7 +1,10 @@
+bleach
 duckduckgo-search
 googlesearch-python
 gradio[oauth]
-langchain-community
 markdownify
+mwparserfromhell
 requests
-smolagents
+smolagents
+tinycss2
+wikipedia-api
tests/test_tools.py CHANGED
@@ -1,8 +1,11 @@
 '''Unittests for agent tools.'''

 import unittest
-import googlesearch
-from functions.tools import google_search
+from functions.tools import (
+    google_search,
+    wikipedia_search,
+    get_wikipedia_page
+)


 class TestGoogleSearch(unittest.TestCase):
@@ -30,8 +33,6 @@ class TestGoogleSearch(unittest.TestCase):
     def test_result_content(self):
         '''Each search result should contain three elements: title, link, and snippet.'''

-        print(type(self.search_results[1]))
-
         for _, result in self.search_results.items():
             self.assertIsInstance(result, dict)
             self.assertIn('title', result)
@@ -40,3 +41,57 @@ class TestGoogleSearch(unittest.TestCase):
             self.assertIsInstance(result['title'], str)
             self.assertIsInstance(result['url'], str)
             self.assertIsInstance(result['description'], str)
+
+
+class TestWikipediaSearch(unittest.TestCase):
+    '''Tests for the wikipedia search tool.'''
+
+
+    def setUp(self):
+
+        wikipedia_search_query = 'Python programming language'
+        self.search_results = wikipedia_search(wikipedia_search_query)
+
+
+    def test_result_type(self):
+        '''Search results should be a dictionary.'''
+
+        self.assertIsInstance(self.search_results, dict)
+
+
+    def test_result_length(self):
+        '''Search results should contain 5 items.'''
+
+        self.assertEqual(len(self.search_results), 5)
+
+
+    def test_result_content(self):
+        '''Each search result should contain three elements: title, link, and snippet.'''
+
+        for _, result in self.search_results.items():
+            self.assertIsInstance(result, dict)
+            self.assertIn('title', result)
+            self.assertIn('description', result)
+            self.assertIsInstance(result['title'], str)
+            self.assertIsInstance(result['description'], str)
+
+
+class TestGetWikipediaPage(unittest.TestCase):
+    '''Tests for the get_wikipedia_page tool.'''
+
+
+    def setUp(self):
+
+        self.page_content = get_wikipedia_page('Mercedes Sosa')
+
+
+    def test_page_content_type(self):
+        '''Page content should be a string.'''
+
+        self.assertIsInstance(self.page_content, str)
+
+
+    def test_page_content_not_empty(self):
+        '''Page content should not be empty.'''
+
+        self.assertTrue(len(self.page_content) > 0)
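
The new test classes call the live search and Wikipedia tools in setUp, so they need network access. One possible way to run just this module, assuming it is invoked from the repository root so that functions.tools resolves:

# Discover and run tests/test_tools.py with unittest's standard loader.
import unittest

suite = unittest.defaultTestLoader.discover('tests', pattern='test_tools.py')
unittest.TextTestRunner(verbosity=2).run(suite)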