Spaces:

poscye
/

ddg-web-search-chat

Running on Zero

App Files Files Community

Lumpen1 committed on May 25, 2024

Commit

cae1372

1 Parent(s): eea59e2

Web search update with summarization.

Browse files

Files changed (5) hide show

app.py +3 -46
default_web_crawlers.py +30 -0
default_web_search_providers.py +10 -0
web_search.py +51 -0
web_search_interfaces.py +15 -0

app.py CHANGED Viewed

@@ -10,8 +10,7 @@ from llama_cpp_agent.chat_history import BasicChatHistory
 from llama_cpp_agent.chat_history.messages import Roles
 from llama_cpp_agent.llm_output_settings import LlmStructuredOutputSettings
 from huggingface_hub import hf_hub_download
-from duckduckgo_search import DDGS
-from trafilatura import fetch_url, extract
 model_selected = "Mistral-7B-Instruct-v0.3-Q6_K.gguf"
 examples = [
@@ -93,48 +92,6 @@ def get_context_by_model(model_name):
     }
     return model_context_limits.get(model_name, None)
-def get_website_content_from_url(url: str) -> str:
-    """
-    Get website content from a URL using Selenium and BeautifulSoup for improved content extraction and filtering.
-    Args:
-        url (str): URL to get website content from.
-    Returns:
-        str: Extracted content including title, main text, and tables.
-    """
-    try:
-        downloaded = fetch_url(url)
-        result = extract(downloaded, include_formatting=True, include_links=True, output_format='json', url=url)
-        if result:
-            result = json.loads(result)
-            return f'=========== Website Title: {result["title"]} ===========\n\n=========== Website URL: {url} ===========\n\n=========== Website Content ===========\n\n{result["raw_text"]}\n\n=========== Website Content End ===========\n\n'
-        else:
-            return ""
-    except Exception as e:
-        return f"An error occurred: {str(e)}"
-def search_web(search_query: str):
-    """
-    Search the web for information.
-    Args:
-        search_query (str): Search query to search for.
-    """
-    results = DDGS().text(search_query, region='wt-wt', safesearch='off', timelimit='y', max_results=3)
-    result_string = ''
-    for res in results:
-        web_info = get_website_content_from_url(res['href'])
-        if web_info != "":
-            result_string += web_info
-    res = result_string.strip()
-    return "Based on the following results, answer the previous user query:\nResults:\n\n" + res[:get_context_by_model(model_selected)]
 def get_messages_formatter_type(model_name):
     from llama_cpp_agent import MessagesFormatterType
     if "Meta" in model_name or "aya" in model_name:
@@ -189,7 +146,7 @@ def respond(
         predefined_messages_formatter_type=chat_template,
         debug_output=True
     )
     settings = provider.get_provider_default_settings()
     settings.temperature = temperature
     settings.top_k = top_k
@@ -198,7 +155,7 @@ def respond(
     settings.repeat_penalty = repeat_penalty
     settings.stream = True
     output_settings = LlmStructuredOutputSettings.from_functions(
-        [search_web, write_message_to_user])
     messages = BasicChatHistory()
     for msn in history:

 from llama_cpp_agent.chat_history.messages import Roles
 from llama_cpp_agent.llm_output_settings import LlmStructuredOutputSettings
 from huggingface_hub import hf_hub_download
+from web_search import WebSearchTool
 model_selected = "Mistral-7B-Instruct-v0.3-Q6_K.gguf"
 examples = [
     }
     return model_context_limits.get(model_name, None)
 def get_messages_formatter_type(model_name):
     from llama_cpp_agent import MessagesFormatterType
     if "Meta" in model_name or "aya" in model_name:
         predefined_messages_formatter_type=chat_template,
         debug_output=True
     )
+    search_tool = WebSearchTool(provider, chat_template, get_context_by_model(model))
     settings = provider.get_provider_default_settings()
     settings.temperature = temperature
     settings.top_k = top_k
     settings.repeat_penalty = repeat_penalty
     settings.stream = True
     output_settings = LlmStructuredOutputSettings.from_functions(
+        [search_tool.get_tool(), write_message_to_user])
     messages = BasicChatHistory()
     for msn in history:

default_web_crawlers.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import json
+from web_search_interfaces import WebCrawler
+from trafilatura import fetch_url, extract
+class TrafilaturaWebCrawler(WebCrawler):
+    """WebCrawler implementation built on trafilatura's fetch_url and extract."""
+    def get_website_content_from_url(self, url: str) -> str:
+        """
+        Get website content from a URL using trafilatura for download and content extraction.
+        Args:
+            url (str): URL to get website content from.
+        Returns:
+            str: Website title, URL and extracted text framed by banner lines, or an empty
+                string when nothing was downloaded, nothing was extracted, or an error occurred.
+        """
+        # Function-scope import so the block does not depend on a file-level change.
+        import logging
+        # Broad catch is deliberate: this is a best-effort boundary around network and
+        # third-party parsing code, and one bad page must not abort a whole search.
+        try:
+            downloaded = fetch_url(url)
+            if downloaded is None:
+                # Nothing was downloaded, so there is nothing to extract.
+                return ""
+            extracted = extract(downloaded, include_formatting=True, include_links=True, output_format='json', url=url)
+            if not extracted:
+                return ""
+            page = json.loads(extracted)
+            return f'=========== Website Title: {page["title"]} ===========\n\n=========== Website URL: {url} ===========\n\n=========== Website Content ===========\n\n{page["raw_text"]}\n\n=========== Website Content End ===========\n\n'
+        except Exception as e:
+            # Report the failure out of band. Returning the error text would make callers
+            # that treat any non-empty string as page content process the error message.
+            logging.getLogger(__name__).warning("Could not get website content from %s: %s", url, e)
+            return ""

default_web_search_providers.py ADDED Viewed

	@@ -0,0 +1,10 @@

+from duckduckgo_search import DDGS
+from web_search_interfaces import WebSearchProvider
+class DDGWebSearchProvider(WebSearchProvider):
+    """WebSearchProvider implementation that queries DuckDuckGo through DDGS."""
+    def search_web(self, search_query: str):
+        """
+        Run a DuckDuckGo text search and collect the URLs of the hits.
+        Args:
+            search_query (str): Search query to search for.
+        Returns:
+            list: The "href" value of every hit, in the order the search yielded them.
+        """
+        hits = DDGS().text(search_query, region='wt-wt', safesearch='off', max_results=4)
+        urls = []
+        for hit in hits:
+            urls.append(hit["href"])
+        return urls

web_search.py ADDED Viewed

	@@ -0,0 +1,51 @@

+from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+from llama_cpp_agent.chat_history.messages import Roles
+from llama_cpp_agent.llm_output_settings import LlmStructuredOutputSettings
+from llama_cpp_agent.providers import LlamaCppServerProvider
+from llama_cpp_agent.providers.provider_base import LlmProvider
+from web_search_interfaces import WebCrawler, WebSearchProvider
+from default_web_crawlers import TrafilaturaWebCrawler
+from default_web_search_providers import DDGWebSearchProvider
+class WebSearchTool:
+    """
+    Web search tool that searches, crawls each hit and summarizes it with an LLM agent.
+    The bound method returned by get_tool() is what gets registered as the callable tool.
+    """
+    def __init__(self, llm_provider: LlmProvider, message_formatter_type: MessagesFormatterType, context_character_limit: int = 7500,
+                 web_crawler: WebCrawler = None, web_search_provider: WebSearchProvider = None):
+        """
+        Args:
+            llm_provider (LlmProvider): Provider used by the internal summarizing agent.
+            message_formatter_type (MessagesFormatterType): Message format for the summarizing agent.
+            context_character_limit (int): Maximum number of characters of summarized results
+                that search_web returns after its fixed instruction header.
+            web_crawler (WebCrawler): Crawler to use; a TrafilaturaWebCrawler when None.
+            web_search_provider (WebSearchProvider): Search provider to use; a DDGWebSearchProvider when None.
+        """
+        self.summarising_agent = LlamaCppAgent(llm_provider, debug_output=True,
+                                               system_prompt="You are a text summarization and information extraction specialist and you are able to summarize and filter out information relevant to a specific query.",
+                                               predefined_messages_formatter_type=message_formatter_type)
+        self.web_crawler = TrafilaturaWebCrawler() if web_crawler is None else web_crawler
+        self.web_search_provider = DDGWebSearchProvider() if web_search_provider is None else web_search_provider
+        self.context_character_limit = context_character_limit
+    def search_web(self, search_query: str):
+        """
+        Search the web for information.
+        Args:
+            search_query (str): Search query to search for.
+        """
+        summaries = []
+        for url in self.web_search_provider.search_web(search_query):
+            web_info = self.web_crawler.get_website_content_from_url(url)
+            # Skip pages without content; a falsy check also covers a crawler returning None.
+            if not web_info:
+                continue
+            summary = self.summarising_agent.get_chat_response(
+                f"Please summarize the following Website content and extract relevant information to this query:'{search_query}'.\n\n" + web_info,
+                add_response_to_chat_history=False, add_message_to_chat_history=False)
+            if summary:
+                summaries.append(summary)
+        # Separate the per-page summaries so they do not run into each other.
+        combined = "\n\n".join(summaries).strip()
+        return "Based on the following results, answer the previous user query:\nResults:\n\n" + combined[:self.context_character_limit]
+    def get_tool(self):
+        """Return the bound search_web method so it can be registered as a tool function."""
+        return self.search_web

web_search_interfaces.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import abc
+class WebCrawler(abc.ABC):
+    """Abstract interface for objects that fetch the content of a website."""
+    @abc.abstractmethod
+    def get_website_content_from_url(self, url: str):
+        """Return the content of the website found at the given url."""
+class WebSearchProvider(abc.ABC):
+    """Abstract interface for objects that run a web search."""
+    @abc.abstractmethod
+    def search_web(self, query: str):
+        """Search the web for the query and return a list with the urls of the results."""