Lumpen1 committed on
Commit
cae1372
·
1 Parent(s): eea59e2

Web search update with summarization.

Browse files
app.py CHANGED
@@ -10,8 +10,7 @@ from llama_cpp_agent.chat_history import BasicChatHistory
10
  from llama_cpp_agent.chat_history.messages import Roles
11
  from llama_cpp_agent.llm_output_settings import LlmStructuredOutputSettings
12
  from huggingface_hub import hf_hub_download
13
- from duckduckgo_search import DDGS
14
- from trafilatura import fetch_url, extract
15
 
16
  model_selected = "Mistral-7B-Instruct-v0.3-Q6_K.gguf"
17
  examples = [
@@ -93,48 +92,6 @@ def get_context_by_model(model_name):
93
  }
94
  return model_context_limits.get(model_name, None)
95
 
96
- def get_website_content_from_url(url: str) -> str:
97
- """
98
- Get website content from a URL using Selenium and BeautifulSoup for improved content extraction and filtering.
99
-
100
- Args:
101
- url (str): URL to get website content from.
102
-
103
- Returns:
104
- str: Extracted content including title, main text, and tables.
105
- """
106
-
107
- try:
108
- downloaded = fetch_url(url)
109
-
110
- result = extract(downloaded, include_formatting=True, include_links=True, output_format='json', url=url)
111
-
112
- if result:
113
- result = json.loads(result)
114
- return f'=========== Website Title: {result["title"]} ===========\n\n=========== Website URL: {url} ===========\n\n=========== Website Content ===========\n\n{result["raw_text"]}\n\n=========== Website Content End ===========\n\n'
115
- else:
116
- return ""
117
- except Exception as e:
118
- return f"An error occurred: {str(e)}"
119
-
120
-
121
- def search_web(search_query: str):
122
- """
123
- Search the web for information.
124
- Args:
125
- search_query (str): Search query to search for.
126
- """
127
- results = DDGS().text(search_query, region='wt-wt', safesearch='off', timelimit='y', max_results=3)
128
- result_string = ''
129
- for res in results:
130
- web_info = get_website_content_from_url(res['href'])
131
- if web_info != "":
132
- result_string += web_info
133
-
134
- res = result_string.strip()
135
- return "Based on the following results, answer the previous user query:\nResults:\n\n" + res[:get_context_by_model(model_selected)]
136
-
137
-
138
  def get_messages_formatter_type(model_name):
139
  from llama_cpp_agent import MessagesFormatterType
140
  if "Meta" in model_name or "aya" in model_name:
@@ -189,7 +146,7 @@ def respond(
189
  predefined_messages_formatter_type=chat_template,
190
  debug_output=True
191
  )
192
-
193
  settings = provider.get_provider_default_settings()
194
  settings.temperature = temperature
195
  settings.top_k = top_k
@@ -198,7 +155,7 @@ def respond(
198
  settings.repeat_penalty = repeat_penalty
199
  settings.stream = True
200
  output_settings = LlmStructuredOutputSettings.from_functions(
201
- [search_web, write_message_to_user])
202
  messages = BasicChatHistory()
203
 
204
  for msn in history:
 
10
  from llama_cpp_agent.chat_history.messages import Roles
11
  from llama_cpp_agent.llm_output_settings import LlmStructuredOutputSettings
12
  from huggingface_hub import hf_hub_download
13
+ from web_search import WebSearchTool
 
14
 
15
  model_selected = "Mistral-7B-Instruct-v0.3-Q6_K.gguf"
16
  examples = [
 
92
  }
93
  return model_context_limits.get(model_name, None)
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  def get_messages_formatter_type(model_name):
96
  from llama_cpp_agent import MessagesFormatterType
97
  if "Meta" in model_name or "aya" in model_name:
 
146
  predefined_messages_formatter_type=chat_template,
147
  debug_output=True
148
  )
149
+ search_tool = WebSearchTool(provider, chat_template, get_context_by_model(model))
150
  settings = provider.get_provider_default_settings()
151
  settings.temperature = temperature
152
  settings.top_k = top_k
 
155
  settings.repeat_penalty = repeat_penalty
156
  settings.stream = True
157
  output_settings = LlmStructuredOutputSettings.from_functions(
158
+ [search_tool.get_tool(), write_message_to_user])
159
  messages = BasicChatHistory()
160
 
161
  for msn in history:
default_web_crawlers.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from web_search_interfaces import WebCrawler
4
+ from trafilatura import fetch_url, extract
5
+
6
+
7
+ class TrafilaturaWebCrawler(WebCrawler):
8
+ def get_website_content_from_url(self, url: str) -> str:
9
+ """
10
+ Get website content from a URL using trafilatura for content extraction and filtering.
11
+
12
+ Args:
13
+ url (str): URL to get website content from.
14
+
15
+ Returns:
16
+ str: Extracted content including the website title, URL, and raw text.
17
+ """
18
+
19
+ try:
20
+ downloaded = fetch_url(url)
21
+
22
+ result = extract(downloaded, include_formatting=True, include_links=True, output_format='json', url=url)
23
+
24
+ if result:
25
+ result = json.loads(result)
26
+ return f'=========== Website Title: {result["title"]} ===========\n\n=========== Website URL: {url} ===========\n\n=========== Website Content ===========\n\n{result["raw_text"]}\n\n=========== Website Content End ===========\n\n'
27
+ else:
28
+ return ""
29
+ except Exception as e:
30
+ return f"An error occurred: {str(e)}"
default_web_search_providers.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from duckduckgo_search import DDGS
2
+
3
+ from web_search_interfaces import WebSearchProvider
4
+
5
+
6
+ class DDGWebSearchProvider(WebSearchProvider):
7
+
8
+ def search_web(self, search_query: str):
9
+ results = DDGS().text(search_query, region='wt-wt', safesearch='off', max_results=4)
10
+ return [res["href"] for res in results]
web_search.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
2
+ from llama_cpp_agent.chat_history.messages import Roles
3
+ from llama_cpp_agent.llm_output_settings import LlmStructuredOutputSettings
4
+ from llama_cpp_agent.providers import LlamaCppServerProvider
5
+ from llama_cpp_agent.providers.provider_base import LlmProvider
6
+ from web_search_interfaces import WebCrawler, WebSearchProvider
7
+ from default_web_crawlers import TrafilaturaWebCrawler
8
+ from default_web_search_providers import DDGWebSearchProvider
9
+
10
+
11
+ class WebSearchTool:
12
+
13
+ def __init__(self, llm_provider: LlmProvider, message_formatter_type: MessagesFormatterType, context_character_limit: int = 7500,
14
+ web_crawler: WebCrawler = None, web_search_provider: WebSearchProvider = None):
15
+ self.summarising_agent = LlamaCppAgent(llm_provider, debug_output=True,
16
+ system_prompt="You are a text summarization and information extraction specialist and you are able to summarize and filter out information relevant to a specific query.",
17
+ predefined_messages_formatter_type=message_formatter_type)
18
+ if web_crawler is None:
19
+ self.web_crawler = TrafilaturaWebCrawler()
20
+ else:
21
+ self.web_crawler = web_crawler
22
+
23
+ if web_search_provider is None:
24
+ self.web_search_provider = DDGWebSearchProvider()
25
+ else:
26
+ self.web_search_provider = web_search_provider
27
+
28
+ self.context_character_limit = context_character_limit
29
+
30
+ def search_web(self, search_query: str):
31
+ """
32
+ Search the web for information.
33
+ Args:
34
+ search_query (str): Search query to search for.
35
+ """
36
+ results = self.web_search_provider.search_web(search_query)
37
+ result_string = ''
38
+ for res in results:
39
+ web_info = self.web_crawler.get_website_content_from_url(res)
40
+ if web_info != "":
41
+ web_info = self.summarising_agent.get_chat_response(
42
+ f"Please summarize the following Website content and extract relevant information to this query:'{search_query}'.\n\n" + web_info,
43
+ add_response_to_chat_history=False, add_message_to_chat_history=False)
44
+ result_string += web_info
45
+
46
+ res = result_string.strip()
47
+ return "Based on the following results, answer the previous user query:\nResults:\n\n" + res[:self.context_character_limit]
48
+
49
+ def get_tool(self):
50
+ return self.search_web
51
+
web_search_interfaces.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+
3
+
4
+ class WebCrawler(abc.ABC):
5
+ @abc.abstractmethod
6
+ def get_website_content_from_url(self, url: str):
7
+ """Get the website content from an url."""
8
+ pass
9
+
10
+
11
+ class WebSearchProvider(abc.ABC):
12
+ @abc.abstractmethod
13
+ def search_web(self, query: str):
14
+ """Searches the web and returns a list of urls of the result"""
15
+ pass