sajmahmo committed on
Commit
eb8898c
·
unverified ·
1 Parent(s): 962e916

vision browser agent

Browse files
src/party_planner/tools/browser.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from smolagents import tool
2
+ from selenium import webdriver
3
+ from selenium.webdriver.common.by import By
4
+ from selenium.webdriver.common.keys import Keys
5
+
6
+
7
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.

    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)

    Returns:
        A summary with the total number of matches and the match that was scrolled into view.

    Raises:
        Exception: If no browser is running, if nth_result is below 1, or if it exceeds the number of matches.
    """
    # Function-scope import: this module defines no `driver` global, so the
    # browser session is requested from helium, which owns it.
    import helium

    driver = helium.get_driver()
    if driver is None:
        raise Exception("No browser is running: start one with helium before using this tool")

    # Occurrences are 1-based; 0 or a negative value would otherwise index
    # from the end of the match list and focus the wrong element.
    if nth_result < 1:
        raise Exception(f"nth_result must be at least 1 (got {nth_result})")

    # XPath 1.0 string literals have no escape character, so pick a quoting
    # style the text does not contain, or stitch the pieces with concat().
    if "'" not in text:
        literal = f"'{text}'"
    elif '"' not in text:
        literal = f'"{text}"'
    else:
        pieces = ", \"'\", ".join(f"'{piece}'" for piece in text.split("'"))
        literal = f"concat({pieces})"

    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), {literal})]")
    total = len(elements)
    if nth_result > total:
        raise Exception(f"Match n°{nth_result} not found (only {total} matches found)")

    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    return f"Found {total} matches for '{text}'. Focused on element {nth_result} of {total}"
23
+
24
+
25
@tool
def go_back() -> None:
    """
    Goes back to previous page.

    Raises:
        Exception: If no browser is running.
    """
    # Function-scope import: this module defines no `driver` global, so the
    # browser session is requested from helium, which owns it.
    import helium

    driver = helium.get_driver()
    if driver is None:
        raise Exception("No browser is running: start one with helium before using this tool")
    driver.back()
29
+
30
+
31
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.

    Returns:
        A short confirmation that the Escape key was sent to the page.

    Raises:
        Exception: If no browser is running.
    """
    # Function-scope import: this module defines no `driver` global, so the
    # browser session is requested from helium, which owns it.
    import helium

    driver = helium.get_driver()
    if driver is None:
        raise Exception("No browser is running: start one with helium before using this tool")

    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    # The signature promises a string; report what was done rather than
    # claiming a pop-up was closed, since that cannot be observed here.
    return "Pressed Escape to dismiss any open pop-up."
src/party_planner/vision_browser_agent.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import helium
4
+ from dotenv import load_dotenv
5
+ from smolagents import CodeAgent, DuckDuckGoSearchTool
6
+ from smolagents.cli import load_model
7
+
8
+ from src.party_planner.tools.browser import *
9
+ from src.party_planner.utils import save_screenshot
10
+
11
+
12
# Default task given to the agent when no prompt is supplied on the command line.
alfred_guest_list_request = """
I am Alfred, the butler of Wayne Manor, responsible for verifying the identity of guests at party. A superhero has arrived at the entrance claiming to be Wonder Woman, but I need to confirm if she is who she says she is.
Please search for images of Wonder Woman and generate a detailed visual description based on those images. Additionally, navigate to Wikipedia to gather key details about her appearance. With this information, I can determine whether to grant her access to the event.
"""
16
+
17
+
18
def parse_arguments():
    """Build the command-line interface and parse the process arguments.

    Returns:
        The argparse namespace with ``prompt``, ``model_type`` and ``model_id``.
    """
    cli = argparse.ArgumentParser(description="Run a web browser automation script with a specified model.")

    # Optional positional: falls back to the built-in Alfred request when omitted.
    cli.add_argument(
        "prompt",
        nargs="?",
        type=str,
        default=alfred_guest_list_request,
        help="The prompt to run with the agent",
    )

    # Both model options are plain string flags, so register them from a table.
    model_options = (
        (
            "--model-type",
            "LiteLLMModel",
            "The model type to use (e.g., OpenAIServerModel, LiteLLMModel, TransformersModel, HfApiModel)",
        ),
        (
            "--model-id",
            "gpt-4o",
            "The model ID to use for the specified model type",
        ),
    )
    for flag, fallback, description in model_options:
        cli.add_argument(flag, type=str, default=fallback, help=description)

    return cli.parse_args()
40
+
41
+
42
def initialize_driver():
    """Start a visible Chrome session through helium and return what helium hands back."""
    launch_flags = (
        "--force-device-scale-factor=1",
        "--window-size=1000,1350",
        "--disable-pdf-viewer",
        "--window-position=0,0",
    )

    options = webdriver.ChromeOptions()
    for flag in launch_flags:
        options.add_argument(flag)

    return helium.start_chrome(headless=False, options=options)
50
+
51
+
52
def initialize_agent(model):
    """Create the CodeAgent that drives the browser.

    Args:
        model: The model object passed straight through to the agent.

    Returns:
        A CodeAgent equipped with web search plus the three browser tools.
    """
    toolbox = [DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f]
    settings = {
        "additional_authorized_imports": ["helium"],
        "step_callbacks": [save_screenshot],
        "max_steps": 20,
        "verbosity_level": 2,
    }
    return CodeAgent(tools=toolbox, model=model, **settings)
62
+
63
+
64
# Usage guide for helium that main() appends to the user's prompt before running the agent.
helium_instructions = """
Use your web_search tool when you want to get Google search results.
Then you can use helium to access websites. Don't use helium for Google search, only for navigating websites!
Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
```py
go_to('github.com/trending')
```<end_code>
You can directly click clickable elements by inputting the text that appears on them.
Code:
```py
click("Top products")
```<end_code>
If it's a link:
Code:
```py
click(Link("Top products"))
```<end_code>
If you try to interact with an element and it's not found, you'll get a LookupError.
In general stop your action after each button click to see what happens on your screenshot.
Never try to login in a page.
To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
```py
scroll_down(num_pixels=1200) # This will scroll one viewport down
```<end_code>
When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
Just use your built-in tool `close_popups` to close them:
Code:
```py
close_popups()
```<end_code>
You can use .exists() to check for the existence of an element. For example:
Code:
```py
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>
Proceed in several steps rather than trying to solve the task in one shot.
And at the end, only when you have your answer, return your final answer.
Code:
```py
final_answer("YOUR_ANSWER_HERE")
```<end_code>
If pages seem stuck on loading, you might have to wait, for instance `import time` and run `time.sleep(5.0)`. But don't overuse this!
To list elements on page, DO NOT try code-based element searches like 'contributors = find_all(S("ol > li"))': just look at the latest screenshot you have and read it visually, or use your tool search_item_ctrl_f.
Of course, you can act on buttons like a user would do when navigating.
After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url.
But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states.
Don't kill the browser.
When you have modals or cookie banners on screen, you should get rid of them before you can click anything else.
"""
118
+
119
+
120
def main():
    """Load the environment, build the model, browser and agent, then run the prompt."""
    # Declared up front; the browser session is stored as a module-level global.
    global driver

    load_dotenv()
    cli_args = parse_arguments()

    # Build the model selected on the command line.
    llm = load_model(cli_args.model_type, cli_args.model_id)

    driver = initialize_driver()
    browser_agent = initialize_agent(llm)

    # Pre-load helium's names in the agent's executor, matching what the
    # appended instructions tell the model has already been done.
    browser_agent.python_executor("from helium import *", browser_agent.state)

    task = cli_args.prompt + helium_instructions
    browser_agent.run(task)


if __name__ == "__main__":
    main()