plcedoz38 committed on
Commit d78a7a0 · 1 Parent(s): 5320aee

navigation example

Files changed (3):
  1. README.md +32 -90
  2. localization.py +52 -0
  3. navigation.py +186 -0
README.md CHANGED
@@ -78,6 +78,10 @@ benchmark [WebClick](https://huggingface.co/datasets/Hcompany/WebClick).
 
 ## Get Started with the Model
 
+We provide two Spaces to experiment with Localization and Navigation:
+- https://huggingface.co/spaces/Hcompany/Holo1-Navigation
+- https://huggingface.co/spaces/Hcompany/Holo1-Localization
+
 We provide starter code for the localization task: i.e. image + instruction -> click coordinates
 
 We also provide code to reproduce screenspot evaluations: screenspot_eval.py
@@ -149,109 +153,47 @@ resized_height, resized_width = smart_resize(
     max_pixels=image_processor.max_pixels,
 )
 image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
-
-instruction = "Select July 14th as the check-out date"
 ```
 
-### Localization as click(x, y)
+### Navigation with Structured Output
 
 ```python
-def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
-    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
-
-    return [
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": image,
-                },
-                {"type": "text", "text": f"{guidelines}\n{instruction}"},
-            ],
-        }
-    ]
-
-
-messages = get_localization_prompt(image, instruction)
-coordinates_str = run_inference(messages)[0]
-print(coordinates_str)
-# Expected Click(352, 348)
+import json
+from . import navigation
+
+task = "Book a hotel in Paris on August 3rd for 3 nights"
+prompt = navigation.get_navigation_prompt(task, image, step=1)
+navigation_str = run_inference(prompt)[0]
+navigation_step = navigation.NavigationStep(**json.loads(navigation_str))
+print(navigation_step)
+# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
 ```
 
-### Structured Output
-
-We trained Holo1 as an Action VLM with extensive use of json and tool calls. Therefore, it can be queried reliably with structured output:
+### Localization with click(x, y)
 
 ```python
-from pydantic import BaseModel, ConfigDict
-
-class FunctionDefinition(BaseModel):
-    """Function definition data structure.
-
-    Attributes:
-        name: name of the function.
-        description: description of the function.
-        parameters: JSON schema for the function parameters.
-        strict: Whether to enable strict schema adherence when generating the function call.
-    """
-
-    name: str
-    description: str = ""
-    parameters: dict[str, Any] = {}
-    strict: bool = True
-
-
-class ClickAction(BaseModel):
-    """Click at specific coordinates on the screen."""
-
-    model_config = ConfigDict(
-        extra="forbid",
-        json_schema_serialization_defaults_required=True,
-        json_schema_mode_override="serialization",
-        use_attribute_docstrings=True,
-    )
-
-    action: Literal["click"] = "click"
-    x: int
-    """The x coordinate, number of pixels from the left edge."""
-    y: int
-    """The y coordinate, number of pixels from the top edge."""
-
-
-function_definition = FunctionDefinition(
-    name="click_action",
-    description=ClickAction.__doc__ or "",
-    parameters=ClickAction.model_json_schema(),
-    strict=True,
-)
-
-
-def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
-    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
-
-    return [
-        {
-            "role": "system",
-            "content": json.dumps([function_definition.model_dump()]),
-        },
-        {
-            "role": "user",
-            "content": [
-                {
-                    "type": "image",
-                    "image": image,
-                },
-                {"type": "text", "text": f"{guidelines}\n{instruction}"},
-            ],
-        },
-    ]
-
-
-messages = get_localization_prompt_structured_output(image, instruction)
-coordinates_str = run_inference(messages)[0]
-coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
-print(coordinates)
+from . import localization
+
+instruction = "Select July 14th as the check-out date"
+prompt = localization.get_localization_prompt(image, instruction)
+coordinates = run_inference(prompt)[0]
+print(coordinates)
+# Expected Click(352, 348)
+```
+
+### Localization with Structured Output
+
+We trained Holo1 as an Action VLM with extensive use of json and tool calls. Therefore, it can be queried reliably with structured output:
+
+```python
+import json
+from . import localization
+
+instruction = "Select July 14th as the check-out date"
+prompt = localization.get_localization_prompt_structured_output(image, instruction)
+coordinates_structured_str = run_inference(prompt)[0]
+coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
+print(coordinates_structured)
 # Expected ClickAction(action='click', x=352, y=340)
 ```
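The `Click(x, y)` result in the click example above is a plain string rather than structured JSON. A minimal parsing sketch, assuming the model emits exactly the `Click(x, y)` format shown (the `parse_click` helper is illustrative, not part of this commit):

```python
import re


def parse_click(coordinates_str: str) -> tuple[int, int]:
    """Parse a model answer like 'Click(352, 348)' into (x, y) pixel coordinates."""
    match = re.fullmatch(r"Click\((\d+),\s*(\d+)\)", coordinates_str.strip())
    if match is None:
        raise ValueError(f"Unexpected localization output: {coordinates_str!r}")
    return int(match.group(1)), int(match.group(2))


x, y = parse_click("Click(352, 348)")
print(x, y)  # 352 348
```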
localization.py ADDED
@@ -0,0 +1,52 @@
+import json
+from typing import Any, Literal
+
+from pydantic import BaseModel
+
+
+def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
+    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
+
+    return [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": image,
+                },
+                {"type": "text", "text": f"{guidelines}\n{instruction}"},
+            ],
+        }
+    ]
+
+
+class ClickAction(BaseModel):
+    """Click at specific coordinates on the screen."""
+
+    action: Literal["click"] = "click"
+    x: int
+    """The x coordinate, number of pixels from the left edge."""
+    y: int
+    """The y coordinate, number of pixels from the top edge."""
+
+
+def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
+    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
+
+    return [
+        {
+            "role": "system",
+            "content": json.dumps([ClickAction.model_json_schema()]),
+        },
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": image,
+                },
+                {"type": "text", "text": f"{guidelines}\n{instruction}"},
+            ],
+        },
+    ]
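Note that the README snippet resizes the screenshot with `smart_resize` before inference, so the predicted click refers to the resized image's pixel space. A minimal sketch for mapping a click back to the original screenshot, assuming plain proportional scaling (the helper and the example dimensions are illustrative, not part of this commit):

```python
def to_original_coordinates(
    x: int,
    y: int,
    resized_width: int,
    resized_height: int,
    original_width: int,
    original_height: int,
) -> tuple[int, int]:
    """Rescale a click predicted on the resized screenshot back to the original image."""
    return (
        round(x * original_width / resized_width),
        round(y * original_height / resized_height),
    )


# e.g. a click at (352, 348) on a 1512x812 resize of a 1920x1080 screenshot
print(to_original_coordinates(352, 348, 1512, 812, 1920, 1080))  # (447, 463)
```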
navigation.py ADDED
@@ -0,0 +1,186 @@
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+SYSTEM_PROMPT: str = """Imagine you are a robot browsing the web, just like humans. Now you need to complete a task.
+In each iteration, you will receive an Observation that includes the last screenshots of a web browser and the current memory of the agent.
+You also have information about the step that the agent is trying to achieve to solve the task.
+Carefully analyze the visual information to identify what to do, then follow the guidelines to choose the next action.
+You should detail your thought (i.e. reasoning steps) before taking the action.
+Also detail in the notes field of the action the extracted information relevant to solving the task.
+Once you have enough information in the notes to answer the task, return an answer action with the detailed answer in the notes field.
+This will be evaluated by an evaluator and should match all the criteria or requirements of the task.
+
+Guidelines:
+- Store in the notes all the relevant information to solve the task that fulfills the task criteria. Be precise
+- Use both the task and the step information to decide what to do
+- If you want to write in a text field and the text field already has text, designate the text field by the text it contains and its type
+- If there is a cookies notice, always accept all the cookies first
+- The observation is the screenshot of the current page and the memory of the agent.
+- If you see relevant information on the screenshot to answer the task, add it to the notes field of the action.
+- If there is no relevant information on the screenshot to answer the task, add an empty string to the notes field of the action.
+- If you see buttons that allow you to navigate directly to relevant information, like jump to ... or go to ..., use them to navigate faster.
+- In the answer action, give as many details as possible relevant to answering the task.
+- If you want to write, don't click first. Directly use the write action
+- To write, identify the web element by its type and the text it already contains
+- If you want to use a search bar, directly write text in the search bar
+- Don't scroll too much. Don't scroll if the number of scrolls is greater than 3
+- Don't scroll if you are at the end of the webpage
+- Only refresh if you identify a rate limit problem
+- If you are looking for a single flight, click on round-trip to select 'one way'
+- Never try to log in or enter an email or password. If there is a need to log in, go back.
+- If you are facing a captcha on a website, try to solve it.
+
+- If you have enough information in the screenshot and in the notes to answer the task, return an answer action with the detailed answer in the notes field
+- The current date is {timestamp}.
+
+# <output_json_format>
+# ```json
+# {output_format}
+# ```
+# </output_json_format>
+
+"""
+
+
+class ClickElementAction(BaseModel):
+    """Click at absolute coordinates of a web element with its description"""
+
+    action: Literal["click_element"] = Field(description="Click at absolute coordinates of a web element")
+    element: str = Field(description="text description of the element")
+    x: int = Field(description="The x coordinate, number of pixels from the left edge.")
+    y: int = Field(description="The y coordinate, number of pixels from the top edge.")
+
+    def log(self):
+        return f"I have clicked on the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
+
+
+class WriteElementAction(BaseModel):
+    """Write content at absolute coordinates of a web element identified by its description, then press Enter."""
+
+    action: Literal["write_element_abs"] = Field(description="Write content at absolute coordinates of a web page")
+    content: str = Field(description="Content to write")
+    element: str = Field(description="Text description of the element")
+    x: int = Field(description="The x coordinate, number of pixels from the left edge.")
+    y: int = Field(description="The y coordinate, number of pixels from the top edge.")
+
+    def log(self):
+        return f"I have written '{self.content}' in the element '{self.element}' at absolute coordinates {self.x}, {self.y}"
+
+
+class ScrollAction(BaseModel):
+    """Scroll action with no required element"""
+
+    action: Literal["scroll"] = Field(description="Scroll the page or a specific element")
+    direction: Literal["down", "up", "left", "right"] = Field(description="The direction to scroll in")
+
+    def log(self):
+        return f"I have scrolled {self.direction}"
+
+
+class GoBackAction(BaseModel):
+    """Action to navigate back in browser history"""
+
+    action: Literal["go_back"] = Field(description="Navigate to the previous page")
+
+    def log(self):
+        return "I have gone back to the previous page"
+
+
+class RefreshAction(BaseModel):
+    """Action to refresh the current page"""
+
+    action: Literal["refresh"] = Field(description="Refresh the current page")
+
+    def log(self):
+        return "I have refreshed the page"
+
+
+class GotoAction(BaseModel):
+    """Action to go to a particular URL"""
+
+    action: Literal["goto"] = Field(description="Goto a particular URL")
+    url: str = Field(description="A url starting with http:// or https://")
+
+    def log(self):
+        return f"I have navigated to the URL {self.url}"
+
+
+class WaitAction(BaseModel):
+    """Action to wait for a particular amount of time"""
+
+    action: Literal["wait"] = Field(description="Wait for a particular amount of time")
+    seconds: int = Field(default=2, ge=0, le=10, description="The number of seconds to wait")
+
+    def log(self):
+        return f"I have waited for {self.seconds} seconds"
+
+
+class RestartAction(BaseModel):
+    """Restart the task from the beginning."""
+
+    action: Literal["restart"] = "restart"
+
+    def log(self):
+        return "I have restarted the task from the beginning"
+
+
+class AnswerAction(BaseModel):
+    """Return a final answer to the task. This is the last action to call in an episode."""
+
+    action: Literal["answer"] = "answer"
+    content: str = Field(description="The answer content")
+
+    def log(self):
+        return f"I have answered the task with '{self.content}'"
+
+
+ActionSpace = (
+    ClickElementAction
+    | WriteElementAction
+    | ScrollAction
+    | GoBackAction
+    | RefreshAction
+    | WaitAction
+    | RestartAction
+    | AnswerAction
+    | GotoAction
+)
+
+
+class NavigationStep(BaseModel):
+    note: str = Field(
+        default="",
+        description="Task-relevant information extracted from the previous observation. Keep empty if no new info.",
+    )
+    thought: str = Field(description="Reasoning about next steps (<4 lines)")
+    action: ActionSpace = Field(description="Next action to take")
+
+
+def get_navigation_prompt(task, image, step=1):
+    system_prompt = SYSTEM_PROMPT.format(
+        output_format=NavigationStep.model_json_schema(),
+        timestamp="2025-06-04 14:16:03",
+    )
+    return [
+        {
+            "role": "system",
+            "content": [
+                {"type": "text", "text": system_prompt},
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": f"<task>\n{task}\n</task>\n"},
+                {"type": "text", "text": f"<observation step={step}>\n"},
+                {"type": "text", "text": "<screenshot>\n"},
+                {
+                    "type": "image",
+                    "image": image,
+                },
+                {"type": "text", "text": "\n</screenshot>\n"},
+                {"type": "text", "text": "\n</observation>\n"},
+            ],
+        },
+    ]
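Taken together, `get_navigation_prompt` and `NavigationStep` suggest a simple observe-predict-act loop. A minimal sketch, assuming a `run_inference` helper as in the README and a caller-supplied `take_screenshot` for browser plumbing; action execution is deliberately left out:

```python
import json
from typing import Callable

import navigation  # the module added in this commit


def run_navigation_loop(
    task: str,
    take_screenshot: Callable[[], object],
    run_inference: Callable[[list[dict]], list[str]],
    max_steps: int = 10,
) -> str | None:
    """Illustrative loop: observe, ask the model for a NavigationStep, stop on an answer."""
    for step in range(1, max_steps + 1):
        image = take_screenshot()  # caller-supplied: grab the current browser screenshot
        prompt = navigation.get_navigation_prompt(task, image, step=step)
        nav_step = navigation.NavigationStep(**json.loads(run_inference(prompt)[0]))
        print(nav_step.action.log())
        if isinstance(nav_step.action, navigation.AnswerAction):
            return nav_step.action.content
        # An action executor (click, write, scroll, ...) would dispatch on nav_step.action here.
    return None
```

Passing the screenshot and inference functions as parameters keeps the sketch self-contained while making explicit which pieces this commit does not provide.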