Update README.md
Browse files
README.md
CHANGED
@@ -56,47 +56,11 @@ new_height, new_width = smart_resize(height, width, max_pixels=MAX_IMAGE_PIXELS)
|
|
56 |
# Prepare inputs
|
57 |
instruction = "View detailed storage space usage"
|
58 |
|
59 |
-
system_prompt =
|
60 |
-
The
|
61 |
-
tool_prompt = "
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
You may call one or more functions to assist with the user query.
|
66 |
-
|
67 |
-
You are provided with function signatures within <tools></tools> XML tags:
|
68 |
-
<tools>
|
69 |
-
{\"type\": \"function\", \"function\": {\"name\": \"mobile_use\", \"description\": \"Use a touchscreen to interact with a mobile device, and take screenshots.\
|
70 |
-
* This is an interface to a mobile device with touchscreen. You can perform actions like clicking, typing, swiping, etc.\
|
71 |
-
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.\
|
72 |
-
* The screen's resolution is " + str(new_width) + "x" + str(new_height) + ".\
|
73 |
-
* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\", \"parameters\": {\"properties\": {\"action\": {\"description\": \"The action to perform. The available actions are:\
|
74 |
-
* `key`: Perform a key event on the mobile device.\
|
75 |
-
- This supports adb's `keyevent` syntax.\
|
76 |
-
- Examples: \\\"volume_up\\\", \\\"volume_down\\\", \\\"power\\\", \\\"camera\\\", \\\"clear\\\".\
|
77 |
-
* `click`: Click the point on the screen with coordinate (x, y).\
|
78 |
-
* `long_press`: Press the point on the screen with coordinate (x, y) for specified seconds.\
|
79 |
-
* `swipe`: Swipe from the starting point with coordinate (x, y) to the end point with coordinates2 (x2, y2).\
|
80 |
-
* `type`: Input the specified text into the activated input box.\
|
81 |
-
* `system_button`: Press the system button.\
|
82 |
-
* `open`: Open an app on the device.\
|
83 |
-
* `wait`: Wait specified seconds for the change to happen.\
|
84 |
-
* `terminate`: Terminate the current task and report its completion status.\", \"enum\": [\"key\", \"click\", \"long_press\", \"swipe\", \"type\", \"system_button\", \"open\", \"wait\", \"terminate\"], \"type\": \"string\"}, \"coordinate\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=click`, `action=long_press`, and `action=swipe`.\", \"type\": \"array\"}, \"coordinate2\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=swipe`.\", \"type\": \"array\"}, \"text\": {\"description\": \"Required only by `action=key`, `action=type`, and `action=open`.\", \"type\": \"string\"}, \"time\": {\"description\": \"The seconds to wait. Required only by `action=long_press` and `action=wait`.\", \"type\": \"number\"}, \"button\": {\"description\": \"Back means returning to the previous interface, Home means returning to the desktop, Menu means opening the application background menu, and Enter means pressing the enter. Required only by `action=system_button`\", \"enum\": [\"Back\", \"Home\", \"Menu\", \"Enter\"], \"type\": \"string\"}, \"status\": {\"description\": \"The status of the task. Required only by `action=terminate`.\", \"type\": \"string\", \"enum\": [\"success\", \"failure\"]}}, \"required\": [\"action\"], \"type\": \"object\"}}}
|
85 |
-
</tools>
|
86 |
-
|
87 |
-
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
|
88 |
-
<tool_call>
|
89 |
-
{\"name\": <function-name>, \"arguments\": <args-json-object>}
|
90 |
-
</tool_call>"
|
91 |
-
grounding_prompt = f'''The screen's resolution is {new_width}x{new_height}.
|
92 |
-
Point to the UI element most relevant to "{instruction}", output its coordinates using JSON format:
|
93 |
-
```json
|
94 |
-
[
|
95 |
-
{{"point_2d": [x, y], "label": "object name/description"}}
|
96 |
-
]```'''
|
97 |
-
trajectory_prompt = f"The user query: {instruction}
|
98 |
-
Task progress (You have done the following operation on the current device): "
|
99 |
-
|
100 |
|
101 |
# Build messages
|
102 |
grounding_messages = [
|
@@ -110,7 +74,7 @@ grounding_messages = [
|
|
110 |
}
|
111 |
]
|
112 |
trajectory_messages = [
|
113 |
-
{"role": "system", "content": system_prompt + tool_prompt},
|
114 |
{
|
115 |
"role": "user",
|
116 |
"content": [
|
@@ -189,7 +153,8 @@ except:
|
|
189 |
|
190 |
For more information, please refer to our [repo](https://github.com/Reallm-Labs/InfiGUI-R1).
|
191 |
|
192 |
-
|
|
|
193 |
If you find this work useful, we would be grateful if you would consider citing the following papers:
|
194 |
```bibtex
|
195 |
@article{liu2025infigui,
|
|
|
56 |
# Prepare inputs
|
57 |
instruction = "View detailed storage space usage"
|
58 |
|
59 |
+
# System prompt: tells the model to reason first, inside <think> </think> tags,
# and only then give the final answer.
system_prompt = (
    "You FIRST think about the reasoning process as an internal monologue "
    "and then provide the final answer.\n"
    "The reasoning process MUST BE enclosed within <think> </think> tags."
)
|
60 |
+
## The following prompts are primarily sourced from https://github.com/QwenLM/Qwen2.5-VL
|
61 |
+
# Tool-calling section of the system prompt: the `mobile_use` function schema with the
# resized screen resolution (new_width x new_height) spliced into its description.
# NOTE: keep each literal on ONE physical line - a plain double-quoted Python string
# cannot contain a raw line break (it raises SyntaxError).
tool_prompt = "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"mobile_use\", \"description\": \"Use a touchscreen to interact with a mobile device, and take screenshots.\\n* This is an interface to a mobile device with touchscreen. You can perform actions like clicking, typing, swiping, etc.\\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.\\n* The screen's resolution is " + str(new_width) + "x" + str(new_height) + ".\\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\", \"parameters\": {\"properties\": {\"action\": {\"description\": \"The action to perform. The available actions are:\\n* `key`: Perform a key event on the mobile device.\\n - This supports adb's `keyevent` syntax.\\n - Examples: \\\"volume_up\\\", \\\"volume_down\\\", \\\"power\\\", \\\"camera\\\", \\\"clear\\\".\\n* `click`: Click the point on the screen with coordinate (x, y).\\n* `long_press`: Press the point on the screen with coordinate (x, y) for specified seconds.\\n* `swipe`: Swipe from the starting point with coordinate (x, y) to the end point with coordinates2 (x2, y2).\\n* `type`: Input the specified text into the activated input box.\\n* `system_button`: Press the system button.\\n* `open`: Open an app on the device.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.\", \"enum\": [\"key\", \"click\", \"long_press\", \"swipe\", \"type\", \"system_button\", \"open\", \"wait\", \"terminate\"], \"type\": \"string\"}, \"coordinate\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=click`, `action=long_press`, and `action=swipe`.\", \"type\": \"array\"}, \"coordinate2\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=swipe`.\", \"type\": \"array\"}, \"text\": {\"description\": \"Required only by `action=key`, `action=type`, and `action=open`.\", \"type\": \"string\"}, \"time\": {\"description\": \"The seconds to wait. Required only by `action=long_press` and `action=wait`.\", \"type\": \"number\"}, \"button\": {\"description\": \"Back means returning to the previous interface, Home means returning to the desktop, Menu means opening the application background menu, and Enter means pressing the enter. Required only by `action=system_button`\", \"enum\": [\"Back\", \"Home\", \"Menu\", \"Enter\"], \"type\": \"string\"}, \"status\": {\"description\": \"The status of the task. Required only by `action=terminate`.\", \"type\": \"string\", \"enum\": [\"success\", \"failure\"]}}, \"required\": [\"action\"], \"type\": \"object\"}}}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>"
|
62 |
+
# Grounding prompt: states the resized screen resolution, then asks for the point of
# the UI element matching `instruction`, answered as a fenced JSON list.
grounding_prompt = (
    f"The screen's resolution is {new_width}x{new_height}.\n"
    f'Point to the UI element most relevant to "{instruction}", '
    'output its coordinates using JSON format:\n'
    '```json\n'
    '[\n'
    ' {"point_2d": [x, y], "label": "object name/description"}\n'
    ']```'
)
|
63 |
+
# Trajectory prompt: the user's query followed by the (initially empty)
# task-progress header.
trajectory_prompt = (
    f"The user query: {instruction}\n"
    "Task progress (You have done the following operation on the current device): "
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
# Build messages
|
66 |
grounding_messages = [
|
|
|
74 |
}
|
75 |
]
|
76 |
trajectory_messages = [
|
77 |
+
{"role": "system", "content": system_prompt + "\n\n" + tool_prompt},
|
78 |
{
|
79 |
"role": "user",
|
80 |
"content": [
|
|
|
153 |
|
154 |
For more information, please refer to our [repo](https://github.com/Reallm-Labs/InfiGUI-R1).
|
155 |
|
156 |
+
## Citation Information
|
157 |
+
|
158 |
If you find this work useful, we would be grateful if you would consider citing the following papers:
|
159 |
```bibtex
|
160 |
@article{liu2025infigui,
|