Add pipeline tag and library name

#1
by nielsr HF Staff - opened
Files changed (1) hide show
  1. README.md +46 -8
README.md CHANGED
@@ -1,12 +1,14 @@
1
  ---
2
- license: apache-2.0
3
- language:
4
- - en
5
  base_model:
6
  - Qwen/Qwen2.5-VL-3B-Instruct
 
 
 
7
  tags:
8
  - gui
9
  - agent
 
 
10
  ---
11
 
12
  # InfiGUI-R1-3B
@@ -54,10 +56,46 @@ new_height, new_width = smart_resize(height, width, max_pixels=MAX_IMAGE_PIXELS)
54
  # Prepare inputs
55
  instruction = "View detailed storage space usage"
56
 
57
- system_prompt = "You FIRST think about the reasoning process as an internal monologue and then provide the final answer.\nThe reasoning process MUST BE enclosed within <think> </think> tags."
58
- tool_prompt = "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"mobile_use\", \"description\": \"Use a touchscreen to interact with a mobile device, and take screenshots.\\n* This is an interface to a mobile device with touchscreen. You can perform actions like clicking, typing, swiping, etc.\\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.\\n* The screen's resolution is " + str(new_width) + "x" + str(new_height) + ".\\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\", \"parameters\": {\"properties\": {\"action\": {\"description\": \"The action to perform. The available actions are:\\n* `key`: Perform a key event on the mobile device.\\n - This supports adb's `keyevent` syntax.\\n - Examples: \\\"volume_up\\\", \\\"volume_down\\\", \\\"power\\\", \\\"camera\\\", \\\"clear\\\".\\n* `click`: Click the point on the screen with coordinate (x, y).\\n* `long_press`: Press the point on the screen with coordinate (x, y) for specified seconds.\\n* `swipe`: Swipe from the starting point with coordinate (x, y) to the end point with coordinates2 (x2, y2).\\n* `type`: Input the specified text into the activated input box.\\n* `system_button`: Press the system button.\\n* `open`: Open an app on the device.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.\", \"enum\": [\"key\", \"click\", \"long_press\", \"swipe\", \"type\", \"system_button\", \"open\", \"wait\", \"terminate\"], \"type\": \"string\"}, \"coordinate\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) 
coordinates to move the mouse to. Required only by `action=click`, `action=long_press`, and `action=swipe`.\", \"type\": \"array\"}, \"coordinate2\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=swipe`.\", \"type\": \"array\"}, \"text\": {\"description\": \"Required only by `action=key`, `action=type`, and `action=open`.\", \"type\": \"string\"}, \"time\": {\"description\": \"The seconds to wait. Required only by `action=long_press` and `action=wait`.\", \"type\": \"number\"}, \"button\": {\"description\": \"Back means returning to the previous interface, Home means returning to the desktop, Menu means opening the application background menu, and Enter means pressing the enter. Required only by `action=system_button`\", \"enum\": [\"Back\", \"Home\", \"Menu\", \"Enter\"], \"type\": \"string\"}, \"status\": {\"description\": \"The status of the task. Required only by `action=terminate`.\", \"type\": \"string\", \"enum\": [\"success\", \"failure\"]}}, \"required\": [\"action\"], \"type\": \"object\"}}}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>"
59
- grounding_prompt = f'''The screen's resolution is {new_width}x{new_height}.\nPoint to the UI element most relevant to "{instruction}", output its coordinates using JSON format:\n```json\n[\n {{"point_2d": [x, y], "label": "object name/description"}}\n]```'''
60
- trajectory_prompt = f"The user query: {instruction}\nTask progress (You have done the following operation on the current device): "
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
 
63
  # Build messages
@@ -149,4 +187,4 @@ except:
149
  print("Error: Failed to parse coordinates or process image")
150
  ```
151
 
152
- For more information, please refer to our [repo](https://github.com/Reallm-Labs/InfiGUI-R1).
 
1
  ---
 
 
 
2
  base_model:
3
  - Qwen/Qwen2.5-VL-3B-Instruct
4
+ language:
5
+ - en
6
+ license: apache-2.0
7
  tags:
8
  - gui
9
  - agent
10
+ pipeline_tag: image-text-to-text
11
+ library_name: transformers
12
  ---
13
 
14
  # InfiGUI-R1-3B
 
56
  # Prepare inputs
57
  instruction = "View detailed storage space usage"
58
 
59
+ system_prompt = "You FIRST think about the reasoning process as an internal monologue and then provide the final answer.\nThe reasoning process MUST BE enclosed within <think> </think> tags."
61
+ tool_prompt = "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n{\"type\": \"function\", \"function\": {\"name\": \"mobile_use\", \"description\": \"Use a touchscreen to interact with a mobile device, and take screenshots.\\n* This is an interface to a mobile device with touchscreen. You can perform actions like clicking, typing, swiping, etc.\\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.\\n* The screen's resolution is " + str(new_width) + "x" + str(new_height) + ".\\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.\", \"parameters\": {\"properties\": {\"action\": {\"description\": \"The action to perform. The available actions are:\\n* `key`: Perform a key event on the mobile device.\\n - This supports adb's `keyevent` syntax.\\n - Examples: \\\"volume_up\\\", \\\"volume_down\\\", \\\"power\\\", \\\"camera\\\", \\\"clear\\\".\\n* `click`: Click the point on the screen with coordinate (x, y).\\n* `long_press`: Press the point on the screen with coordinate (x, y) for specified seconds.\\n* `swipe`: Swipe from the starting point with coordinate (x, y) to the end point with coordinates2 (x2, y2).\\n* `type`: Input the specified text into the activated input box.\\n* `system_button`: Press the system button.\\n* `open`: Open an app on the device.\\n* `wait`: Wait specified seconds for the change to happen.\\n* `terminate`: Terminate the current task and report its completion status.\", \"enum\": [\"key\", \"click\", \"long_press\", \"swipe\", \"type\", \"system_button\", \"open\", \"wait\", \"terminate\"], \"type\": \"string\"}, \"coordinate\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=click`, `action=long_press`, and `action=swipe`.\", \"type\": \"array\"}, \"coordinate2\": {\"description\": \"(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=swipe`.\", \"type\": \"array\"}, \"text\": {\"description\": \"Required only by `action=key`, `action=type`, and `action=open`.\", \"type\": \"string\"}, \"time\": {\"description\": \"The seconds to wait. Required only by `action=long_press` and `action=wait`.\", \"type\": \"number\"}, \"button\": {\"description\": \"Back means returning to the previous interface, Home means returning to the desktop, Menu means opening the application background menu, and Enter means pressing the enter. Required only by `action=system_button`\", \"enum\": [\"Back\", \"Home\", \"Menu\", \"Enter\"], \"type\": \"string\"}, \"status\": {\"description\": \"The status of the task. Required only by `action=terminate`.\", \"type\": \"string\", \"enum\": [\"success\", \"failure\"]}}, \"required\": [\"action\"], \"type\": \"object\"}}}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>"
91
+ grounding_prompt = f'''The screen's resolution is {new_width}x{new_height}.
92
+ Point to the UI element most relevant to "{instruction}", output its coordinates using JSON format:
93
+ ```json
94
+ [
95
+ {{"point_2d": [x, y], "label": "object name/description"}}
96
+ ]```'''
97
+ trajectory_prompt = f"The user query: {instruction}\nTask progress (You have done the following operation on the current device): "
99
 
100
 
101
  # Build messages
 
187
  print("Error: Failed to parse coordinates or process image")
188
  ```
189
 
190
+ For more information, please refer to our [repo](https://github.com/Reallm-Labs/InfiGUI-R1).