MingComplex committed on
Commit
ad1e2c5
·
1 Parent(s): 13ffe23
.gitattributes CHANGED
@@ -33,3 +33,22 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/amazon.jpg filter=lfs diff=lfs merge=lfs -text
37
+ examples/arxiv.jpg filter=lfs diff=lfs merge=lfs -text
38
+ examples/health.jpg filter=lfs diff=lfs merge=lfs -text
39
+ examples/semantic.jpg filter=lfs diff=lfs merge=lfs -text
40
+ examples/accweather.jpg filter=lfs diff=lfs merge=lfs -text
41
+ examples/apple_music.png filter=lfs diff=lfs merge=lfs -text
42
+ examples/safari_google.png filter=lfs diff=lfs merge=lfs -text
43
+ examples/weather_ui.png filter=lfs diff=lfs merge=lfs -text
44
+ examples/word.png filter=lfs diff=lfs merge=lfs -text
45
+ examples/ios_setting.png filter=lfs diff=lfs merge=lfs -text
46
+ examples/map.png filter=lfs diff=lfs merge=lfs -text
47
+ examples/paint_3d.png filter=lfs diff=lfs merge=lfs -text
48
+ examples/web_shopping.png filter=lfs diff=lfs merge=lfs -text
49
+ examples/finder.png filter=lfs diff=lfs merge=lfs -text
50
+ examples/football_live.png filter=lfs diff=lfs merge=lfs -text
51
+ examples/solitaire.png filter=lfs diff=lfs merge=lfs -text
52
+ examples/wallet.png filter=lfs diff=lfs merge=lfs -text
53
+ examples/web_forum.png filter=lfs diff=lfs merge=lfs -text
54
+ examples/windows_panel.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: UI TARS
3
- emoji: 👀
4
- colorFrom: purple
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.13.1
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: UI-TARS
3
+ emoji: 🌖
4
+ colorFrom: blue
5
+ colorTo: red
6
  sdk: gradio
7
+ sdk_version: 4.41.0
8
  app_file: app.py
9
  pinned: false
10
+ license: other
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import ast
4
+ import os
5
+ import re
6
+ import io
7
+ import math
8
+ import gradio as gr
9
+ import oss2
10
+ from oss2.credentials import EnvironmentVariableCredentialsProvider
11
+ from openai import OpenAI
12
+ from datetime import datetime
13
+ from PIL import ImageDraw
14
+
15
+
16
# Module-level configuration and service clients.
DESCRIPTION = "[UI-TARS](https://github.com/bytedance/UI-TARS)"

# OpenAI-compatible client for the model endpoint; the URL and key are read
# from the ENDPOINT_URL and API_KEY environment variables.
client = OpenAI(
    base_url=os.environ.get("ENDPOINT_URL"),
    api_key=os.environ.get("API_KEY"),
)

# Instruction text prepended to every user query before it is sent.
prompt = "Output only the coordinate of one box in your response. "

# Object-storage bucket used by upload_images(); credentials are taken from
# the environment by EnvironmentVariableCredentialsProvider.
auth = oss2.ProviderAuthV4(EnvironmentVariableCredentialsProvider())
endpoint = 'oss-us-east-1.aliyuncs.com'
region = "us-east-1"
# Keep the bucket *name* in its own variable so that `bucket` only ever
# refers to the oss2.Bucket object.
bucket_name = os.environ.get("BUCKET")
bucket = oss2.Bucket(auth, endpoint, bucket_name, region=region)
30
+
31
+
32
def draw_point_area(image, point):
    """Return a copy of *image* with the predicted click position marked.

    The marker is a red circle outline (radius = shorter side // 15) around
    a small filled red dot.

    Args:
        image: PIL image to annotate. It is left untouched; the drawing is
            done on a copy so the caller can still use the clean picture.
        point: ``(x, y)`` pair on a 0-1000 scale relative to the image's
            width and height.

    Returns:
        A new image carrying the marker.
    """
    annotated = image.copy()
    radius = min(annotated.width, annotated.height) // 15
    x = round(point[0] / 1000 * annotated.width)
    y = round(point[1] / 1000 * annotated.height)

    # One drawing context is enough for both shapes.
    draw = ImageDraw.Draw(annotated)
    draw.ellipse((x - radius, y - radius, x + radius, y + radius), outline='red', width=2)
    draw.ellipse((x - 2, y - 2, x + 2, y + 2), fill='red')
    return annotated
38
+
39
+
40
def resize_image(image):
    """Rescale *image* to a fixed pixel budget, keeping its aspect ratio.

    Images whose area exceeds ``6000 * 28 * 28`` pixels are scaled to roughly
    ``2700 * 28 * 28`` pixels; every other image is scaled to roughly
    ``1340 * 28 * 28`` pixels, which enlarges small inputs.

    Args:
        image: Object exposing ``width``, ``height`` and ``resize((w, h))``
            (a PIL image in this app).

    Returns:
        The resized image, or *image* itself when it has no pixels.
    """
    patch_area = 28 * 28
    large_threshold = 6000 * patch_area
    large_budget = 2700 * patch_area
    default_budget = 1340 * patch_area

    area = image.width * image.height
    if area <= 0:
        # Nothing to scale; also avoids dividing by zero below.
        return image

    budget = large_budget if area > large_threshold else default_budget
    resize_factor = math.sqrt(budget / area)
    # Clamp to 1 px so extreme aspect ratios never truncate a side to zero.
    width = max(1, int(image.width * resize_factor))
    height = max(1, int(image.height * resize_factor))
    return image.resize((width, height))
50
+
51
+
52
def upload_images(session_id, image, result_image, query):
    """Store both pictures and a metadata record in the bucket.

    Three objects are written, all keyed by *session_id*:
    ``<id>.png`` (*image*), ``<id>-draw.png`` (*result_image*) and
    ``<id>.json`` (the query, both object names and the session id).

    Args:
        session_id: Prefix used for every object name.
        image: Picture saved as ``<id>.png``.
        result_image: Picture saved as ``<id>-draw.png``.
        query: Instruction text recorded in the metadata.
    """
    source_key = f"{session_id}.png"
    drawn_key = f"{session_id}-draw.png"

    # Encode each picture as PNG in memory and push it to the bucket.
    for key, picture in ((source_key, image), (drawn_key, result_image)):
        buffer = io.BytesIO()
        picture.save(buffer, format="png")
        bucket.put_object(key, buffer.getvalue())

    record = {
        "query": query,
        "resize_image": source_key,
        "result_image": drawn_key,
        "session_id": session_id,
    }
    bucket.put_object(f"{session_id}.json", json.dumps(record))
    print("end upload images")
72
+
73
+
74
def run_ui(image, query, session_id, is_example_image):
    """Ask the model where to click for *query* and mark that spot.

    The screenshot is resized, sent to the chat-completions endpoint together
    with the instruction, and the first ``(x,y)`` pair found in the reply is
    taken as the click position on a 0-1000 scale.

    Args:
        image: PIL screenshot supplied by the user.
        query: Instruction text appended to the fixed prompt.
        session_id: Key passed to ``upload_images``.
        is_example_image: ``"True"`` or ``"False"``; pictures are uploaded
            only when it is ``"False"``.

    Returns:
        A ``(images, coords)`` tuple: ``images`` is a list holding the
        annotated picture (empty when the reply had no coordinate) and
        ``coords`` is ``str`` of the click position in pixels of the
        original image, or ``"None"``.
    """
    click_xy = None
    images_during_iterations = []  # List to store images at each step
    width, height = image.width, image.height
    image = resize_image(image)

    buffer = io.BytesIO()  # not named `bytes`, which would shadow the builtin
    image.save(buffer, format="png")
    base64_image = base64.standard_b64encode(buffer.getvalue()).decode("utf-8")
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}},
                {"type": "text", "text": prompt + query},
            ],
        }
    ]
    response = client.chat.completions.create(
        model="tgi",
        messages=messages,
        temperature=1.0,
        top_p=0.7,
        max_tokens=128,
        frequency_penalty=1,
        stream=False,
    )
    # The reply content may be None; treat that as "no coordinate".
    output_text = response.choices[0].message.content or ""

    # Accept "(x,y)" as well as "(x, y)".
    match = re.search(r"\((\d+)\s*,\s*(\d+)\)", output_text)
    if match:
        point = (int(match.group(1)), int(match.group(2)))
        # Draw on a copy so `image` stays the clean resized screenshot that
        # is uploaded next to the annotated one.
        result_image = draw_point_area(image.copy(), point)
        images_during_iterations.append(result_image)
        # Map the 0-1000 scale back to pixels of the original screenshot.
        click_xy = round(point[0] / 1000 * width), round(point[1] / 1000 * height)
        # TODO: async
        # Upload only when a prediction exists; there is no annotated image
        # to store otherwise.
        if is_example_image == "False":
            upload_images(session_id, image, result_image, query)

    return images_during_iterations, str(click_xy)
106
+
107
+
108
def update_vote(vote_type, image, click_image, prompt, is_example):
    """Pick the acknowledgement text for a thumbs-up / thumbs-down vote.

    Nothing is stored yet; *image*, *click_image* and *prompt* are accepted
    for the planned bad-case upload and are not inspected, so the function
    is safe to call before any prediction has been made.

    Args:
        vote_type: ``"upvote"`` or anything else (treated as a downvote).
        image: Input screenshot, may be ``None``.
        click_image: Gallery value with the annotated picture, may be ``None``.
        prompt: Instruction text the vote refers to.
        is_example: ``"True"`` when the vote is about a bundled example.

    Returns:
        A short status message.
    """
    if vote_type == "upvote":
        return "Everything good"

    if is_example == "True":
        return "Do nothing for example"

    # TODO: upload to some where
    return "Thank you for your feedback!"
119
+
120
+
121
# Bundled demo inputs, one entry per screenshot:
# [image path, instruction, is-example flag].
examples = [
    [f"./examples/{file_name}", instruction, True]
    for file_name, instruction in (
        ("solitaire.png", "Play the solitaire collection"),
        ("weather_ui.png", "Open map"),
        ("football_live.png", "click team 1 win"),
        ("windows_panel.png", "switch to documents"),
        ("paint_3d.png", "rotate left"),
        ("finder.png", "view files from airdrop"),
        ("amazon.jpg", "Search bar at the top of the page"),
        ("semantic.jpg", "Home"),
        ("accweather.jpg", "Select May"),
        ("arxiv.jpg", "Home"),
        ("health.jpg", "text labeled by 2023/11/26"),
        ("ios_setting.png", "Turn off Do not disturb."),
    )
]
135
+
136
+
137
+
138
# Heading and project links shown at the top of the page.
title_markdown = """
# UI-TARS Pioneering Automated GUI Interaction with Native Agents
[[🤗Model](https://huggingface.co/bytedance-research/UI-TARS-7B-SFT)] [[⌨️Code](https://github.com/bytedance/UI-TARS)] [[📑Paper](https://github.com/bytedance/UI-TARS/blob/main/UI_TARS_paper.pdf)] [🏄[Midscene (Browser Automation)](https://github.com/web-infra-dev/Midscene)] [🫨[Discord](https://discord.gg/txAE43ps)]
"""


# Terms-of-use notice (English followed by Chinese).
tos_markdown = """
### Terms of use
This demo is governed by the original license of UI-TARS. We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, including hate speech, violence, pornography, deception, etc. (注:本演示受UI-TARS的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)
"""


# License section.
learn_more_markdown = """
### License
Apache License 2.0
"""


# Credit for the app this code was adapted from.
code_adapt_markdown = """
### Acknowledgments
The app code is modified from [ShowUI](https://huggingface.co/spaces/showlab/ShowUI)
"""


# Custom CSS handed to gr.Blocks.
block_css = """
#buttons button {
    min-width: min(120px,100%);
}

#chatbot img {
    max-width: 80%;
    max-height: 80vh;
    width: auto;
    height: auto;
    object-fit: contain;
}
"""
175
+
176
+
177
def build_demo():
    """Assemble the Gradio Blocks UI of the demo.

    The page has an input column (screenshot, instruction, submit button),
    an output column (annotated gallery, coordinates, image size, vote and
    clear buttons) and an examples column, followed by the terms-of-use,
    license and acknowledgment sections.

    Returns:
        The ``gr.Blocks`` app; it is not launched here.
    """
    # NOTE(review): the container nesting below was reconstructed from a
    # source whose indentation was lost - confirm against the running app.
    with gr.Blocks(title="UI-TARS Demo", theme=gr.themes.Default(), css=block_css) as demo:
        state_session_id = gr.State(value=None)
        gr.Markdown(title_markdown)

        with gr.Row():
            with gr.Column(scale=3):
                imagebox = gr.Image(type="pil", label="Input Screenshot")

                textbox = gr.Textbox(
                    show_label=True,
                    placeholder="Enter an instruction and press Submit",
                    label="Instruction",
                )
                submit_btn = gr.Button(value="Submit", variant="primary")

            with gr.Column(scale=6):
                output_gallery = gr.Gallery(label="Output with click", object_fit="contain", preview=True)
                gr.HTML(
                    """
                    <p><strong>Notice:</strong> The <span style="color: red;">red point</span> with a circle on the output image represents the predicted coordinates for a click.</p>
                    """
                )
                with gr.Row():
                    output_coords = gr.Textbox(label="Final Coordinates")
                    image_size = gr.Textbox(label="Image Size")

                gr.HTML(
                    """
                    <p><strong>Expected result or not? help us improve! ⬇️</strong></p>
                    """
                )
                with gr.Row(elem_id="action-buttons", equal_height=True):
                    upvote_btn = gr.Button(value="👍 Looks good!", variant="secondary")
                    downvote_btn = gr.Button(value="👎 Wrong coordinates!", variant="secondary")
                    clear_btn = gr.Button(value="🗑️ Clear", interactive=True)

            with gr.Column(scale=3):
                gr.Examples(
                    examples=[[e[0], e[1]] for e in examples],
                    inputs=[imagebox, textbox],
                    outputs=[textbox],  # Only update the query textbox
                    examples_per_page=3,
                )

        # Hidden flag carrying "True"/"False" between event handlers.
        is_example_dropdown = gr.Dropdown(
            choices=["True", "False"],
            value="False",
            visible=False,
            label="Is Example Image",
        )

        def set_is_example(query):
            """Return "True" when *query* equals a bundled example's instruction."""
            # The textbox value can be None (the clear button sets it so).
            text = (query or "").strip()
            for _, example_query, is_example in examples:
                if text == example_query.strip():
                    return str(is_example)  # Return as string for Dropdown compatibility
            return "False"

        textbox.change(
            set_is_example,
            inputs=[textbox],
            outputs=[is_example_dropdown],
        )

        def on_submit(image, query, is_example_image):
            """Run one prediction and return the values for the output widgets."""
            if image is None:
                # gr.Error shows its message to the user in the browser.
                raise gr.Error("No image provided. Please upload an image before submitting.")

            # Microseconds keep ids of near-simultaneous requests apart so
            # their uploads do not overwrite each other.
            session_id = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            images_during_iterations, click_coords = run_ui(image, query or "", session_id, is_example_image)
            return images_during_iterations, click_coords, session_id, f"{image.width}x{image.height}"

        submit_btn.click(
            on_submit,
            [imagebox, textbox, is_example_dropdown],
            [output_gallery, output_coords, state_session_id, image_size],
        )

        # Reset every input and output widget.
        clear_btn.click(
            lambda: (None, None, None, None, None, None),
            inputs=None,
            outputs=[imagebox, textbox, output_gallery, output_coords, state_session_id, image_size],
            queue=False,
        )

        upvote_btn.click(
            lambda image, click_image, prompt, is_example: update_vote("upvote", image, click_image, prompt, is_example),
            inputs=[imagebox, output_gallery, textbox, is_example_dropdown],
            outputs=[],
            queue=False,
        )

        downvote_btn.click(
            lambda image, click_image, prompt, is_example: update_vote("downvote", image, click_image, prompt, is_example),
            inputs=[imagebox, output_gallery, textbox, is_example_dropdown],
            outputs=[],
            queue=False,
        )

        gr.Markdown(tos_markdown)
        gr.Markdown(learn_more_markdown)
        gr.Markdown(code_adapt_markdown)

    return demo
283
+
284
if __name__ == "__main__":
    # Script entry point: build the UI, enable the queue, then serve on all
    # interfaces at port 7860 in debug mode.
    demo = build_demo()
    queued_demo = demo.queue(api_open=False)
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "debug": True,
    }
    queued_demo.launch(**launch_options)
examples/accweather.jpg ADDED

Git LFS Details

  • SHA256: 52e80b7d563957b1b0f515874dca5e98ad66bd4a7850998abd24a99b728a3a02
  • Pointer size: 131 Bytes
  • Size of remote file: 603 kB
examples/amazon.jpg ADDED

Git LFS Details

  • SHA256: c36e843c7098139eb0f24c11f1e7be76bc8ef4a5f9d5f54fb7dde799c7f92a38
  • Pointer size: 132 Bytes
  • Size of remote file: 2.31 MB
examples/apple_music.png ADDED

Git LFS Details

  • SHA256: 8f0d0737faf8bddbac64333089df87a0e63550f88ae2060d260c3114110362a8
  • Pointer size: 131 Bytes
  • Size of remote file: 974 kB
examples/arxiv.jpg ADDED

Git LFS Details

  • SHA256: 49e6dee4b83b9d9b9c54a2a3744251284e62fe70bb09d3569fc3980550d246b2
  • Pointer size: 131 Bytes
  • Size of remote file: 540 kB
examples/finder.png ADDED

Git LFS Details

  • SHA256: 98589041a1e0766373ba4da84ddaf917de82c1c0c60c7de2a47f1dcf4151f69c
  • Pointer size: 131 Bytes
  • Size of remote file: 387 kB
examples/football_live.png ADDED

Git LFS Details

  • SHA256: 09fe6eaf08a2485ea56d9dd4c722915e159114f45e2f57c57b4aee36d18c762a
  • Pointer size: 131 Bytes
  • Size of remote file: 323 kB
examples/health.jpg ADDED

Git LFS Details

  • SHA256: 188378981c221dac87cd6e020bb88543dd518519b79e8d836656d5baf96b0be5
  • Pointer size: 132 Bytes
  • Size of remote file: 1.07 MB
examples/ios_setting.png ADDED

Git LFS Details

  • SHA256: f4b78bb0bde49ffe175abf4731486c1113d2b7a8009bdd1b7457123793e9e1e6
  • Pointer size: 133 Bytes
  • Size of remote file: 10.3 MB
examples/map.png ADDED

Git LFS Details

  • SHA256: c583dc0ad08f8ec3267425e1a1cd8e2b797b03481396651903debaff2cc59b38
  • Pointer size: 132 Bytes
  • Size of remote file: 5.36 MB
examples/paint_3d.png ADDED

Git LFS Details

  • SHA256: bf9fdd2776a0a98eb92a7243b9cc90dc2759357c1f61f1c0f55912b127e4b430
  • Pointer size: 131 Bytes
  • Size of remote file: 222 kB
examples/safari_google.png ADDED

Git LFS Details

  • SHA256: 5b3421df3f57f1154bb85b3a458eabcc6138926e3dcaa812fd557c4cce46b971
  • Pointer size: 132 Bytes
  • Size of remote file: 1.08 MB
examples/semantic.jpg ADDED

Git LFS Details

  • SHA256: 4fe64c2fe0dcfd6722416b383d2e5ad8e26ec231090cb42d2b56149bc6d0c61f
  • Pointer size: 131 Bytes
  • Size of remote file: 596 kB
examples/solitaire.png ADDED

Git LFS Details

  • SHA256: 2dada3345d5efbcd8b0156a56f6337ed50939eae7de3852c859c943227469db7
  • Pointer size: 131 Bytes
  • Size of remote file: 786 kB
examples/wallet.png ADDED

Git LFS Details

  • SHA256: b5bea1d3ee501f4d04058208a6e05f4025f62b1bd99ab34763073ae6b87f53e9
  • Pointer size: 131 Bytes
  • Size of remote file: 798 kB
examples/weather_ui.png ADDED

Git LFS Details

  • SHA256: c2c5cb9a086b3380bdc7f2ce67f9969334473a2aa047c7b12c16368d7a02c74d
  • Pointer size: 131 Bytes
  • Size of remote file: 203 kB
examples/web_forum.png ADDED

Git LFS Details

  • SHA256: 33bd50305f7f7fc9cc6c826c9ba2d9b11c297bc1a7adc0dca086be81b5a681b1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.41 MB
examples/web_shopping.png ADDED

Git LFS Details

  • SHA256: 2d78b6ab11dfd6e203d37d2b14b8f9c40545e1f560d9900d138a6617805841c4
  • Pointer size: 132 Bytes
  • Size of remote file: 1.6 MB
examples/windows_panel.png ADDED

Git LFS Details

  • SHA256: 07d6515abdbe52516eafe520e4fa62a07887df38080bedf9f19426c1706c2e50
  • Pointer size: 131 Bytes
  • Size of remote file: 545 kB
examples/word.png ADDED

Git LFS Details

  • SHA256: a59fa4ebf780a111145328d8fcfeb59c04cf6f7bb003da04e57d4e54eccb2f30
  • Pointer size: 131 Bytes
  • Size of remote file: 403 kB
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ openai==1.59.3
2
+ gradio==4.44.1
3
+ oss2==2.19.1