{ "bomFormat": "CycloneDX", "specVersion": "1.6", "serialNumber": "urn:uuid:3055ead6-57c6-4f53-b237-31c297a9a0db", "version": 1, "metadata": { "timestamp": "2025-06-05T09:41:54.385491+00:00", "component": { "type": "machine-learning-model", "bom-ref": "ByteDance-Seed/UI-TARS-7B-SFT-2563bff1-2155-555e-9ebd-c56f1c3410c5", "name": "ByteDance-Seed/UI-TARS-7B-SFT", "externalReferences": [ { "url": "https://huggingface.co/ByteDance-Seed/UI-TARS-7B-SFT", "type": "documentation" } ], "modelCard": { "modelParameters": { "task": "image-text-to-text", "architectureFamily": "qwen2_vl", "modelArchitecture": "Qwen2VLForConditionalGeneration" }, "properties": [ { "name": "library_name", "value": "transformers" } ] }, "authors": [ { "name": "ByteDance-Seed" } ], "licenses": [ { "license": { "id": "Apache-2.0", "url": "https://spdx.org/licenses/Apache-2.0.html" } } ], "description": "UI-TARS is a next-generation native GUI agent model designed to interact seamlessly with graphical user interfaces (GUIs) using human-like perception, reasoning, and action capabilities. Unlike traditional modular frameworks, UI-TARS integrates all key components\u2014perception, reasoning, grounding, and memory\u2014within a single vision-language model (VLM), enabling end-to-end task automation without predefined workflows or manual rules.

This repository contains the model for the paper [UI-TARS: Pioneering Automated GUI Interaction with Native Agents](https://huggingface.co/papers/2501.12326).Code: https://github.com/bytedance/UI-TARS", "tags": [ "transformers", "safetensors", "qwen2_vl", "image-text-to-text", "multimodal", "gui", "conversational", "en", "arxiv:2501.12326", "license:apache-2.0", "text-generation-inference", "endpoints_compatible", "region:us" ] } } }