update readme
README.md
CHANGED
@@ -84,6 +84,7 @@ from transformers import AutoProcessor
 # Load the model and processor
 model = AutoModelForCausalLM.from_pretrained("microsoft/Magma-8B", trust_remote_code=True)
 processor = AutoProcessor.from_pretrained("microsoft/Magma-8B", trust_remote_code=True)
+model.to("cuda")
 
 # Inference
 url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
@@ -95,6 +96,8 @@ convs = [
 ]
 prompt = processor.tokenizer.apply_chat_template(convs, tokenize=False, add_generation_prompt=True)
 inputs = processor(images=[image], texts=prompt, return_tensors="pt")
+inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
+inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
 inputs = inputs.to("cuda")
 
 with torch.inference_mode():
@@ -303,9 +306,26 @@ We evaluate the model's performance after finetuning on the following datasets:
 We follow the individual dataset's evaluation metrics for the evaluation. Please refer to the original dataset for more details.
 
 
-### Results
+### Results on Agentic Intelligence
+
+Zero-shot evaluation on agentic intelligence. We report the results for pretrained Magma without any domain-specific finetuning. Magma is the only model that can conduct the full task spectrum.
+
+| Model             | Size | VQAv2 | TextVQA | POPE | SS-Mobile | SS-Desktop | SS-Web    | VWB-Ele-G | VWB-Act-G | SE-Google Robot | SE-Bridge |
+|-------------------|------|-------|---------|------|-----------|------------|-----------|-----------|-----------|-----------------|-----------|
+| GPT-4V            | n/a  | 77.2  | 78.0    | n/a  | 22.6/24.5 | 20.2/11.8  | 9.2/8.8   | 67.5      | 75.7      | -               | -         |
+| GPT-4V-OmniParser | n/a  | n/a   | n/a     | n/a  | 92.7/49.4 | 64.9/26.3  | 77.3/39.7 | -         | -         | -               | -         |
+| LLava-1.5         | 7.4B | 78.5  | 58.2    | 85.9 | -         | -          | -         | 12.1      | 13.6      | -               | -         |
+| LLava-Next        | 7.4B | 81.3  | 64.9    | 86.5 | -         | -          | -         | 15.0      | 8.7       | -               | -         |
+| Qwen-VL           | 9.6B | 78.8  | 63.8    | n/a  | 7.5/4.8   | 7.5/5.0    | 3.5/2.4   | 14.0      | 0.7       | -               | -         |
+| Qwen-VL-Chat      | 9.6B | 78.2  | 61.5    | n/a  | -         | -          | -         | -         | -         | -               | -         |
+| Fuyu              | 8B   | 74.2  | n/a     | n/a  | 41.0/1.3  | 38.0/3.6   | 33.9/4.4  | 19.4      | 15.5      | -               | -         |
+| SeeClick          | 9.6B | -     | -       | -    | 78.0/52.0 | 72.2/30.0  | 55.7/32.5 | 9.9       | 1.9       | -               | -         |
+| Octo              | 93M  | -     | -       | -    | -         | -          | -         | -         | -         | -               | -         |
+| RT-1-X            | 35M  | -     | -       | -    | -         | -          | -         | -         | -         | 6.0             | 15.9      |
+| OpenVLA           | 8B   | -     | -       | -    | -         | -          | -         | -         | -         | 34.2            | 1.1       |
+| Magma-8B (Ours)   | 8.6B | 80.0  | 66.5    | 87.4 | 60.4/58.5 | 75.3/52.9  | 69.1/52.0 | 96.3      | 71.8      | 52.3            | 35.4      |
+
 
-TBD
 <!-- {{ results | default("[More Information Needed]", true)}} -->
 
 #### Summary
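
For reference, a minimal sketch of how the patched inference snippet reads end to end once the three added lines are in place. Only the lines taken from the hunks above come from this commit; the image loading, the placeholder `convs` conversation, and the generate/decode calls are assumptions filled in from the surrounding README context.

import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

# Load the model and processor, then move the model to the GPU (the first line
# added by this commit).
model = AutoModelForCausalLM.from_pretrained("microsoft/Magma-8B", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("microsoft/Magma-8B", trust_remote_code=True)
model.to("cuda")

# Inference on the sample image from the README.
url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

# Placeholder conversation: the real `convs` list is elided in the diff, and the
# image-token convention used here is an assumption -- check the model card for
# the exact format.
convs = [
    {"role": "user", "content": "<image_start><image><image_end>\nWhat is shown in this image?"},
]
prompt = processor.tokenizer.apply_chat_template(convs, tokenize=False, add_generation_prompt=True)
inputs = processor(images=[image], texts=prompt, return_tensors="pt")

# The two lines added by this commit: give the image tensors an explicit batch
# dimension before moving everything to the GPU.
inputs["pixel_values"] = inputs["pixel_values"].unsqueeze(0)
inputs["image_sizes"] = inputs["image_sizes"].unsqueeze(0)
inputs = inputs.to("cuda")

# The generate/decode calls are not shown in the hunks and are assumptions.
with torch.inference_mode():
    output_ids = model.generate(**inputs, max_new_tokens=128)
print(processor.tokenizer.decode(output_ids[0], skip_special_tokens=True))

The `unsqueeze(0)` calls appear to add the batch dimension the remote-code forward pass expects when a single image is passed, which is presumably why the commit inserts them between the processor call and the move to CUDA.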