jw2yang committed
Commit e60c356 · 1 Parent(s): ba47289

update readme

Files changed (1): README.md (+22 −2)

README.md CHANGED
@@ -84,6 +84,7 @@ from transformers import AutoProcessor
 # Load the model and processor
 model = AutoModelForCausalLM.from_pretrained("microsoft/Magma-8B", trust_remote_code=True)
 processor = AutoProcessor.from_pretrained("microsoft/Magma-8B", trust_remote_code=True)
+model.to("cuda")
 
 # Inference
 url = "https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png"
@@ -95,6 +96,8 @@ convs = [
 ]
 prompt = processor.tokenizer.apply_chat_template(convs, tokenize=False, add_generation_prompt=True)
 inputs = processor(images=[image], texts=prompt, return_tensors="pt")
+inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
+inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
 inputs = inputs.to("cuda")
 
 with torch.inference_mode():
@@ -303,9 +306,26 @@ We evaluate the model's performance after finetuning on the following datasets:
 We follow the individual dataset's evaluation metrics for the evaluation. Please refer to the original dataset for more details.
 
 
-### Results
+### Results on Agentic Intelligence
+
+Zero-shot evaluation on agentic intelligence. We report the results for pretrained Magma without any domain-specific finetuning. Magma is the only model that can conduct the full task spectrum.
+
+| Model             | Size | VQAv2 | TextVQA | POPE | SS-Mobile | SS-Desktop | SS-Web    | VWB-Ele-G | VWB-Act-G | SE-Google Robot | SE-Bridge |
+|-------------------|------|-------|---------|------|-----------|------------|-----------|-----------|-----------|-----------------|-----------|
+| GPT-4V            | n/a  | 77.2  | 78.0    | n/a  | 22.6/24.5 | 20.2/11.8  | 9.2/8.8   | 67.5      | 75.7      | -               | -         |
+| GPT-4V-OmniParser | n/a  | n/a   | n/a     | n/a  | 92.7/49.4 | 64.9/26.3  | 77.3/39.7 | -         | -         | -               | -         |
+| LLava-1.5         | 7.4B | 78.5  | 58.2    | 85.9 | -         | -          | -         | 12.1      | 13.6      | -               | -         |
+| LLava-Next        | 7.4B | 81.3  | 64.9    | 86.5 | -         | -          | -         | 15.0      | 8.7       | -               | -         |
+| Qwen-VL           | 9.6B | 78.8  | 63.8    | n/a  | 7.5/4.8   | 7.5/5.0    | 3.5/2.4   | 14.0      | 0.7       | -               | -         |
+| Qwen-VL-Chat      | 9.6B | 78.2  | 61.5    | n/a  | -         | -          | -         | -         | -         | -               | -         |
+| Fuyu              | 8B   | 74.2  | n/a     | n/a  | 41.0/1.3  | 38.0/3.6   | 33.9/4.4  | 19.4      | 15.5      | -               | -         |
+| SeeClick          | 9.6B | -     | -       | -    | 78.0/52.0 | 72.2/30.0  | 55.7/32.5 | 9.9       | 1.9       | -               | -         |
+| Octo              | 93M  | -     | -       | -    | -         | -          | -         | -         | -         | -               | -         |
+| RT-1-X            | 35M  | -     | -       | -    | -         | -          | -         | -         | -         | 6.0             | 15.9      |
+| OpenVLA           | 8B   | -     | -       | -    | -         | -          | -         | -         | -         | 34.2            | 1.1       |
+| Magma-8B (Ours)   | 8.6B | 80.0  | 66.5    | 87.4 | 60.4/58.5 | 75.3/52.9  | 69.1/52.0 | 96.3      | 71.8      | 52.3            | 35.4      |
+
 
-TBD
 <!-- {{ results | default("[More Information Needed]", true)}} -->
 
 #### Summary
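
The two `unsqueeze(0)` calls added in the second hunk give the processor outputs an explicit batch dimension (batch size 1) before moving them to the GPU, which the model's forward pass expects. A minimal sketch of the same idea using NumPy, whose `np.expand_dims(x, 0)` mirrors PyTorch's `Tensor.unsqueeze(0)` — the array shapes here are illustrative assumptions, not the Magma processor's real tensor shapes:

```python
import numpy as np

# Stand-ins for the processor outputs; real shapes depend on the Magma processor.
pixel_values = np.zeros((3, 224, 224), dtype=np.float32)  # one image: C x H x W
image_sizes = np.array([224, 224])                        # height, width of that image

# np.expand_dims(x, 0) plays the role of torch's x.unsqueeze(0):
# it prepends a batch axis of size 1 without changing the underlying data.
batched_pixel_values = np.expand_dims(pixel_values, 0)
batched_image_sizes = np.expand_dims(image_sizes, 0)

print(batched_pixel_values.shape)  # (1, 3, 224, 224)
print(batched_image_sizes.shape)   # (1, 2)
```

In PyTorch the same reshaping is often written as `x[None]` or `x.unsqueeze(0)`; the diff applies it per key because the processor here returns unbatched tensors for a single image.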