---
base_model:
- allenai/Molmo-7B-D-0924
pipeline_tag: image-text-to-text
library_name: transformers
---
# ELAM-7B

ELAM (Evaluative Large Action Model) is a Large Action Model (LAM) based on Molmo-7B-D that can also evaluate user expectations on screenshots of user interfaces. It was fine-tuned specifically on 17,708 automotive UI images in German and English.
The evaluation dataset [AutomotiveUI-Bench-4K](https://huggingface.co/datasets/sparks-solutions/AutomotiveUI-Bench-4K) is available on Hugging Face.
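
If you want to inspect or reproduce the evaluation, the benchmark can be loaded with the `datasets` library installed in the Quick-Start environment below. This is only a minimal sketch; splits and column names follow the dataset card, not this example.

```python
from datasets import load_dataset

# Minimal sketch: download and inspect the benchmark.
# Split and column names are defined by the dataset card, not assumed here.
ds = load_dataset("sparks-solutions/AutomotiveUI-Bench-4K")
print(ds)
```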
# Quick-Start

```bash
conda create -n elam python=3.10 -y
conda activate elam
pip install datasets==3.5.0 einops==0.8.1 torchvision==0.20.1 accelerate==1.6.0
pip install transformers==4.48.2
```
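
The model is loaded in bfloat16 with `device_map="auto"` below, so a CUDA-capable GPU with enough memory for a 7B model is assumed. This optional check (not part of the original quick-start) confirms that before downloading the weights:

```python
import torch

# Optional sanity check before loading the 7B model in bfloat16.
print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
```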

```python
import re

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

# Load processor
model_name = "sparks-solutions/ELAM-7B"
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, torch_dtype="bfloat16", device_map="auto")

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name, trust_remote_code=True, torch_dtype="bfloat16", device_map="auto"
)


def preprocess_elam_prompt(user_request: str, label_class: str):
    """Apply the ELAM prompt template that matches the request class."""
    if label_class == "Expected Result":
        return f"Evaluate this statement about the image:\n'{user_request}'\nThink step by step, conclude whether the evaluation is 'PASSED' or 'FAILED' and point to the UI element that corresponds to this evaluation."
    elif label_class == "Test Action":
        return f"Identify and point to the UI element that corresponds to this test action:\n{user_request}"


def postprocess_response_elam(response: str):
    """Parse Molmo-style point coordinates from the response and return [x, y] floats in [0, 1]."""
    pattern = r'<point x="(?P<x>\d+\.\d+)" y="(?P<y>\d+\.\d+)"'
    match = re.search(pattern, response)
    if match:
        x_coord_raw = float(match.group("x"))
        y_coord_raw = float(match.group("y"))
        x_coord = x_coord_raw / 100
        y_coord = y_coord_raw / 100
        return [x_coord, y_coord]
    else:
        return [-1, -1]
```
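
To make the point format concrete: the model embeds coordinates as percentages of the image size in a `<point x="..." y="...">` tag, which `postprocess_response_elam` converts to normalized values in [0, 1]. The response string below is made up purely for illustration; actual model output will be worded differently.

```python
sample_response = 'The home button is at the bottom left. <point x="12.3" y="87.5">home button</point>'
print(postprocess_response_elam(sample_response))  # [0.123, 0.875]
```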

Two prompt types were fine-tuned for UI testing (the short snippet after this list shows the rendered prompt for each type):

1. *Test Action*: These prompts take an instruction (e.g., "tap music note in bottom navigation bar") and return the corresponding tap coordinates.
2. *Expected Result*: These prompts take an expectation (e.g., "notification toggle switch is disabled") and return "PASSED" or "FAILED" along with the coordinates of the relevant UI element.
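
Both requests below are taken verbatim from the full example that follows; this just prints what the model actually receives for each type:

```python
print(preprocess_elam_prompt("Tap home button", "Test Action"))
print(preprocess_elam_prompt("The home icon is white", "Expected Result"))
```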

```python
image_path = "path/to/your/ui/image"
user_request = "Tap home button"  # or "The home icon is white"
request_type = "Test Action"  # or "Expected Result"

image = Image.open(image_path)
elam_prompt = preprocess_elam_prompt(user_request, request_type)

inputs = processor.process(
    images=[image],
    text=elam_prompt,
)

# Move inputs to the correct device, make a batch of size 1, and cast floats to bfloat16
inputs_bfloat16 = {}
for k, v in inputs.items():
    if v.dtype == torch.float32:
        inputs_bfloat16[k] = v.to(model.device).to(torch.bfloat16).unsqueeze(0)
    else:
        inputs_bfloat16[k] = v.to(model.device).unsqueeze(0)

inputs = inputs_bfloat16  # Replace original inputs with the correctly typed inputs

# Generate output
output = model.generate_from_batch(
    inputs, GenerationConfig(max_new_tokens=2048, stop_strings="<|endoftext|>"), tokenizer=processor.tokenizer
)

# Only keep the generated tokens and decode them to text
generated_tokens = output[0, inputs["input_ids"].size(1) :]
response = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
coordinates = postprocess_response_elam(response)

# Print outputs
print(f"ELAM response: {response}")
print(f"Got coordinates: {coordinates}")
```
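
The returned coordinates are normalized to [0, 1]. To act on the screenshot (e.g., to issue a tap), they still need to be scaled back to pixel space; a minimal sketch, assuming `image` and `coordinates` from the example above:

```python
# Scale normalized coordinates back to pixel positions on the screenshot.
if coordinates != [-1, -1]:
    x_px = round(coordinates[0] * image.width)
    y_px = round(coordinates[1] * image.height)
    print(f"Tap target in pixels: ({x_px}, {y_px})")
```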