Point output issues with Molmo-72B

#19
by pltrtt - opened

First of all, thank you for sharing this amazing model with the public.

After playing with the models for a while, Molmo-7B-D-0924 works as expected, but the bigger model, Molmo-72B-0924, struggles to output accurate points on my end. I am using 8 A100 40GB GPUs. The points output by Molmo-72B sometimes look good, but they can end up either on top of the objects or totally random. Here are some examples:

| Prompt | Molmo-7B-D | Molmo-72B |
| --- | --- | --- |
| Point out airplanes in this image | airplane-1.jpg | airplane-1_72b.jpg |
| Point out cars in this image | car_molmo-7B_0.png | car_molmo-72B_0.png |
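
For reference, the point answers come back as XML-like tags with coordinates on a 0-100 scale relative to the image size. The line below is a made-up illustration of that format (not actual model output); it is what `extract_points` in the code parses:

<points x1="23.5" y1="61.2" x2="71.8" y2="58.4" alt="cars">cars</points>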

Here is a minimal version of the code to reproduce the issue:

from vllm import LLM, SamplingParams
import PIL.Image
import re
import numpy as np
import cv2
from urllib.request import urlopen
import io

llm = LLM(model='allenai/Molmo-7B-D-0924', 
          trust_remote_code=True, 
          tensor_parallel_size=4)

# llm = LLM(model='allenai/Molmo-72B-0924', 
#           trust_remote_code=True, 
#           tensor_parallel_size=8)

sampling_params = SamplingParams(max_tokens=1000)

# Refer to the HuggingFace repo for the correct format to use
prompt = "Point out all cars in the image"

img_urls = [
    "https://www.shutterstock.com/shutterstock/videos/1110804299/thumb/1.jpg"
]
def extract_points(molmo_output, image_w, image_h):
    all_points = []
    for match in re.finditer(r'x\d*="\s*([0-9]+(?:\.[0-9]+)?)"\s+y\d*="\s*([0-9]+(?:\.[0-9]+)?)"', molmo_output):
        try:
            point = [float(match.group(i)) for i in range(1, 3)]
        except ValueError:
            pass
        else:
            point = np.array(point)
            if np.max(point) > 100:
                # Treat as an invalid output
                continue
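            # Molmo reports point coordinates on a 0-100 scale; convert to pixel coordinates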
            point /= 100.0
            point = point * np.array([image_w, image_h])
            point = point.astype(np.int32)
            all_points.append(point)
    return all_points

for idx, url in enumerate(img_urls):

    # Load the image using img_url
    image_pil = PIL.Image.open(urlopen(url))
    img_rgb = np.array(image_pil)

    # Single prompt inference
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {"image": image_pil},
    }, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        
    print(generated_text)

    points = extract_points(generated_text, img_rgb.shape[1], img_rgb.shape[0])
    print(points)
    # Draw the points on the image
    for point in points:
        cv2.circle(img_rgb, (int(point[0]), int(point[1])), 5, (0, 255, 0), -1)
        
    # Write the answer on the top left corner
    cv2.putText(img_rgb, generated_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    # Save the image with points 
    img_path = f'car_molmo-7B_{idx}.png'
    img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)

    cv2.imwrite(img_path, img_bgr)
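
As a quick sanity check that the parsing itself is fine regardless of which model is loaded, I can run `extract_points` on a hand-written string in the assumed output format (this is only an illustration, not model output) and get sensible pixel coordinates:

sample = '<points x1="23.5" y1="61.2" x2="71.8" y2="58.4" alt="cars">cars</points>'
print(extract_points(sample, 1280, 720))
# expected: points near (300, 440) and (919, 420) in pixel coordinates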

Could you help me check my code to see if anything needs adjusting to run inference with the 72B version? Any feedback is much appreciated.
