## Usage

### ONNXRuntime
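The example below uses `onnxruntime`, `huggingface_hub`, `transformers`, `numpy`, `Pillow`, and `requests`; install any that are missing, e.g. `pip install onnxruntime huggingface_hub transformers numpy pillow requests`.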

First, define the `read_gif_frames` helper function:

```python
import numpy as np
from PIL import Image, ImageSequence
import requests
from io import BytesIO
import os

def read_gif_frames(path_or_url, shortest_edge=None, center_crop=None):
    # Load GIF from URL or local path
    if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
        response = requests.get(path_or_url)
        gif = Image.open(BytesIO(response.content))
    elif os.path.exists(path_or_url):
        gif = Image.open(path_or_url)
    else:
        raise ValueError("Invalid URL or file path")

    # Ensure it's a GIF
    if gif.format != "GIF":
        raise ValueError("Not a GIF file")

    # Extract frames and convert to RGB
    frames = []
    for frame in ImageSequence.Iterator(gif):
        rgb_frame = frame.convert("RGB")  # Force 3 channels
        
        # Resize if specified
        if shortest_edge is not None:
            w, h = rgb_frame.size
            if h < w:
                new_h = shortest_edge
                new_w = int(w * shortest_edge / h)
            else:
                new_w = shortest_edge
                new_h = int(h * shortest_edge / w)
            rgb_frame = rgb_frame.resize((new_w, new_h), Image.LANCZOS)
        
        # Center crop if specified
        if center_crop is not None:
            w, h = rgb_frame.size
            left = (w - center_crop) // 2
            top = (h - center_crop) // 2
            right = left + center_crop
            bottom = top + center_crop
            rgb_frame = rgb_frame.crop((left, top, right, bottom))
        
        frame_np = np.array(rgb_frame, dtype=np.uint8)
        frame_np = np.transpose(frame_np, (2, 0, 1))  # HWC -> CHW
        frames.append(frame_np)

    return np.stack(frames)  # Shape: [num_frames, 3, height, width]
```
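For example, the helper can also read a clip from a local file rather than a URL (the `video.gif` path below is just a placeholder):

```python
# Resize so the shortest edge is 292 px, then center-crop to 256x256
frames = read_gif_frames("video.gif", shortest_edge=292, center_crop=256)
print(frames.shape)  # (num_frames, 3, 256, 256)
```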

You can then run the model as follows:

```python
import onnxruntime as ort
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import AutoConfig

model_id = "onnx-community/vjepa2-vitl-fpc32-256-diving48-ONNX"
config = AutoConfig.from_pretrained(model_id)
path = hf_hub_download(
    repo_id=model_id,
    filename="onnx/model.onnx",
)
ort_session = ort.InferenceSession(path)

# Load and preprocess video frames
video = read_gif_frames(
    "http://www.svcl.ucsd.edu/projects/resound/imgs/19.gif",
    shortest_edge=292,
    center_crop=256,
)
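# Normalize with the standard ImageNet mean/std and add a batch dimension -> [1, num_frames, 3, 256, 256]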
mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
inputs = {
    "pixel_values_videos": ((video / 255 - mean) / std)[np.newaxis, ...].astype(np.float32)
}

# Run the model
logits = ort_session.run(
    None,
    input_feed=inputs,
)[0]

top_k = 5
indices = np.argsort(logits[0])[-top_k:][::-1]

# Calculate softmax probabilities
exp_logits = np.exp(logits[0] - np.max(logits[0]))
softmax_probs = exp_logits / np.sum(exp_logits)

print(f"Top {top_k} predicted class names:")
for idx in indices:
    text_label = config.id2label[idx]
    print(f" - {text_label}: {softmax_probs[idx]:.2f}")

Example output:

```

Top 5 predicted class names:
 - ['Forward', '15som', 'NoTwis', 'PIKE']: 0.69
 - ['Reverse', 'Dive', 'NoTwis', 'PIKE']: 0.22
 - ['Inward', '15som', 'NoTwis', 'PIKE']: 0.06
 - ['Reverse', '15som', '05Twis', 'FREE']: 0.01
 - ['Forward', '25som', 'NoTwis', 'PIKE']: 0.00
```
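If a GPU build of ONNX Runtime is installed, you can request the CUDA execution provider when creating the session; a minimal sketch that reuses the `path` downloaded above and falls back to CPU otherwise:

```python
import onnxruntime as ort

# Use CUDA if the installed onnxruntime build exposes it, otherwise stay on CPU
if "CUDAExecutionProvider" in ort.get_available_providers():
    providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
else:
    providers = ["CPUExecutionProvider"]
ort_session = ort.InferenceSession(path, providers=providers)
```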