## Usage

### ONNXRuntime
First, define the `read_gif_frames` helper function:
```python
import numpy as np
from PIL import Image, ImageSequence
import requests
from io import BytesIO
import os

def read_gif_frames(path_or_url, shortest_edge=None, center_crop=None):
    # Load the GIF from a URL or a local path
    if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
        response = requests.get(path_or_url)
        gif = Image.open(BytesIO(response.content))
    elif os.path.exists(path_or_url):
        gif = Image.open(path_or_url)
    else:
        raise ValueError("Invalid URL or file path")

    # Ensure it's a GIF
    if gif.format != "GIF":
        raise ValueError("Not a GIF file")

    # Extract frames and convert to RGB
    frames = []
    for frame in ImageSequence.Iterator(gif):
        rgb_frame = frame.convert("RGB")  # Force 3 channels

        # Resize so the shortest edge matches `shortest_edge`, if specified
        if shortest_edge is not None:
            w, h = rgb_frame.size
            if h < w:
                new_h = shortest_edge
                new_w = int(w * shortest_edge / h)
            else:
                new_w = shortest_edge
                new_h = int(h * shortest_edge / w)
            rgb_frame = rgb_frame.resize((new_w, new_h), Image.LANCZOS)

        # Center crop to a square of side `center_crop`, if specified
        if center_crop is not None:
            w, h = rgb_frame.size
            left = (w - center_crop) // 2
            top = (h - center_crop) // 2
            right = left + center_crop
            bottom = top + center_crop
            rgb_frame = rgb_frame.crop((left, top, right, bottom))

        frame_np = np.array(rgb_frame, dtype=np.uint8)
        frame_np = np.transpose(frame_np, (2, 0, 1))  # HWC -> CHW
        frames.append(frame_np)

    return np.stack(frames)  # Shape: [num_frames, 3, height, width]
```
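The checkpoint name suggests the model was fine-tuned with 32 frames per clip (`fpc32`). If your GIF contains a different number of frames, you may want to subsample before inference. Below is a minimal sketch; the `sample_frames` helper is a hypothetical addition, assuming uniform temporal sampling is acceptable for your use case:

```python
def sample_frames(frames, num_frames=32):
    # Hypothetical helper: uniformly sample `num_frames` frames across the clip.
    # `frames` is the [num_frames, 3, H, W] array returned by read_gif_frames.
    indices = np.linspace(0, len(frames) - 1, num=num_frames).round().astype(int)
    return frames[indices]
```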
You can then run the model as follows:
```python
import onnxruntime as ort
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import AutoConfig

model_id = "onnx-community/vjepa2-vitl-fpc32-256-diving48-ONNX"
config = AutoConfig.from_pretrained(model_id)
path = hf_hub_download(
    repo_id=model_id,
    filename="onnx/model.onnx",
)
ort_session = ort.InferenceSession(path)

# Load and preprocess video frames
video = read_gif_frames(
    "http://www.svcl.ucsd.edu/projects/resound/imgs/19.gif",
    shortest_edge=292,
    center_crop=256,
)

# Normalize with the standard ImageNet statistics and add a batch dimension
mean = np.array([0.485, 0.456, 0.406]).reshape(3, 1, 1)
std = np.array([0.229, 0.224, 0.225]).reshape(3, 1, 1)
inputs = {
    "pixel_values_videos": ((video / 255 - mean) / std)[np.newaxis, ...].astype(np.float32)
}

# Run the model
logits = ort_session.run(
    None,
    input_feed=inputs,
)[0]

# Select the top-k classes and compute softmax probabilities
top_k = 5
indices = np.argsort(logits[0])[-top_k:][::-1]
exp_logits = np.exp(logits[0] - np.max(logits[0]))
softmax_probs = exp_logits / np.sum(exp_logits)

print(f"Top {top_k} predicted class names:")
for idx in indices:
    text_label = config.id2label[idx]
    print(f"  - {text_label}: {softmax_probs[idx]:.2f}")
```
Example output:

```
Top 5 predicted class names:
  - ['Forward', '15som', 'NoTwis', 'PIKE']: 0.69
  - ['Reverse', 'Dive', 'NoTwis', 'PIKE']: 0.22
  - ['Inward', '15som', 'NoTwis', 'PIKE']: 0.06
  - ['Reverse', '15som', '05Twis', 'FREE']: 0.01
  - ['Forward', '25som', 'NoTwis', 'PIKE']: 0.00
```
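If you are unsure about the expected input layout (for example, how many frames per clip the export accepts), you can query the session's input signature directly; this uses standard ONNX Runtime APIs and is not specific to this model:

```python
# Print each graph input's name, (possibly symbolic) shape, and element type
for model_input in ort_session.get_inputs():
    print(model_input.name, model_input.shape, model_input.type)
```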
## Model tree

- Base model: [facebook/vjepa2-vitl-fpc64-256](https://huggingface.co/facebook/vjepa2-vitl-fpc64-256)
- Fine-tuned checkpoint: [facebook/vjepa2-vitl-fpc32-256-diving48](https://huggingface.co/facebook/vjepa2-vitl-fpc32-256-diving48)