Upload 6 files
Browse files- aggregated_accuracy.csv +12 -0
- app.py +308 -0
- ocr_accuracy.csv +12 -0
- qa_accuracy.csv +12 -0
- requirements.txt +3 -0
- temporal_accuracy.csv +12 -0
aggregated_accuracy.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
|
2 |
+
InternVL2-5-2B,86.0,80.7,75.3,61.3,44.7,43.3,28.0,32.0,36.0,30.0,27.3,
|
3 |
+
InternVL2-5-4B,96.7,89.3,86.0,74.7,62.0,47.3,43.3,38.7,42.7,36.7,38.7,
|
4 |
+
InternVL2.5-8B,94.0,90.7,88.7,73.3,50.7,36.7,39.3,28.7,28.7,26.7,24.7,
|
5 |
+
LLaVA-Video-7B,97.0,96.0,94.0,95.0,94.0,70.0,67.0,52.0,43.0,33.0,40.0,28.0
|
6 |
+
Qwen2.5-VL-3B,92.0,91.0,88.0,91.0,89.0,86.0,75.0,63.0,44.0,34.0,36.0,29.0
|
7 |
+
Qwen2.5-VL-7B,96.0,96.7,96.7,95.3,94.7,86.7,86.0,65.3,50.0,39.3,39.3,
|
8 |
+
LLaMA-3.2B-11B,40.0,26.7,19.3,20.7,18.7,16.0,18.7,20.7,18.7,20.7,20.0,
|
9 |
+
VideoLLaMA3-7B,60.0,77.3,79.3,69.3,90.7,85.3,75.3,62.7,54.0,40.7,44.0,
|
10 |
+
Gemini 2.5 pro,97.3,97.3,98.7,98.0,97.3,96.7,98.0,96.7,88.0,70.0,70.7,65.3
|
11 |
+
LongVA-7B,87.3,87.3,74.7,79.3,76.0,58.7,50.0,46.0,26.7,30.0,30.0,25.3
|
12 |
+
ChatGPT 4.1,95.3,90.7,78.7,60.0,51.3,41.3,38.7,23.3,30.0,22.7,24.7,32.0
|
app.py
ADDED
@@ -0,0 +1,308 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Gradio demo – visualise benchmark accuracy curves.
|
3 |
+
|
4 |
+
Required CSV files (place in the *same* folder as app.py):
|
5 |
+
|
6 |
+
├── aggregated_accuracy.csv
|
7 |
+
├── qa_accuracy.csv
|
8 |
+
├── ocr_accuracy.csv
|
9 |
+
└── temporal_accuracy.csv
|
10 |
+
|
11 |
+
Each file has the columns
|
12 |
+
|
13 |
+
Model,<context‑length‑1>,<context‑length‑2>,…
|
14 |
+
|
15 |
+
where the context‑length headers are strings such as `30min`, `60min`, `120min`, …
|
16 |
+
|
17 |
+
No further cleaning is done apart from the cosmetic model-name replacements
|
18 |
+
listed in RENAME (e.g. “Gemini 2.5 pro” → “Gemini 2.5 Pro”, “LLaMA-3.2B-11B” → “LLaMA-3.2-11B-Vision”).
|
19 |
+
"""
|
20 |
+
|
21 |
+
from pathlib import Path
|
22 |
+
|
23 |
+
import pandas as pd
|
24 |
+
import plotly.graph_objects as go
|
25 |
+
import gradio as gr
|
26 |
+
import math
|
27 |
+
|
28 |
+
# --------------------------------------------------------------------- #
|
29 |
+
# Config #
|
30 |
+
# --------------------------------------------------------------------- #
|
31 |
+
|
32 |
+
# CSV file backing each benchmark, keyed by the internal benchmark name.
FILES = dict(
    aggregated="aggregated_accuracy.csv",
    qa="qa_accuracy.csv",
    ocr="ocr_accuracy.csv",
    temporal="temporal_accuracy.csv",
)

# Label shown in the UI for each internal benchmark key.
DISPLAY_LABELS = dict(
    aggregated="Aggregated",
    qa="QA",
    ocr="OCR",
    temporal="Temporal",
)

# Models ticked by default, per benchmark. Entries must be display names
# (i.e. the names after RENAME has been applied). A benchmark without a
# usable entry here gets the first six available models instead.
DEFAULT_MODELS: dict[str, list[str]] = dict(
    aggregated=[
        "Gemini 2.5 Pro",
        "ChatGPT 4.1",
        "Qwen2.5-VL-7B",
        "InternVL2.5-8B",
        "LLaMA-3.2-11B-Vision",
    ],
)

# Regex pattern -> display name, applied to the "Model" column on load.
RENAME = {
    r"gpt4\.1": "ChatGPT 4.1",
    r"Gemini\s2\.5\spro": "Gemini 2.5 Pro",
    r"LLaMA-3\.2B-11B": "LLaMA-3.2-11B-Vision",
}
|
65 |
+
|
66 |
+
# --------------------------------------------------------------------- #
|
67 |
+
# Data loading #
|
68 |
+
# --------------------------------------------------------------------- #
|
69 |
+
|
70 |
+
def _read_csv(path: str | Path) -> pd.DataFrame:
    """Load one benchmark CSV and normalise its model names.

    Args:
        path: Location of the CSV. A relative path is first tried as given
            (i.e. against the current working directory); if nothing exists
            there it is looked up in the folder containing this script, so
            the app also works when started from another directory.

    Returns:
        The table with the ``Model`` column passed through the ``RENAME``
        regex replacements and cast to ``str``.

    Raises:
        FileNotFoundError: If the file is found in neither location.
        KeyError: If the CSV has no ``Model`` column.
    """
    csv_path = Path(path)
    if not csv_path.is_absolute() and not csv_path.exists():
        # Fall back to the script's own folder, where the CSVs are expected.
        csv_path = Path(__file__).resolve().parent / csv_path
    df = pd.read_csv(csv_path)
    df["Model"] = df["Model"].replace(RENAME, regex=True).astype(str)
    return df


# One DataFrame per benchmark, loaded once at import time.
dfs: dict[str, pd.DataFrame] = {name: _read_csv(path) for name, path in FILES.items()}
|
76 |
+
|
77 |
+
# --------------------------------------------------------------------- #
|
78 |
+
# Colour palette and model metadata #
|
79 |
+
# --------------------------------------------------------------------- #
|
80 |
+
|
81 |
+
import plotly.express as px
|
82 |
+
|
83 |
+
# Colour-blind-safe qualitative palette; colours are picked modulo its length.
SAFE_PALETTE = px.colors.qualitative.Safe

# Every model name found in any benchmark table, sorted so that a model's
# position in this list (and hence its colour and marker) is deterministic.
ALL_MODELS: list[str] = sorted(
    set().union(*(frame["Model"].unique() for frame in dfs.values()))
)

# Marker shapes, picked modulo the list length like the colours.
MARKER_SYMBOLS = [
    "circle", "square", "triangle-up", "diamond", "cross",
    "triangle-down", "x", "triangle-right", "triangle-left", "pentagon",
]

# Context-length columns: every column of the aggregated table except "Model".
TIME_COLS = [
    column
    for column in dfs["aggregated"].columns
    if not column.lower() == "model"
]
|
102 |
+
|
103 |
+
|
104 |
+
def _pretty_time(label: str) -> str:
|
105 |
+
"""‘30min’ → ‘30min’; ‘120min’ → ‘2hr’; keeps original if no match."""
|
106 |
+
if label.endswith("min"):
|
107 |
+
minutes = int(label[:-3])
|
108 |
+
if minutes >= 60:
|
109 |
+
hours = minutes / 60
|
110 |
+
return f"{hours:.0f}hr" if hours.is_integer() else f"{hours:.1f}hr"
|
111 |
+
return label
|
112 |
+
|
113 |
+
|
114 |
+
# Raw column header -> label shown on the X axis.
TIME_LABELS = dict(zip(TIME_COLS, map(_pretty_time, TIME_COLS)))
|
115 |
+
|
116 |
+
# --------------------------------------------------------------------- #
|
117 |
+
# Plotting #
|
118 |
+
# --------------------------------------------------------------------- #
|
119 |
+
|
120 |
+
def render_chart(
    benchmark: str,
    models: list[str],
    log_scale: bool,
) -> go.Figure:
    """Build the accuracy-vs-video-duration line chart for one benchmark.

    Args:
        benchmark: Benchmark name. It is lower-cased before the lookup in
            ``dfs``, so both the internal key and the display label work.
        models: Model names to draw. Names without a row in the selected
            table are skipped.
        log_scale: Use a logarithmic Y axis instead of the fixed 0-100
            linear one.

    Returns:
        A dark-themed Plotly figure with one line+marker trace per plotted
        model.

    Raises:
        KeyError: If the lower-cased ``benchmark`` is not a key of ``dfs``.
    """
    bench_key = benchmark.lower()
    df = dfs[bench_key]
    fig = go.Figure()

    palette = SAFE_PALETTE

    # The X labels are identical for every trace, so build them once.
    x_labels = [TIME_LABELS[c] for c in TIME_COLS]

    # Smallest plotted accuracy across the selected models; it determines the
    # lower bound of the log axis.
    min_y_val = None

    for idx, m in enumerate(models or []):
        row = df.loc[df["Model"] == m]
        if row.empty:
            continue

        # Zero and missing (NaN, i.e. empty CSV cell) values both become None
        # so Plotly leaves a gap; NaN must not reach min()/log10() below.
        y = [
            None if pd.isna(val) or val == 0 else val
            for val in row[TIME_COLS].values.flatten()
        ]

        present = [val for val in y if val is not None]
        if present:
            cur_min = min(present)
            if min_y_val is None or cur_min < min_y_val:
                min_y_val = cur_min

        # Colour/marker follow the model's position in the global list so a
        # model looks the same whichever benchmark or selection is shown.
        model_idx = ALL_MODELS.index(m) if m in ALL_MODELS else idx
        color = palette[model_idx % len(palette)]
        symbol = MARKER_SYMBOLS[model_idx % len(MARKER_SYMBOLS)]
        fig.add_trace(
            go.Scatter(
                x=x_labels,
                y=y,
                mode="lines+markers",
                name=m,
                line=dict(width=3, color=color),
                marker=dict(size=6, color=color, symbol=symbol),
                connectgaps=False,
            )
        )

    # Set Y-axis properties
    if log_scale:
        # Fall back to 0.1 if there are no valid points.
        if min_y_val is None or min_y_val <= 0:
            min_y_val = 0.1
        # For a log axis Plotly takes the range as log10 exponents. The upper
        # bound is 10^2 = 100; the lower bound is capped at 10^1 so the range
        # cannot collapse to [2, 2] when every plotted value is 100.
        lower_exp = min(math.floor(math.log10(min_y_val)), 1)
        yaxis_range = [lower_exp, 2]
        yaxis_type = "log"
    else:
        yaxis_range = [0, 100]
        yaxis_type = "linear"

    fig.update_layout(
        title=f"{DISPLAY_LABELS.get(bench_key, bench_key.capitalize())} Accuracy Over Time",
        xaxis_title="Video Duration",
        yaxis_title="Accuracy (%)",
        yaxis_type=yaxis_type,
        yaxis_range=yaxis_range,
        legend_title="Model",
        legend=dict(
            orientation="h",
            y=-0.25,
            x=0.5,
            xanchor="center",
            tracegroupgap=8,
            itemwidth=60,
        ),
        margin=dict(t=40, r=20, b=80, l=60),
        template="plotly_dark",
        font=dict(family="Inter,Helvetica,Arial,sans-serif", size=14),
        title_font=dict(size=20, family="Inter,Helvetica,Arial,sans-serif", color="white"),
        xaxis=dict(gridcolor="rgba(255,255,255,0.15)"),
        yaxis=dict(gridcolor="rgba(255,255,255,0.15)"),
        hoverlabel=dict(bgcolor="#1e1e1e", font_color="#eeeeee", bordercolor="#888"),
    )
    return fig
|
200 |
+
|
201 |
+
|
202 |
+
# --------------------------------------------------------------------- #
|
203 |
+
# UI #
|
204 |
+
# --------------------------------------------------------------------- #
|
205 |
+
|
206 |
+
CSS = """
|
207 |
+
#controls {
|
208 |
+
padding: 8px 12px;
|
209 |
+
}
|
210 |
+
.scrollbox {
|
211 |
+
max-height: 300px;
|
212 |
+
overflow-y: auto;
|
213 |
+
}
|
214 |
+
body, .gradio-container {
|
215 |
+
font-family: 'Inter', 'Helvetica', sans-serif;
|
216 |
+
}
|
217 |
+
.gradio-container h1, .gradio-container h2 {
|
218 |
+
font-weight: 600;
|
219 |
+
}
|
220 |
+
|
221 |
+
#controls, .scrollbox {
|
222 |
+
background: rgba(255,255,255,0.02);
|
223 |
+
border-radius: 6px;
|
224 |
+
}
|
225 |
+
|
226 |
+
input[type="checkbox"]:checked {
|
227 |
+
accent-color: #FF715E;
|
228 |
+
}
|
229 |
+
"""
|
230 |
+
|
231 |
+
def available_models(bench: str) -> list[str]:
    """Return the distinct model names of benchmark ``bench``, sorted ascending.

    ``bench`` must be a key of ``dfs``; an unknown key raises ``KeyError``.
    """
    names = list(dfs[bench]["Model"].unique())
    names.sort()
    return names
|
233 |
+
|
234 |
+
|
235 |
+
def default_models(bench: str) -> list[str]:
    """Pick the models that start out selected for benchmark ``bench``.

    The names configured in ``DEFAULT_MODELS`` are used, restricted to those
    actually available for the benchmark and kept in their configured order.
    When none of them is available (or nothing is configured), the first six
    available models are returned instead.
    """
    choices = available_models(bench)
    preselected = [
        name for name in DEFAULT_MODELS.get(bench, []) if name in choices
    ]
    return preselected or choices[:6]
|
245 |
+
|
246 |
+
|
247 |
+
# Page layout: a benchmark selector and log-scale toggle on top, the chart
# below them, and the model checklist underneath.
with gr.Blocks(theme=gr.themes.Base(), css=CSS) as demo:
    gr.Markdown(
        """
# 📈 TimeScope

How long can your video model keep up?
"""
    )

    # ---- top controls row ---- #
    with gr.Row():
        benchmark_dd = gr.Dropdown(
            label="Type",
            choices=list(DISPLAY_LABELS.values()),
            value=DISPLAY_LABELS["aggregated"],
            scale=1,
        )
        log_cb = gr.Checkbox(
            label="Log-scale Y-axis",
            value=False,
            scale=1,
        )

    # ---- models list and plot ---- #
    plot_out = gr.Plot(
        render_chart("Aggregated", default_models("aggregated"), False)
    )

    models_cb = gr.CheckboxGroup(
        label="Models",
        choices=available_models("aggregated"),
        value=default_models("aggregated"),
        interactive=True,
        elem_classes=["scrollbox"],
    )

    # ---- dynamic callbacks ---- #
    def _update_models(bench: str, log_scale: bool):
        """Refresh the model checklist and the chart for a new benchmark.

        Both outputs are produced by the same callback so the chart is always
        drawn with the new benchmark's default selection. Rendering from a
        separate listener on the dropdown would receive the previous
        benchmark's selection and could overwrite the correct chart.
        """
        bench_key = bench.lower()
        opts = available_models(bench_key)
        defaults = default_models(bench_key)
        # Use generic gr.update for compatibility across Gradio versions
        return (
            gr.update(choices=opts, value=defaults),
            render_chart(bench, defaults, log_scale),
        )

    benchmark_dd.change(
        fn=_update_models,
        inputs=[benchmark_dd, log_cb],
        outputs=[models_cb, plot_out],
        queue=False,
    )

    # Selection and scale changes only need the chart redrawn.
    for ctrl in (models_cb, log_cb):
        ctrl.change(
            fn=render_chart,
            inputs=[benchmark_dd, models_cb, log_cb],
            outputs=plot_out,
            queue=False,
        )

# Only start the server when run as a script, so importing this module
# (e.g. for tests or tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch(share=True)
|
ocr_accuracy.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
|
2 |
+
InternVL2-5-2B,96.0,100.0,94.0,74.0,50.0,46.0,34.0,30.0,50.0,28.0,30.0,
|
3 |
+
InternVL2-5-4B,100.0,94.0,86.0,68.0,56.0,36.0,36.0,36.0,34.0,32.0,28.0,
|
4 |
+
LLaVA-Video-7B,100.0,100.0,96.0,96.0,98.0,68.0,70.0,48.0,34.0,40.0,44.0,34.0
|
5 |
+
Qwen2.5-VL-3B,94.0,86.0,88.0,86.0,84.0,86.0,84.0,88.0,62.0,52.0,48.0,52.0
|
6 |
+
Qwen2.5-VL-7B,98.0,100.0,100.0,96.0,98.0,86.0,96.0,92.0,62.0,46.0,50.0,
|
7 |
+
InternVL2.5-8B,100.0,100.0,96.0,74.0,52.0,38.0,42.0,46.0,40.0,38.0,44.0,
|
8 |
+
LLaMA-3.2B-11B,22.0,0.0,0.0,2.0,0.0,8.0,4.0,4.0,2.0,2.0,4.0,
|
9 |
+
VideoLLaMA3-7B,100.0,100.0,98.0,98.0,100.0,78.0,72.0,56.0,40.0,36.0,38.0,
|
10 |
+
Gemini 2.5 pro,100.0,100.0,100.0,100.0,100.0,98.0,100.0,98.0,88.0,76.0,84.0,70.0
|
11 |
+
LongVA-7B,94.0,94.0,86.0,96.0,90.0,68.0,70.0,66.0,32.0,40.0,24.0,34.0
|
12 |
+
ChatGPT 4.1,96.0,82.0,80.0,74.0,60.0,40.0,48.0,32.0,40.0,26.0,22.0,32.0
|
qa_accuracy.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
|
2 |
+
InternVL2-5-2B,98.0,84.0,78.0,62.0,36.0,28.0,30.0,30.0,24.0,30.0,18.0,
|
3 |
+
InternVL2-5-4B,100.0,90.0,92.0,76.0,56.0,56.0,52.0,50.0,54.0,42.0,46.0,
|
4 |
+
LLaVA-Video-7B,100.0,100.0,100.0,100.0,98.0,66.0,62.0,58.0,62.0,36.0,36.0,38.0
|
5 |
+
Qwen2.5-VL-3B,100.0,98.0,98.0,100.0,98.0,86.0,76.0,54.0,50.0,28.0,42.0,32.0
|
6 |
+
Qwen2.5-VL-7B,100.0,100.0,100.0,100.0,100.0,86.0,84.0,56.0,50.0,54.0,46.0,
|
7 |
+
InternVL2.5-8B,100.0,92.0,94.0,74.0,40.0,38.0,50.0,28.0,32.0,20.0,20.0,
|
8 |
+
LLaMA-3.2B-11B,54.0,44.0,34.0,42.0,38.0,32.0,42.0,38.0,34.0,36.0,38.0,
|
9 |
+
VideoLLaMA3-7B,30.0,58.0,54.0,54.0,82.0,88.0,68.0,52.0,58.0,40.0,40.0,
|
10 |
+
Gemini 2.5 pro,100.0,100.0,100.0,100.0,100.0,98.0,100.0,98.0,88.0,76.0,84.0,70.0
|
11 |
+
LongVA-7B,96.0,78.0,80.0,86.0,76.0,68.0,66.0,50.0,30.0,34.0,30.0,36.0
|
12 |
+
ChatGPT 4.1,100.0,100.0,78.0,68.0,68.0,56.0,48.0,32.0,44.0,40.0,40.0,56.0
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
pandas
|
2 |
+
plotly
|
3 |
+
gradio
|
temporal_accuracy.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
|
2 |
+
InternVL2-5-2B,64.0,58.0,54.0,48.0,48.0,56.0,20.0,36.0,34.0,32.0,34.0,
|
3 |
+
InternVL2-5-4B,90.0,84.0,80.0,80.0,74.0,50.0,42.0,30.0,40.0,36.0,42.0,
|
4 |
+
LLaVA-Video-7B,90.0,88.0,86.0,88.0,86.0,76.0,68.0,50.0,34.0,24.0,40.0,12.0
|
5 |
+
Qwen2.5-VL-3B,84.0,88.0,78.0,88.0,84.0,86.0,64.0,48.0,20.0,22.0,18.0,2.0
|
6 |
+
Qwen2.5-VL-7B,90.0,90.0,90.0,90.0,86.0,88.0,78.0,48.0,38.0,18.0,22.0,
|
7 |
+
InternVL2.5-8B,82.0,80.0,76.0,72.0,60.0,34.0,26.0,12.0,14.0,22.0,10.0,
|
8 |
+
LLaMA-3.2B-11B,44.0,36.0,24.0,18.0,18.0,8.0,10.0,20.0,20.0,24.0,18.0,
|
9 |
+
VideoLLaMA3-7B,50.0,74.0,86.0,56.0,90.0,90.0,86.0,80.0,64.0,46.0,54.0,
|
10 |
+
Gemini 2.5 pro,92.0,96.0,96.0,94.0,94.0,92.0,94.0,92.0,88.0,68.0,58.0,58.0
|
11 |
+
LongVA-7B,88.0,82.0,62.0,74.0,72.0,58.0,50.0,38.0,18.0,14.0,22.0,12.0
|
12 |
+
ChatGPT 4.1,90.0,90.0,78.0,38.0,26.0,28.0,20.0,6.0,6.0,2.0,12.0,8.0
|