Commit 0c1ac1a (verified) by orrzohar
Parent(s): 0e75f52

Upload 6 files
aggregated_accuracy.csv ADDED
@@ -0,0 +1,12 @@
+ Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
+ InternVL2-5-2B,86.0,80.7,75.3,61.3,44.7,43.3,28.0,32.0,36.0,30.0,27.3,
+ InternVL2-5-4B,96.7,89.3,86.0,74.7,62.0,47.3,43.3,38.7,42.7,36.7,38.7,
+ InternVL2.5-8B,94.0,90.7,88.7,73.3,50.7,36.7,39.3,28.7,28.7,26.7,24.7,
+ LLaVA-Video-7B,97.0,96.0,94.0,95.0,94.0,70.0,67.0,52.0,43.0,33.0,40.0,28.0
+ Qwen2.5-VL-3B,92.0,91.0,88.0,91.0,89.0,86.0,75.0,63.0,44.0,34.0,36.0,29.0
+ Qwen2.5-VL-7B,96.0,96.7,96.7,95.3,94.7,86.7,86.0,65.3,50.0,39.3,39.3,
+ LLaMA-3.2B-11B,40.0,26.7,19.3,20.7,18.7,16.0,18.7,20.7,18.7,20.7,20.0,
+ VideoLLaMA3-7B,60.0,77.3,79.3,69.3,90.7,85.3,75.3,62.7,54.0,40.7,44.0,
+ Gemini 2.5 pro,97.3,97.3,98.7,98.0,97.3,96.7,98.0,96.7,88.0,70.0,70.7,65.3
+ LongVA-7B,87.3,87.3,74.7,79.3,76.0,58.7,50.0,46.0,26.7,30.0,30.0,25.3
+ ChatGPT 4.1,95.3,90.7,78.7,60.0,51.3,41.3,38.7,23.3,30.0,22.7,24.7,32.0
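
All four accuracy files share this wide layout: one row per model, one accuracy column per video duration, and a trailing empty cell where a model has no 480min run. For anyone reusing the result files outside the app, here is a minimal sketch of how the format loads with pandas; this is illustrative only, but the file name and the 65.3 value come straight from the table above, and the NaN behaviour for trailing commas is what app.py below relies on:

```python
import pandas as pd

# Wide format: "Model" plus one accuracy column per duration ("1min" ... "480min").
df = pd.read_csv("aggregated_accuracy.csv")
print(df.columns.tolist())                      # ['Model', '1min', '2min', ..., '480min']

scores = df.set_index("Model")
print(scores.loc["Gemini 2.5 pro", "480min"])   # 65.3
# Rows ending in a bare comma (no 480min score) load as NaN:
print(scores["480min"].isna().any())            # True
```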
app.py ADDED
@@ -0,0 +1,308 @@
+ """
+ Gradio demo – visualise benchmark accuracy curves.
+
+ Required CSV files (place in the *same* folder as app.py):
+
+ ├── aggregated_accuracy.csv
+ ├── qa_accuracy.csv
+ ├── ocr_accuracy.csv
+ └── temporal_accuracy.csv
+
+ Each file has the columns
+
+ Model,<context-length-1>,<context-length-2>,…
+
+ where the context-length headers are strings such as `30min`, `60min`, `120min`, …
+
+ No further cleaning / renaming is done apart from three cosmetic replacements
+ (“gpt4.1” → “ChatGPT 4.1”, “Gemini 2.5 pro” → “Gemini 2.5 Pro”,
+ “LLaMA-3.2B-11B” → “LLaMA-3.2-11B-Vision”).
+ """
+
+ from pathlib import Path
+
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+ import gradio as gr
+ import math
+
+ # --------------------------------------------------------------------- #
+ # Config
+ # --------------------------------------------------------------------- #
+
+ FILES = {
+     "aggregated": "aggregated_accuracy.csv",
+     "qa": "qa_accuracy.csv",
+     "ocr": "ocr_accuracy.csv",
+     "temporal": "temporal_accuracy.csv",
+ }
+
+ # Mapping of internal benchmark keys to nicely formatted display labels
+ DISPLAY_LABELS = {
+     "aggregated": "Aggregated",
+     "qa": "QA",
+     "ocr": "OCR",
+     "temporal": "Temporal",
+ }
+
+ # Optional: choose which models are selected by default for each benchmark.
+ # Use the *display names* exactly as they appear in the Models list.
+ # If a benchmark is missing, it falls back to the first six models.
+ DEFAULT_MODELS: dict[str, list[str]] = {
+     "aggregated": [
+         "Gemini 2.5 Pro",
+         "ChatGPT 4.1",
+         "Qwen2.5-VL-7B",
+         "InternVL2.5-8B",
+         "LLaMA-3.2-11B-Vision",
+     ],
+ }
+
+ RENAME = {
+     r"gpt4\.1": "ChatGPT 4.1",
+     r"Gemini\s2\.5\spro": "Gemini 2.5 Pro",
+     r"LLaMA-3\.2B-11B": "LLaMA-3.2-11B-Vision",
+ }
+
+ # --------------------------------------------------------------------- #
+ # Data loading
+ # --------------------------------------------------------------------- #
+
+ def _read_csv(path: str | Path) -> pd.DataFrame:
+     df = pd.read_csv(path)
+     df["Model"] = df["Model"].replace(RENAME, regex=True).astype(str)
+     return df
+
+ dfs: dict[str, pd.DataFrame] = {name: _read_csv(path) for name, path in FILES.items()}
+
+ # --------------------------------------------------------------------- #
+ # Colour palette and model metadata
+ # --------------------------------------------------------------------- #
+
+ SAFE_PALETTE = px.colors.qualitative.Safe  # colour-blind-safe qualitative palette (10 colours)
+
+ # Deterministic list of all unique model names to ensure consistent colour mapping
+ ALL_MODELS: list[str] = sorted({m for df in dfs.values() for m in df["Model"].unique()})
+
+ MARKER_SYMBOLS = [
+     "circle",
+     "square",
+     "triangle-up",
+     "diamond",
+     "cross",
+     "triangle-down",
+     "x",
+     "triangle-right",
+     "triangle-left",
+     "pentagon",
+ ]
+
+ TIME_COLS = [c for c in dfs["aggregated"].columns if c.lower() != "model"]
+
+
+ def _pretty_time(label: str) -> str:
+     """‘30min’ → ‘30min’; ‘120min’ → ‘2hr’; keeps original if no match."""
+     if label.endswith("min"):
+         minutes = int(label[:-3])
+         if minutes >= 60:
+             hours = minutes / 60
+             return f"{hours:.0f}hr" if hours.is_integer() else f"{hours:.1f}hr"
+     return label
+
+
+ TIME_LABELS = {c: _pretty_time(c) for c in TIME_COLS}
+
+ # --------------------------------------------------------------------- #
+ # Plotting
+ # --------------------------------------------------------------------- #
+
+ def render_chart(
+     benchmark: str,
+     models: list[str],
+     log_scale: bool,
+ ) -> go.Figure:
+     bench_key = benchmark.lower()
+     df = dfs[bench_key]
+     fig = go.Figure()
+
+     # Define colour and marker based on a deterministic mapping
+     palette = SAFE_PALETTE
+
+     # Determine minimum non-zero Y value across selected models for log scaling
+     min_y_val = None
+
+     for idx, m in enumerate(models):
+         row = df.loc[df["Model"] == m]
+         if row.empty:
+             continue
+         y = row[TIME_COLS].values.flatten()
+         # Show gaps for 0 / missing (NaN) values
+         y = [None if (pd.isna(val) or val == 0) else float(val) for val in y]
+
+         # Track minimum non-zero accuracy
+         y_non_none = [val for val in y if val is not None]
+         if y_non_none:
+             cur_min = min(y_non_none)
+             if min_y_val is None or cur_min < min_y_val:
+                 min_y_val = cur_min
+
+         model_idx = ALL_MODELS.index(m) if m in ALL_MODELS else idx
+         color = palette[model_idx % len(palette)]
+         symbol = MARKER_SYMBOLS[model_idx % len(MARKER_SYMBOLS)]
+         fig.add_trace(
+             go.Scatter(
+                 x=[TIME_LABELS[c] for c in TIME_COLS],
+                 y=y,
+                 mode="lines+markers",
+                 name=m,
+                 line=dict(width=3, color=color),
+                 marker=dict(size=6, color=color, symbol=symbol),
+                 connectgaps=False,
+             )
+         )
+
+     # Set Y-axis properties
+     if log_scale:
+         # Fall back to 0.1 if there are no valid points
+         if min_y_val is None or min_y_val <= 0:
+             min_y_val = 0.1
+         # Plotly expects log10 values for the range when the axis type is "log"
+         yaxis_range = [math.floor(math.log10(min_y_val)), 2]  # max at 10^2 = 100
+         yaxis_type = "log"
+     else:
+         yaxis_range = [0, 100]
+         yaxis_type = "linear"
+
+     fig.update_layout(
+         title=f"{DISPLAY_LABELS.get(bench_key, bench_key.capitalize())} Accuracy Over Time",
+         xaxis_title="Video Duration",
+         yaxis_title="Accuracy (%)",
+         yaxis_type=yaxis_type,
+         yaxis_range=yaxis_range,
+         legend_title="Model",
+         legend=dict(
+             orientation="h",
+             y=-0.25,
+             x=0.5,
+             xanchor="center",
+             tracegroupgap=8,
+             itemwidth=60,
+         ),
+         margin=dict(t=40, r=20, b=80, l=60),
+         template="plotly_dark",
+         font=dict(family="Inter,Helvetica,Arial,sans-serif", size=14),
+         title_font=dict(size=20, family="Inter,Helvetica,Arial,sans-serif", color="white"),
+         xaxis=dict(gridcolor="rgba(255,255,255,0.15)"),
+         yaxis=dict(gridcolor="rgba(255,255,255,0.15)"),
+         hoverlabel=dict(bgcolor="#1e1e1e", font_color="#eeeeee", bordercolor="#888"),
+     )
+     return fig
+
+
+ # --------------------------------------------------------------------- #
+ # UI
+ # --------------------------------------------------------------------- #
+
+ CSS = """
+ #controls {
+   padding: 8px 12px;
+ }
+ .scrollbox {
+   max-height: 300px;
+   overflow-y: auto;
+ }
+ body, .gradio-container {
+   font-family: 'Inter', 'Helvetica', sans-serif;
+ }
+ .gradio-container h1, .gradio-container h2 {
+   font-weight: 600;
+ }
+
+ #controls, .scrollbox {
+   background: rgba(255,255,255,0.02);
+   border-radius: 6px;
+ }
+
+ input[type="checkbox"]:checked {
+   accent-color: #FF715E;
+ }
+ """
+
+ def available_models(bench: str) -> list[str]:
+     return sorted(dfs[bench]["Model"].unique())
+
+
+ def default_models(bench: str) -> list[str]:
+     """Return the list of default-selected models for a benchmark."""
+     opts = available_models(bench)
+     configured = DEFAULT_MODELS.get(bench, [])
+     # Keep only those present in opts
+     valid = [m for m in configured if m in opts]
+     if not valid:
+         # Fall back to the first six
+         valid = opts[:6]
+     return valid
+
+
+ with gr.Blocks(theme=gr.themes.Base(), css=CSS) as demo:
+     gr.Markdown(
+         """
+         # 📈 TimeScope
+
+         How long can your video model keep up?
+         """
+     )
+
+     # ---- top controls row ---- #
+     with gr.Row():
+         benchmark_dd = gr.Dropdown(
+             label="Type",
+             choices=list(DISPLAY_LABELS.values()),
+             value=DISPLAY_LABELS["aggregated"],
+             scale=1,
+         )
+         log_cb = gr.Checkbox(
+             label="Log-scale Y-axis",
+             value=False,
+             scale=1,
+         )
+
+     # ---- models list and plot ---- #
+     plot_out = gr.Plot(
+         render_chart("Aggregated", default_models("aggregated"), False)
+     )
+
+     models_cb = gr.CheckboxGroup(
+         label="Models",
+         choices=available_models("aggregated"),
+         value=default_models("aggregated"),
+         interactive=True,
+         elem_classes=["scrollbox"],
+     )
+
+     # ---- dynamic callbacks ---- #
+     def _update_models(bench: str):
+         bench_key = bench.lower()
+         opts = available_models(bench_key)
+         defaults = default_models(bench_key)
+         # Use generic gr.update for compatibility across Gradio versions
+         return gr.update(choices=opts, value=defaults)
+
+     benchmark_dd.change(
+         fn=_update_models,
+         inputs=benchmark_dd,
+         outputs=models_cb,
+         queue=False,
+     )
+
+     for ctrl in (benchmark_dd, models_cb, log_cb):
+         ctrl.change(
+             fn=render_chart,
+             inputs=[benchmark_dd, models_cb, log_cb],
+             outputs=plot_out,
+             queue=False,
+         )
+
+     # Make legend interaction clearer: click a legend entry to toggle its trace
+
+ demo.launch(share=True)
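
The same result files can also be charted without the Gradio layer, for example to produce a static HTML report. Below is a hedged, self-contained sketch (not part of this commit) that rebuilds a basic version of the aggregated chart with pandas and plotly only; it uses the raw model names from the CSV (no RENAME applied), and the output path is illustrative:

```python
import pandas as pd
import plotly.graph_objects as go

# Rebuild a plain version of the aggregated chart outside the Gradio app.
df = pd.read_csv("aggregated_accuracy.csv")
time_cols = [c for c in df.columns if c.lower() != "model"]

fig = go.Figure()
for _, row in df.iterrows():
    # NaN cells (missing 480min runs) are left as gaps in the line.
    y = [None if pd.isna(v) else float(v) for v in row[time_cols]]
    fig.add_trace(go.Scatter(x=time_cols, y=y, mode="lines+markers", name=row["Model"]))

fig.update_layout(
    title="Aggregated Accuracy Over Time",
    xaxis_title="Video Duration",
    yaxis_title="Accuracy (%)",
    yaxis_range=[0, 100],
)
fig.write_html("aggregated_accuracy.html", include_plotlyjs="cdn")  # illustrative output path
```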
ocr_accuracy.csv ADDED
@@ -0,0 +1,12 @@
+ Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
+ InternVL2-5-2B,96.0,100.0,94.0,74.0,50.0,46.0,34.0,30.0,50.0,28.0,30.0,
+ InternVL2-5-4B,100.0,94.0,86.0,68.0,56.0,36.0,36.0,36.0,34.0,32.0,28.0,
+ LLaVA-Video-7B,100.0,100.0,96.0,96.0,98.0,68.0,70.0,48.0,34.0,40.0,44.0,34.0
+ Qwen2.5-VL-3B,94.0,86.0,88.0,86.0,84.0,86.0,84.0,88.0,62.0,52.0,48.0,52.0
+ Qwen2.5-VL-7B,98.0,100.0,100.0,96.0,98.0,86.0,96.0,92.0,62.0,46.0,50.0,
+ InternVL2.5-8B,100.0,100.0,96.0,74.0,52.0,38.0,42.0,46.0,40.0,38.0,44.0,
+ LLaMA-3.2B-11B,22.0,0.0,0.0,2.0,0.0,8.0,4.0,4.0,2.0,2.0,4.0,
+ VideoLLaMA3-7B,100.0,100.0,98.0,98.0,100.0,78.0,72.0,56.0,40.0,36.0,38.0,
+ Gemini 2.5 pro,100.0,100.0,100.0,100.0,100.0,98.0,100.0,98.0,88.0,76.0,84.0,70.0
+ LongVA-7B,94.0,94.0,86.0,96.0,90.0,68.0,70.0,66.0,32.0,40.0,24.0,34.0
+ ChatGPT 4.1,96.0,82.0,80.0,74.0,60.0,40.0,48.0,32.0,40.0,26.0,22.0,32.0
qa_accuracy.csv ADDED
@@ -0,0 +1,12 @@
+ Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
+ InternVL2-5-2B,98.0,84.0,78.0,62.0,36.0,28.0,30.0,30.0,24.0,30.0,18.0,
+ InternVL2-5-4B,100.0,90.0,92.0,76.0,56.0,56.0,52.0,50.0,54.0,42.0,46.0,
+ LLaVA-Video-7B,100.0,100.0,100.0,100.0,98.0,66.0,62.0,58.0,62.0,36.0,36.0,38.0
+ Qwen2.5-VL-3B,100.0,98.0,98.0,100.0,98.0,86.0,76.0,54.0,50.0,28.0,42.0,32.0
+ Qwen2.5-VL-7B,100.0,100.0,100.0,100.0,100.0,86.0,84.0,56.0,50.0,54.0,46.0,
+ InternVL2.5-8B,100.0,92.0,94.0,74.0,40.0,38.0,50.0,28.0,32.0,20.0,20.0,
+ LLaMA-3.2B-11B,54.0,44.0,34.0,42.0,38.0,32.0,42.0,38.0,34.0,36.0,38.0,
+ VideoLLaMA3-7B,30.0,58.0,54.0,54.0,82.0,88.0,68.0,52.0,58.0,40.0,40.0,
+ Gemini 2.5 pro,100.0,100.0,100.0,100.0,100.0,98.0,100.0,98.0,88.0,76.0,84.0,70.0
+ LongVA-7B,96.0,78.0,80.0,86.0,76.0,68.0,66.0,50.0,30.0,34.0,30.0,36.0
+ ChatGPT 4.1,100.0,100.0,78.0,68.0,68.0,56.0,48.0,32.0,44.0,40.0,40.0,56.0
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ pandas
+ plotly
+ gradio
temporal_accuracy.csv ADDED
@@ -0,0 +1,12 @@
+ Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
+ InternVL2-5-2B,64.0,58.0,54.0,48.0,48.0,56.0,20.0,36.0,34.0,32.0,34.0,
+ InternVL2-5-4B,90.0,84.0,80.0,80.0,74.0,50.0,42.0,30.0,40.0,36.0,42.0,
+ LLaVA-Video-7B,90.0,88.0,86.0,88.0,86.0,76.0,68.0,50.0,34.0,24.0,40.0,12.0
+ Qwen2.5-VL-3B,84.0,88.0,78.0,88.0,84.0,86.0,64.0,48.0,20.0,22.0,18.0,2.0
+ Qwen2.5-VL-7B,90.0,90.0,90.0,90.0,86.0,88.0,78.0,48.0,38.0,18.0,22.0,
+ InternVL2.5-8B,82.0,80.0,76.0,72.0,60.0,34.0,26.0,12.0,14.0,22.0,10.0,
+ LLaMA-3.2B-11B,44.0,36.0,24.0,18.0,18.0,8.0,10.0,20.0,20.0,24.0,18.0,
+ VideoLLaMA3-7B,50.0,74.0,86.0,56.0,90.0,90.0,86.0,80.0,64.0,46.0,54.0,
+ Gemini 2.5 pro,92.0,96.0,96.0,94.0,94.0,92.0,94.0,92.0,88.0,68.0,58.0,58.0
+ LongVA-7B,88.0,82.0,62.0,74.0,72.0,58.0,50.0,38.0,18.0,14.0,22.0,12.0
+ ChatGPT 4.1,90.0,90.0,78.0,38.0,26.0,28.0,20.0,6.0,6.0,2.0,12.0,8.0