Commit 0c1ac1a (verified) by orrzohar
Parent(s): 0e75f52

Upload 6 files
aggregated_accuracy.csv ADDED
@@ -0,0 +1,12 @@
+ Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
+ InternVL2-5-2B,86.0,80.7,75.3,61.3,44.7,43.3,28.0,32.0,36.0,30.0,27.3,
+ InternVL2-5-4B,96.7,89.3,86.0,74.7,62.0,47.3,43.3,38.7,42.7,36.7,38.7,
+ InternVL2.5-8B,94.0,90.7,88.7,73.3,50.7,36.7,39.3,28.7,28.7,26.7,24.7,
+ LLaVA-Video-7B,97.0,96.0,94.0,95.0,94.0,70.0,67.0,52.0,43.0,33.0,40.0,28.0
+ Qwen2.5-VL-3B,92.0,91.0,88.0,91.0,89.0,86.0,75.0,63.0,44.0,34.0,36.0,29.0
+ Qwen2.5-VL-7B,96.0,96.7,96.7,95.3,94.7,86.7,86.0,65.3,50.0,39.3,39.3,
+ LLaMA-3.2B-11B,40.0,26.7,19.3,20.7,18.7,16.0,18.7,20.7,18.7,20.7,20.0,
+ VideoLLaMA3-7B,60.0,77.3,79.3,69.3,90.7,85.3,75.3,62.7,54.0,40.7,44.0,
+ Gemini 2.5 pro,97.3,97.3,98.7,98.0,97.3,96.7,98.0,96.7,88.0,70.0,70.7,65.3
+ LongVA-7B,87.3,87.3,74.7,79.3,76.0,58.7,50.0,46.0,26.7,30.0,30.0,25.3
+ ChatGPT 4.1,95.3,90.7,78.7,60.0,51.3,41.3,38.7,23.3,30.0,22.7,24.7,32.0
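
All four accuracy files share this wide layout: one row per model, one accuracy column per video duration, and a trailing empty cell where a model has no 480min run. For anyone reusing the result files outside the app, here is a minimal sketch of how the format loads with pandas; this is illustrative only, but the file name and the 65.3 value come straight from the table above, and the NaN behaviour for trailing commas is what app.py below relies on:

```python
import pandas as pd

# Wide format: "Model" plus one accuracy column per duration ("1min" ... "480min").
df = pd.read_csv("aggregated_accuracy.csv")
print(df.columns.tolist())                      # ['Model', '1min', '2min', ..., '480min']

scores = df.set_index("Model")
print(scores.loc["Gemini 2.5 pro", "480min"])   # 65.3
# Rows ending in a bare comma (no 480min score) load as NaN:
print(scores["480min"].isna().any())            # True
```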
app.py ADDED
@@ -0,0 +1,308 @@
+ """
+ Gradio demo – visualise benchmark accuracy curves.
+
+ Required CSV files (place in the *same* folder as app.py):
+
+ ├── aggregated_accuracy.csv
+ ├── qa_accuracy.csv
+ ├── ocr_accuracy.csv
+ └── temporal_accuracy.csv
+
+ Each file has the columns
+
+ Model,<context-length-1>,<context-length-2>,…
+
+ where the context-length headers are strings such as `30min`, `60min`, `120min`, …
+
+ No further cleaning / renaming is done apart from three cosmetic replacements
+ (“gpt4.1” → “ChatGPT 4.1”, “Gemini 2.5 pro” → “Gemini 2.5 Pro”,
+ “LLaMA-3.2B-11B” → “LLaMA-3.2-11B-Vision”).
+ """
+
+ from pathlib import Path
+
+ import pandas as pd
+ import plotly.express as px
+ import plotly.graph_objects as go
+ import gradio as gr
+ import math
+
+ # --------------------------------------------------------------------- #
+ # Config
+ # --------------------------------------------------------------------- #
+
+ FILES = {
+     "aggregated": "aggregated_accuracy.csv",
+     "qa": "qa_accuracy.csv",
+     "ocr": "ocr_accuracy.csv",
+     "temporal": "temporal_accuracy.csv",
+ }
+
+ # Mapping of internal benchmark keys to nicely formatted display labels
+ DISPLAY_LABELS = {
+     "aggregated": "Aggregated",
+     "qa": "QA",
+     "ocr": "OCR",
+     "temporal": "Temporal",
+ }
+
+ # Optional: choose which models are selected by default for each benchmark.
+ # Use the *display names* exactly as they appear in the Models list.
+ # If a benchmark is missing, it falls back to the first six models.
+ DEFAULT_MODELS: dict[str, list[str]] = {
+     "aggregated": [
+         "Gemini 2.5 Pro",
+         "ChatGPT 4.1",
+         "Qwen2.5-VL-7B",
+         "InternVL2.5-8B",
+         "LLaMA-3.2-11B-Vision",
+     ],
+ }
+
+ RENAME = {
+     r"gpt4\.1": "ChatGPT 4.1",
+     r"Gemini\s2\.5\spro": "Gemini 2.5 Pro",
+     r"LLaMA-3\.2B-11B": "LLaMA-3.2-11B-Vision",
+ }
+
+ # --------------------------------------------------------------------- #
+ # Data loading
+ # --------------------------------------------------------------------- #
+
+ def _read_csv(path: str | Path) -> pd.DataFrame:
+     df = pd.read_csv(path)
+     df["Model"] = df["Model"].replace(RENAME, regex=True).astype(str)
+     return df
+
+ dfs: dict[str, pd.DataFrame] = {name: _read_csv(path) for name, path in FILES.items()}
+
+ # --------------------------------------------------------------------- #
+ # Colour palette and model metadata
+ # --------------------------------------------------------------------- #
+
+ SAFE_PALETTE = px.colors.qualitative.Safe  # colour-blind-safe qualitative palette (10 colours)
+
+ # Deterministic list of all unique model names to ensure consistent colour mapping
+ ALL_MODELS: list[str] = sorted({m for df in dfs.values() for m in df["Model"].unique()})
+
+ MARKER_SYMBOLS = [
+     "circle",
+     "square",
+     "triangle-up",
+     "diamond",
+     "cross",
+     "triangle-down",
+     "x",
+     "triangle-right",
+     "triangle-left",
+     "pentagon",
+ ]
+
+ TIME_COLS = [c for c in dfs["aggregated"].columns if c.lower() != "model"]
+
+
+ def _pretty_time(label: str) -> str:
+     """‘30min’ → ‘30min’; ‘120min’ → ‘2hr’; keeps original if no match."""
+     if label.endswith("min"):
+         minutes = int(label[:-3])
+         if minutes >= 60:
+             hours = minutes / 60
+             return f"{hours:.0f}hr" if hours.is_integer() else f"{hours:.1f}hr"
+     return label
+
+
+ TIME_LABELS = {c: _pretty_time(c) for c in TIME_COLS}
+
+ # --------------------------------------------------------------------- #
+ # Plotting
+ # --------------------------------------------------------------------- #
+
+ def render_chart(
+     benchmark: str,
+     models: list[str],
+     log_scale: bool,
+ ) -> go.Figure:
+     bench_key = benchmark.lower()
+     df = dfs[bench_key]
+     fig = go.Figure()
+
+     # Define colour and marker based on a deterministic mapping
+     palette = SAFE_PALETTE
+
+     # Determine minimum non-zero Y value across selected models for log scaling
+     min_y_val = None
+
+     for idx, m in enumerate(models):
+         row = df.loc[df["Model"] == m]
+         if row.empty:
+             continue
+         y = row[TIME_COLS].values.flatten()
+         # Show gaps for 0 / missing (NaN) values
+         y = [None if (pd.isna(val) or val == 0) else float(val) for val in y]
+
+         # Track minimum non-zero accuracy
+         y_non_none = [val for val in y if val is not None]
+         if y_non_none:
+             cur_min = min(y_non_none)
+             if min_y_val is None or cur_min < min_y_val:
+                 min_y_val = cur_min
+
+         model_idx = ALL_MODELS.index(m) if m in ALL_MODELS else idx
+         color = palette[model_idx % len(palette)]
+         symbol = MARKER_SYMBOLS[model_idx % len(MARKER_SYMBOLS)]
+         fig.add_trace(
+             go.Scatter(
+                 x=[TIME_LABELS[c] for c in TIME_COLS],
+                 y=y,
+                 mode="lines+markers",
+                 name=m,
+                 line=dict(width=3, color=color),
+                 marker=dict(size=6, color=color, symbol=symbol),
+                 connectgaps=False,
+             )
+         )
+
+     # Set Y-axis properties
+     if log_scale:
+         # Fall back to 0.1 if there are no valid points
+         if min_y_val is None or min_y_val <= 0:
+             min_y_val = 0.1
+         # Plotly expects log10 values for the range when the axis type is "log"
+         yaxis_range = [math.floor(math.log10(min_y_val)), 2]  # max at 10^2 = 100
+         yaxis_type = "log"
+     else:
+         yaxis_range = [0, 100]
+         yaxis_type = "linear"
+
+     fig.update_layout(
+         title=f"{DISPLAY_LABELS.get(bench_key, bench_key.capitalize())} Accuracy Over Time",
+         xaxis_title="Video Duration",
+         yaxis_title="Accuracy (%)",
+         yaxis_type=yaxis_type,
+         yaxis_range=yaxis_range,
+         legend_title="Model",
+         legend=dict(
+             orientation="h",
+             y=-0.25,
+             x=0.5,
+             xanchor="center",
+             tracegroupgap=8,
+             itemwidth=60,
+         ),
+         margin=dict(t=40, r=20, b=80, l=60),
+         template="plotly_dark",
+         font=dict(family="Inter,Helvetica,Arial,sans-serif", size=14),
+         title_font=dict(size=20, family="Inter,Helvetica,Arial,sans-serif", color="white"),
+         xaxis=dict(gridcolor="rgba(255,255,255,0.15)"),
+         yaxis=dict(gridcolor="rgba(255,255,255,0.15)"),
+         hoverlabel=dict(bgcolor="#1e1e1e", font_color="#eeeeee", bordercolor="#888"),
+     )
+     return fig
+
+
+ # --------------------------------------------------------------------- #
+ # UI
+ # --------------------------------------------------------------------- #
+
+ CSS = """
+ #controls {
+   padding: 8px 12px;
+ }
+ .scrollbox {
+   max-height: 300px;
+   overflow-y: auto;
+ }
+ body, .gradio-container {
+   font-family: 'Inter', 'Helvetica', sans-serif;
+ }
+ .gradio-container h1, .gradio-container h2 {
+   font-weight: 600;
+ }
+
+ #controls, .scrollbox {
+   background: rgba(255,255,255,0.02);
+   border-radius: 6px;
+ }
+
+ input[type="checkbox"]:checked {
+   accent-color: #FF715E;
+ }
+ """
+
+ def available_models(bench: str) -> list[str]:
+     return sorted(dfs[bench]["Model"].unique())
+
+
+ def default_models(bench: str) -> list[str]:
+     """Return the list of default-selected models for a benchmark."""
+     opts = available_models(bench)
+     configured = DEFAULT_MODELS.get(bench, [])
+     # Keep only those present in opts
+     valid = [m for m in configured if m in opts]
+     if not valid:
+         # Fall back to the first six
+         valid = opts[:6]
+     return valid
+
+
+ with gr.Blocks(theme=gr.themes.Base(), css=CSS) as demo:
+     gr.Markdown(
+         """
+         # 📈 TimeScope
+
+         How long can your video model keep up?
+         """
+     )
+
+     # ---- top controls row ---- #
+     with gr.Row():
+         benchmark_dd = gr.Dropdown(
+             label="Type",
+             choices=list(DISPLAY_LABELS.values()),
+             value=DISPLAY_LABELS["aggregated"],
+             scale=1,
+         )
+         log_cb = gr.Checkbox(
+             label="Log-scale Y-axis",
+             value=False,
+             scale=1,
+         )
+
+     # ---- models list and plot ---- #
+     plot_out = gr.Plot(
+         render_chart("Aggregated", default_models("aggregated"), False)
+     )
+
+     models_cb = gr.CheckboxGroup(
+         label="Models",
+         choices=available_models("aggregated"),
+         value=default_models("aggregated"),
+         interactive=True,
+         elem_classes=["scrollbox"],
+     )
+
+     # ---- dynamic callbacks ---- #
+     def _update_models(bench: str):
+         bench_key = bench.lower()
+         opts = available_models(bench_key)
+         defaults = default_models(bench_key)
+         # Use generic gr.update for compatibility across Gradio versions
+         return gr.update(choices=opts, value=defaults)
+
+     benchmark_dd.change(
+         fn=_update_models,
+         inputs=benchmark_dd,
+         outputs=models_cb,
+         queue=False,
+     )
+
+     for ctrl in (benchmark_dd, models_cb, log_cb):
+         ctrl.change(
+             fn=render_chart,
+             inputs=[benchmark_dd, models_cb, log_cb],
+             outputs=plot_out,
+             queue=False,
+         )
+
+     # Make legend interaction clearer: click a legend entry to toggle its trace
+
+ demo.launch(share=True)
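
The same result files can also be charted without the Gradio layer, for example to produce a static HTML report. Below is a hedged, self-contained sketch (not part of this commit) that rebuilds a basic version of the aggregated chart with pandas and plotly only; it uses the raw model names from the CSV (no RENAME applied), and the output path is illustrative:

```python
import pandas as pd
import plotly.graph_objects as go

# Rebuild a plain version of the aggregated chart outside the Gradio app.
df = pd.read_csv("aggregated_accuracy.csv")
time_cols = [c for c in df.columns if c.lower() != "model"]

fig = go.Figure()
for _, row in df.iterrows():
    # NaN cells (missing 480min runs) are left as gaps in the line.
    y = [None if pd.isna(v) else float(v) for v in row[time_cols]]
    fig.add_trace(go.Scatter(x=time_cols, y=y, mode="lines+markers", name=row["Model"]))

fig.update_layout(
    title="Aggregated Accuracy Over Time",
    xaxis_title="Video Duration",
    yaxis_title="Accuracy (%)",
    yaxis_range=[0, 100],
)
fig.write_html("aggregated_accuracy.html", include_plotlyjs="cdn")  # illustrative output path
```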
ocr_accuracy.csv ADDED
@@ -0,0 +1,12 @@
+ Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
+ InternVL2-5-2B,96.0,100.0,94.0,74.0,50.0,46.0,34.0,30.0,50.0,28.0,30.0,
+ InternVL2-5-4B,100.0,94.0,86.0,68.0,56.0,36.0,36.0,36.0,34.0,32.0,28.0,
+ LLaVA-Video-7B,100.0,100.0,96.0,96.0,98.0,68.0,70.0,48.0,34.0,40.0,44.0,34.0
+ Qwen2.5-VL-3B,94.0,86.0,88.0,86.0,84.0,86.0,84.0,88.0,62.0,52.0,48.0,52.0
+ Qwen2.5-VL-7B,98.0,100.0,100.0,96.0,98.0,86.0,96.0,92.0,62.0,46.0,50.0,
+ InternVL2.5-8B,100.0,100.0,96.0,74.0,52.0,38.0,42.0,46.0,40.0,38.0,44.0,
+ LLaMA-3.2B-11B,22.0,0.0,0.0,2.0,0.0,8.0,4.0,4.0,2.0,2.0,4.0,
+ VideoLLaMA3-7B,100.0,100.0,98.0,98.0,100.0,78.0,72.0,56.0,40.0,36.0,38.0,
+ Gemini 2.5 pro,100.0,100.0,100.0,100.0,100.0,98.0,100.0,98.0,88.0,76.0,84.0,70.0
+ LongVA-7B,94.0,94.0,86.0,96.0,90.0,68.0,70.0,66.0,32.0,40.0,24.0,34.0
+ ChatGPT 4.1,96.0,82.0,80.0,74.0,60.0,40.0,48.0,32.0,40.0,26.0,22.0,32.0
qa_accuracy.csv ADDED
@@ -0,0 +1,12 @@
+ Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
+ InternVL2-5-2B,98.0,84.0,78.0,62.0,36.0,28.0,30.0,30.0,24.0,30.0,18.0,
+ InternVL2-5-4B,100.0,90.0,92.0,76.0,56.0,56.0,52.0,50.0,54.0,42.0,46.0,
+ LLaVA-Video-7B,100.0,100.0,100.0,100.0,98.0,66.0,62.0,58.0,62.0,36.0,36.0,38.0
+ Qwen2.5-VL-3B,100.0,98.0,98.0,100.0,98.0,86.0,76.0,54.0,50.0,28.0,42.0,32.0
+ Qwen2.5-VL-7B,100.0,100.0,100.0,100.0,100.0,86.0,84.0,56.0,50.0,54.0,46.0,
+ InternVL2.5-8B,100.0,92.0,94.0,74.0,40.0,38.0,50.0,28.0,32.0,20.0,20.0,
+ LLaMA-3.2B-11B,54.0,44.0,34.0,42.0,38.0,32.0,42.0,38.0,34.0,36.0,38.0,
+ VideoLLaMA3-7B,30.0,58.0,54.0,54.0,82.0,88.0,68.0,52.0,58.0,40.0,40.0,
+ Gemini 2.5 pro,100.0,100.0,100.0,100.0,100.0,98.0,100.0,98.0,88.0,76.0,84.0,70.0
+ LongVA-7B,96.0,78.0,80.0,86.0,76.0,68.0,66.0,50.0,30.0,34.0,30.0,36.0
+ ChatGPT 4.1,100.0,100.0,78.0,68.0,68.0,56.0,48.0,32.0,44.0,40.0,40.0,56.0
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ pandas
+ plotly
+ gradio
temporal_accuracy.csv ADDED
@@ -0,0 +1,12 @@
+ Model,1min,2min,3min,5min,10min,20min,30min,60min,120min,180min,300min,480min
+ InternVL2-5-2B,64.0,58.0,54.0,48.0,48.0,56.0,20.0,36.0,34.0,32.0,34.0,
+ InternVL2-5-4B,90.0,84.0,80.0,80.0,74.0,50.0,42.0,30.0,40.0,36.0,42.0,
+ LLaVA-Video-7B,90.0,88.0,86.0,88.0,86.0,76.0,68.0,50.0,34.0,24.0,40.0,12.0
+ Qwen2.5-VL-3B,84.0,88.0,78.0,88.0,84.0,86.0,64.0,48.0,20.0,22.0,18.0,2.0
+ Qwen2.5-VL-7B,90.0,90.0,90.0,90.0,86.0,88.0,78.0,48.0,38.0,18.0,22.0,
+ InternVL2.5-8B,82.0,80.0,76.0,72.0,60.0,34.0,26.0,12.0,14.0,22.0,10.0,
+ LLaMA-3.2B-11B,44.0,36.0,24.0,18.0,18.0,8.0,10.0,20.0,20.0,24.0,18.0,
+ VideoLLaMA3-7B,50.0,74.0,86.0,56.0,90.0,90.0,86.0,80.0,64.0,46.0,54.0,
+ Gemini 2.5 pro,92.0,96.0,96.0,94.0,94.0,92.0,94.0,92.0,88.0,68.0,58.0,58.0
+ LongVA-7B,88.0,82.0,62.0,74.0,72.0,58.0,50.0,38.0,18.0,14.0,22.0,12.0
+ ChatGPT 4.1,90.0,90.0,78.0,38.0,26.0,28.0,20.0,6.0,6.0,2.0,12.0,8.0