File size: 4,680 Bytes
d9f1459
 
 
 
 
 
 
 
 
 
 
 
 
ba2d7a0
 
 
 
 
 
 
 
d9f1459
 
 
 
 
 
 
 
 
 
ba2d7a0
 
 
 
 
 
 
 
 
 
 
d9f1459
 
ba2d7a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ec96acf
 
 
 
 
 
 
 
 
b0d0508
 
 
 
 
ba2d7a0
 
 
 
 
 
d9f1459
b0d0508
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import gradio as gr
import pandas as pd

# Leaderboard data, written as one record per model so that a model's scores
# stay next to its name. "Full" and "Segmented" are the two views offered by
# the UI; a score of None marks a result that is not available.
_COLUMNS = (
    "Model", "Type",
    "T (Full)", "TA (Full)", "TAC (Full)",
    "T (Segmented)", "TA (Segmented)", "TAC (Segmented)",
)

_ROWS = (
    # Text-only LLMs
    ("DeepSeek-R1 (671B MoE)", "LLM", 63.89, 45.81, 21.29, 71.18, 47.29, 21.36),
    ("DeepSeek-V3 (671B MoE)", "LLM", 64.91, 47.10, 23.65, 77.56, 51.35, 28.17),
    ("Llama-3.1 (8B)", "LLM", 57.80, 40.69, 19.08, 71.86, 43.42, 21.81),
    ("Llama-3.1 (70B)", "LLM", 63.37, 45.51, 19.29, 76.51, 50.64, 23.96),
    ("Llama-3.1 (405B)", "LLM", 62.83, 45.82, 22.67, 78.60, 49.50, 25.38),
    ("Mistral (7B)", "LLM", 49.94, 34.78, 14.13, 63.73, 39.23, 19.83),
    ("Mixtral-8x22B (141B MoE)", "LLM", 57.60, 39.86, 16.49, 70.12, 41.75, 19.56),
    ("Qwen2.5 (7B)", "LLM", 56.06, 41.01, 19.54, 65.57, 38.16, 20.76),
    ("Qwen2.5 (72B)", "LLM", 65.65, 47.42, 19.65, 79.20, 48.36, 22.77),
    ("Claude 3.5 Haiku", "LLM", 56.94, 40.66, 19.75, 68.18, 43.61, 21.54),
    ("Claude 3.5 Sonnet", "LLM", 65.32, 43.38, 22.54, 75.90, 46.47, 24.60),
    ("Gemini 1.5 Flash", "LLM", 56.90, 38.20, 18.97, 72.09, 44.83, 21.31),
    ("Gemini 2.0 Flash", "LLM", 58.36, 37.47, 19.20, 70.05, 42.11, 21.43),
    ("Gemini 1.5 Pro", "LLM", 62.78, 43.25, 21.26, 76.56, 46.38, 22.87),
    ("Gemini 2.0 Pro", "LLM", 60.75, 44.59, 18.63, 74.66, 47.18, 24.20),
    ("GPT-4o", "LLM", 57.43, 37.64, 15.35, 76.76, 45.90, 24.50),
    # Multimodal LLMs
    ("LLaVa-v1.6-Mistral (7B)", "MLLM", None, None, None, 16.46, 11.45, 3.30),
    ("Gemini 1.5 Flash (MLLM)", "MLLM", 64.91, 45.06, 20.66, 86.23, 54.21, 23.27),
    ("Gemini 2.0 Flash (MLLM)", "MLLM", 64.79, 40.07, 19.74, 82.24, 48.00, 23.52),
    ("Gemini 1.5 Pro (MLLM)", "MLLM", 66.22, 46.90, 23.23, 86.01, 53.51, 24.97),
    ("Gemini 2.0 Pro (MLLM)", "MLLM", 66.42, 46.17, 18.25, 86.04, 54.28, 25.21),
    ("GPT-4o (MLLM)", "MLLM", 64.95, 44.53, 19.60, 83.47, 51.15, 27.86),
)

# Transpose the records back into the column-name -> list-of-values mapping.
data = {
    column: list(values)
    for column, values in zip(_COLUMNS, zip(*_ROWS))
}

df = pd.DataFrame(data)

def filter_leaderboard(view="Segmented", model_type="All"):
    """Return the leaderboard table for one view, best TAC score first.

    Args:
        view: "Segmented" selects the segmented-score columns; any other
            value selects the full-score columns.
        model_type: "All" keeps every row; any other value keeps only the
            rows whose "Type" column equals it.

    Returns:
        A DataFrame with "Model", "Type" and the three score columns of the
        chosen view, sorted in descending order of the TAC column.
    """
    suffix = "Segmented" if view == "Segmented" else "Full"
    score_columns = [f"{metric} ({suffix})" for metric in ("T", "TA", "TAC")]
    selected = ["Model", "Type", *score_columns]

    rows = df if model_type == "All" else df[df["Type"] == model_type]
    return rows[selected].sort_values(by=score_columns[-1], ascending=False)

# Introductory Markdown shown below the page title. The text is kept flush
# left on purpose: Markdown would render indented lines as a code block.
_ABOUT_MARKDOWN = """
This leaderboard displays the performance (F1 scores) of various Large Language Models (LLMs) and Multimodal LLMs (MLLMs) on the **VideoConviction** benchmark.

---

### 📘 About the Benchmark
**VideoConviction** is a multimodal benchmark designed to evaluate **human conviction** and **stock market recommendation quality** from video content. It integrates vision, audio, and text to evaluate reasoning and persuasion detection in financial decision-making.

📝 **Paper**: [VideoConviction: A Multimodal Benchmark for Human Conviction and Stock Market Recommendations](https://doi.org/10.1145/3711896.3737417)  
📍 **Conference**: ACM SIGKDD 2025  
👥 **Authors**: Michael Galarnyk, Veer Kejriwal, Agam Shah, Yash Bhardwaj, Nicholas Watney Meyer, Anand Krishnan, Sudheer Chava

---

### 📄 Citation
```bibtex
@inproceedings{galarnyk2025videoconviction,
  author = {Michael Galarnyk and Veer Kejriwal and Agam Shah and Yash Bhardwaj and Nicholas Watney Meyer and Anand Krishnan and Sudheer Chava},
  title = {VideoConviction: A Multimodal Benchmark for Human Conviction and Stock Market Recommendations},
  booktitle = {Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2 (KDD '25)},
  year = {2025},
  location = {Toronto, ON, Canada},
  pages = {12},
  publisher = {ACM},
  doi = {10.1145/3711896.3737417}
}
```"""


def update_leaderboard(view, model_type):
    """Event handler: rebuild the table for the current dropdown values."""
    return filter_leaderboard(view, model_type)


# All components and event listeners are created inside the Blocks context so
# that they belong to `demo`; widgets built outside the `with` block are not
# part of the app, and listeners cannot be registered outside of it.
with gr.Blocks(title="VideoConviction LLM Leaderboard") as demo:
    gr.Markdown("# 📊 VideoConviction Benchmark Leaderboard")
    gr.Markdown(_ABOUT_MARKDOWN)

    # The two filters sit side by side above the table.
    with gr.Row():
        with gr.Column():
            view_dropdown = gr.Dropdown(
                label="Select View Type", choices=["Segmented", "Full"], value="Segmented"
            )
        with gr.Column():
            model_dropdown = gr.Dropdown(
                label="Filter by Model Type", choices=["All", "LLM", "MLLM"], value="All"
            )

    # The initial table matches the dropdown defaults above.
    leaderboard = gr.Dataframe(
        value=filter_leaderboard("Segmented", "All"),
        label="Leaderboard",
        interactive=False
    )

    # Changing either dropdown recomputes the table from both current values.
    view_dropdown.change(fn=update_leaderboard, inputs=[view_dropdown, model_dropdown], outputs=leaderboard)
    model_dropdown.change(fn=update_leaderboard, inputs=[view_dropdown, model_dropdown], outputs=leaderboard)

demo.launch()