init leaderboard
- .gitignore +2 -0
- README.md +26 -5
- app.py +215 -0
- gen_table.py +235 -0
- meta_data.py +101 -0
- requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+*ipynb
+__pycache__
README.md CHANGED
@@ -1,12 +1,33 @@
 ---
 title: Openvlm Subjective Leaderboard
-emoji:
-colorFrom:
+emoji: 🌎
+colorFrom: blue
 colorTo: green
 sdk: gradio
-sdk_version: 5.28.0
 app_file: app.py
-pinned:
+pinned: true
+license: apache-2.0
+tags:
+- leaderboard
+short_description: 'VLMEvalKit Evaluation Subjective Benchmark Results Collection'
+sdk_version: 4.44.1
 ---
 
-
+In this leaderboard, we display the subjective benchmark evaluation results obtained with VLMEvalKit.
+
+GitHub: https://github.com/open-compass/VLMEvalKit
+Report: https://arxiv.org/abs/2407.11691
+
+Please consider citing the report if this resource is useful to your research:
+
+```BibTex
+@misc{duan2024vlmevalkitopensourcetoolkitevaluating,
+      title={VLMEvalKit: An Open-Source Toolkit for Evaluating Large Multi-Modality Models},
+      author={Haodong Duan and Junming Yang and Yuxuan Qiao and Xinyu Fang and Lin Chen and Yuan Liu and Amit Agarwal and Zhe Chen and Mo Li and Yubo Ma and Hailong Sun and Xiangyu Zhao and Junbo Cui and Xiaoyi Dong and Yuhang Zang and Pan Zhang and Jiaqi Wang and Dahua Lin and Kai Chen},
+      year={2024},
+      eprint={2407.11691},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2407.11691},
+}
+```
app.py ADDED
@@ -0,0 +1,215 @@
```python
import abc
import gradio as gr

from gen_table import *
from meta_data import *

# import pandas as pd
# pd.set_option('display.max_colwidth', 0)

head_style = """
<style>
@media (min-width: 1536px)
{
    .gradio-container {
        min-width: var(--size-full) !important;
    }
}
</style>
"""

with gr.Blocks(title="OpenVLM Subjective Leaderboard", head=head_style) as demo:
    struct = load_results()
    timestamp = struct['time']
    EVAL_TIME = format_timestamp(timestamp)
    results = struct['results']
    N_MODEL = len(results)
    N_DATA = len(results['Phi-4-Vision']) - 1
    DATASETS = list(results['Phi-4-Vision'])
    DATASETS.remove('META')
    print(DATASETS)

    gr.Markdown(LEADERBORAD_INTRODUCTION.format(N_MODEL, N_DATA, EVAL_TIME))
    # abc.abstractproperty() instances are used as lightweight attribute containers, one per dataset tab
    structs = [abc.abstractproperty() for _ in range(N_DATA)]

    with gr.Tabs(elem_classes='tab-buttons') as tabs:
        with gr.TabItem('🏅 OpenVLM Subjective Leaderboard', elem_id='main', id=0):
            gr.Markdown(LEADERBOARD_MD['MAIN'])
            _, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
            table = generate_table(results, DEFAULT_BENCH)
            table['Rank'] = list(range(1, len(table) + 1))

            type_map = check_box['type_map']
            type_map['Rank'] = 'number'

            checkbox_group = gr.CheckboxGroup(
                choices=check_box['all'],
                value=check_box['required'],
                label='Evaluation Dimension',
                interactive=True,
            )

            headers = ['Rank'] + check_box['essential'] + checkbox_group.value
            with gr.Row():
                model_name = gr.Textbox(
                    value='Input the Model Name (fuzzy, case insensitive)',
                    label='Model Name',
                    interactive=True,
                    visible=True)
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label='Model Size',
                    interactive=True
                )
                model_type = gr.CheckboxGroup(
                    choices=MODEL_TYPE,
                    value=MODEL_TYPE,
                    label='Model Type',
                    interactive=True
                )
            data_component = gr.components.DataFrame(
                value=table[headers],
                type='pandas',
                datatype=[type_map[x] for x in headers],
                interactive=False,
                wrap=True,
                visible=True)

            def filter_df(fields, model_name, model_size, model_type):
                filter_list = ['Avg Score', 'Avg Rank', 'OpenSource']
                headers = ['Rank'] + check_box['essential'] + fields

                new_fields = [field for field in fields if field not in filter_list]
                df = generate_table(results, new_fields)

                df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
                df = df[df['flag']]
                df.pop('flag')
                if len(df):
                    df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
                    df = df[df['flag']]
                    df.pop('flag')
                df['Rank'] = list(range(1, len(df) + 1))
                default_val = 'Input the Model Name (fuzzy, case insensitive)'
                if model_name != default_val:
                    print(model_name)
                    model_name = model_name.lower()
                    method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Method']]
                    flag = [model_name in name for name in method_names]
                    df['TEMP_FLAG'] = flag
                    df = df[df['TEMP_FLAG'] == True]
                    df.pop('TEMP_FLAG')

                comp = gr.components.DataFrame(
                    value=df[headers],
                    type='pandas',
                    datatype=[type_map[x] for x in headers],
                    interactive=False,
                    wrap=True,
                    visible=True)
                return comp

            for cbox in [checkbox_group, model_size, model_type]:
                cbox.change(fn=filter_df, inputs=[checkbox_group, model_name, model_size, model_type], outputs=data_component)
            model_name.submit(fn=filter_df, inputs=[checkbox_group, model_name, model_size, model_type], outputs=data_component)

        with gr.TabItem('🔍 About', elem_id='about', id=1):
            gr.Markdown(urlopen(VLMEVALKIT_README).read().decode())

        for i, dataset in enumerate(DATASETS):
            with gr.TabItem(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
                if dataset in LEADERBOARD_MD:
                    gr.Markdown(LEADERBOARD_MD[dataset])

                s = structs[i]
                s.table, s.check_box = BUILD_L2_DF(results, dataset)
                s.type_map = s.check_box['type_map']
                s.type_map['Rank'] = 'number'

                s.checkbox_group = gr.CheckboxGroup(
                    choices=s.check_box['all'],
                    value=s.check_box['required'],
                    label=f'{dataset} CheckBoxes',
                    interactive=True,
                )
                s.headers = ['Rank'] + s.check_box['essential'] + s.checkbox_group.value
                s.table['Rank'] = list(range(1, len(s.table) + 1))

                with gr.Row():
                    s.model_name = gr.Textbox(
                        value='Input the Model Name (fuzzy, case insensitive)',
                        label='Model Name',
                        interactive=True,
                        visible=True)
                    s.model_size = gr.CheckboxGroup(
                        choices=MODEL_SIZE,
                        value=MODEL_SIZE,
                        label='Model Size',
                        interactive=True
                    )
                    s.model_type = gr.CheckboxGroup(
                        choices=MODEL_TYPE,
                        value=MODEL_TYPE,
                        label='Model Type',
                        interactive=True
                    )
                s.data_component = gr.components.DataFrame(
                    value=s.table[s.headers],
                    type='pandas',
                    datatype=[s.type_map[x] for x in s.headers],
                    interactive=False,
                    wrap=True,
                    visible=True)
                s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)

                def filter_df_l2(dataset_name, fields, model_name, model_size, model_type):
                    s = structs[DATASETS.index(dataset_name)]
                    headers = ['Rank'] + s.check_box['essential'] + fields
                    df = cp.deepcopy(s.table)
                    df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
                    df = df[df['flag']]
                    df.pop('flag')
                    if len(df):
                        df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
                        df = df[df['flag']]
                        df.pop('flag')
                    df['Rank'] = list(range(1, len(df) + 1))
                    default_val = 'Input the Model Name (fuzzy, case insensitive)'
                    if model_name != default_val:
                        print(model_name)
                        model_name = model_name.lower()
                        method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Method']]
                        flag = [model_name in name for name in method_names]
                        df['TEMP_FLAG'] = flag
                        df = df[df['TEMP_FLAG'] == True]
                        df.pop('TEMP_FLAG')

                    comp = gr.components.DataFrame(
                        value=df[headers],
                        type='pandas',
                        datatype=[s.type_map[x] for x in headers],
                        interactive=False,
                        wrap=True,
                        visible=True)
                    return comp

                for cbox in [s.checkbox_group, s.model_size, s.model_type]:
                    cbox.change(
                        fn=filter_df_l2,
                        inputs=[s.dataset, s.checkbox_group, s.model_name, s.model_size, s.model_type],
                        outputs=s.data_component)
                s.model_name.submit(
                    fn=filter_df_l2,
                    inputs=[s.dataset, s.checkbox_group, s.model_name, s.model_size, s.model_type],
                    outputs=s.data_component)

    with gr.Row():
        with gr.Accordion('Citation', open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button')

if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0')
```
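The tab layout above repeats one pattern: a `CheckboxGroup` picks which benchmark columns to display, and each `change`/`submit` event rebuilds the `DataFrame` component through a filter callback. Below is a minimal, self-contained sketch of that pattern; the table contents and column names are illustrative placeholders, not leaderboard data.

```python
import gradio as gr
import pandas as pd

# Toy stand-in for the leaderboard table (values are made up for illustration).
TABLE = pd.DataFrame({
    'Method': ['Model-A', 'Model-B'],
    'MMVet': [61.2, 55.4],
    'WildVision': [38.0, 24.5],
})

def filter_columns(fields):
    # Always keep the 'Method' column, then show only the selected benchmarks.
    headers = ['Method'] + [f for f in fields if f in TABLE.columns]
    return gr.components.DataFrame(value=TABLE[headers], interactive=False, wrap=True)

with gr.Blocks(title='Minimal filter sketch') as sketch:
    checkbox_group = gr.CheckboxGroup(
        choices=['MMVet', 'WildVision'],
        value=['MMVet', 'WildVision'],
        label='Evaluation Dimension',
        interactive=True,
    )
    data_component = gr.components.DataFrame(value=TABLE, interactive=False, wrap=True)
    # Re-render the table whenever the selection changes, as app.py does.
    checkbox_group.change(fn=filter_columns, inputs=checkbox_group, outputs=data_component)

if __name__ == '__main__':
    sketch.launch()
```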
gen_table.py ADDED
@@ -0,0 +1,235 @@
```python
import copy as cp
import json
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import DEFAULT_BENCH, META_FIELDS, URL


def listinstr(lst, s):
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False


def load_results():
    data = json.loads(urlopen(URL).read())
    return data


def nth_large(val, vals):
    return sum([1 for v in vals if v > val]) + 1


def format_timestamp(timestamp):
    date = timestamp[:-6]
    time = timestamp[-6:]
    date = date[:-4] + '.' + date[-4:-2] + '.' + date[-2:]
    time = time[:-4] + ':' + time[-4:-2] + ':' + time[-2:]
    return date + ' ' + time


def model_size_flag(sz, FIELDS):
    if pd.isna(sz) and 'Unknown' in FIELDS:
        return True
    if pd.isna(sz):
        return False
    if '<4B' in FIELDS and sz < 4:
        return True
    if '4B-10B' in FIELDS and sz >= 4 and sz < 10:
        return True
    if '10B-20B' in FIELDS and sz >= 10 and sz < 20:
        return True
    if '20B-40B' in FIELDS and sz >= 20 and sz < 40:
        return True
    if '>40B' in FIELDS and sz >= 40:
        return True
    return False


def model_type_flag(line, FIELDS):
    if 'OpenSource' in FIELDS and line['OpenSource'] == 'Yes':
        return True
    if 'API' in FIELDS and line['OpenSource'] == 'No':
        return True
    return False


def BUILD_L1_DF(results, fields):
    check_box = {}
    check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'Eval Date']
    # revise here to set the default datasets
    check_box['required'] = ['Avg Score', 'Avg Rank'] + DEFAULT_BENCH
    check_box['avg'] = ['Avg Score', 'Avg Rank']
    check_box['all'] = check_box['avg'] + fields
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    type_map['Language Model'] = type_map['Vision Model'] = 'html'
    type_map['OpenSource'] = 'str'
    type_map['Eval Date'] = 'str'
    check_box['type_map'] = type_map

    df = generate_table(results, fields)
    return df, check_box


def BUILD_L2_DF(results, dataset):
    res = defaultdict(list)
    sub = [v for v in results.values() if dataset in v]
    assert len(sub)
    fields = list(sub[0][dataset].keys())
    if dataset == 'Creation_MMBench':
        reward_field = [f"{x}:reward" for x in fields]
        vfs_field = [f"{x}:vfs" for x in fields]
        fields = reward_field + vfs_field
        raw_fields = list(sub[0][dataset].keys())

    print(f'fields: {fields}')
    print(res)

    non_overall_fields = [x for x in fields if 'Overall' not in x]
    overall_fields = [x for x in fields if 'Overall' in x]
    # if dataset == 'MME':
    #     non_overall_fields = [x for x in non_overall_fields if not listinstr(['Perception', 'Cognition'], x)]
    #     overall_fields = overall_fields + ['Perception', 'Cognition']
    # if dataset == 'OCRBench':
    #     non_overall_fields = [x for x in non_overall_fields if not listinstr(['Final Score'], x)]
    #     overall_fields = ['Final Score']

    for m in results:
        item = results[m]
        if dataset not in item:
            continue
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Param (B)':
                param = meta['Parameters']
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name, url = meta['Method']
                res[k].append(f'<a href="{url}">{name}</a>')
            elif k == 'Eval Date':
                eval_date = meta['Time'].split('/')
                assert len(eval_date) == 3
                eval_date = [x if len(x) > 1 else '0' + x for x in eval_date]
                eval_date = '/'.join(eval_date)
                res[k].append(eval_date)
            else:
                res[k].append(meta[k])
        fields = [x for x in fields]

        if dataset == 'Creation_MMBench':
            for d in non_overall_fields:
                original_d, data_type = d.split(':')[0], d.split(':')[-1]
                res[d].append(item[dataset][original_d][data_type])
            for d in overall_fields:
                original_d, data_type = d.split(':')[0], d.split(':')[-1]
                res[d].append(item[dataset][original_d][data_type])
                # res[d].append(f"{item[dataset][d]['reward']}/{item[dataset][d]['vfs']}")
        elif dataset in ['MMAlignBench', 'WildVision']:
            for d in non_overall_fields:
                res[d].append(item[dataset][d]['reward'])
            for d in overall_fields:
                res[d].append(item[dataset][d]['reward'])
        else:
            for d in non_overall_fields:
                res[d].append(item[dataset][d])
            for d in overall_fields:
                res[d].append(item[dataset][d])

    df = pd.DataFrame(res)
    all_fields = overall_fields + non_overall_fields
    # Use the overall fields as required fields (or the first 5 non-overall fields if there are none)
    required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]

    # if dataset == 'OCRBench':
    #     df = df.sort_values('Final Score')
    # elif dataset == 'COCO_VAL':
    #     df = df.sort_values('CIDEr')
    # elif dataset == 'VCR':
    #     df = df.sort_values('Overall-Jaccard')
    # else:
    #     df = df.sort_values('Overall')
    df = df.iloc[::-1]

    check_box = {}
    check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'Eval Date']
    check_box['required'] = required_fields
    check_box['all'] = all_fields
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    type_map['Language Model'] = type_map['Vision Model'] = 'html'
    type_map['OpenSource'] = 'str'
    type_map['Eval Date'] = 'str'
    check_box['type_map'] = type_map
    return df, check_box


def generate_table(results, fields):

    # def get_mmbench_v11(item):
    #     assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
    #     val = (item['MMBench_TEST_CN_V11']['Overall'] + item['MMBench_TEST_EN_V11']['Overall']) / 2
    #     val = float(f'{val:.1f}')
    #     return val

    res = defaultdict(list)
    # item holds each model's META entry and per-dataset results
    for i, m in enumerate(results):
        item = results[m]
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Param (B)':
                param = meta['Parameters']
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name, url = meta['Method']
                res[k].append(f'<a href="{url}">{name}</a>')
                res['name'].append(name)
            elif k == 'Eval Date':
                eval_date = meta['Time'].split('/')
                assert len(eval_date) == 3
                eval_date = [x if len(x) > 1 else '0' + x for x in eval_date]
                eval_date = '/'.join(eval_date)
                res[k].append(eval_date)
            else:
                res[k].append(meta[k])
        scores, ranks = [], []
        # each entry in fields is a dataset name
        for d in fields:
            key_name = 'Overall'
            if d in item:
                if d in ['Creation_MMBench', 'MMAlignBench', 'WildVision']:
                    if d == 'Creation_MMBench':
                        res[d].append(f"{item[d][key_name]['reward']}/{item[d][key_name]['vfs']}")  # need improve?
                    else:
                        res[d].append(item[d][key_name]['reward'])
                    scores.append((item[d][key_name]['reward'] + 100) / 2)
                    ranks.append(nth_large(item[d][key_name]['reward'], [x[d][key_name]['reward'] for x in results.values() if d in x]))
                else:
                    res[d].append(item[d][key_name])
                    scores.append(item[d][key_name])
                    ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values() if d in x]))
            else:
                res[d].append(None)
                scores.append(None)
                ranks.append(None)

        res['Avg Score'].append(round(np.mean(scores), 1) if None not in scores else None)
        res['Avg Rank'].append(round(np.mean(ranks), 2) if None not in ranks else None)

    df = pd.DataFrame(res)
    valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
    valid = valid.sort_values('Avg Score')
    valid = valid.iloc[::-1]
    if len(fields):
        missing = missing.sort_values(fields[0])
        missing = missing.iloc[::-1]
    df = pd.concat([valid, missing])
    return df
```
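Two small conventions in `gen_table.py` are easy to miss: `format_timestamp` assumes a packed `YYYYMMDDhhmmss` string (inferred from the slicing), reward-style benchmarks are rescaled from [-100, 100] to 0-100 before `Avg Score` is averaged, and `nth_large` produces 1-based ranks. A short illustrative check (values made up):

```python
# Illustrative check of the small transforms in gen_table.py
# (assumes a packed 'YYYYMMDDhhmmss' timestamp, as implied by the slicing).
from gen_table import format_timestamp, nth_large

print(format_timestamp('20250315143000'))    # -> '2025.03.15 14:30:00'

# Reward-style benchmarks (Creation_MMBench, MMAlignBench, WildVision) report a
# reward in [-100, 100]; generate_table rescales it to 0-100 before averaging:
reward = -12.5
print((reward + 100) / 2)                     # -> 43.75

# Ranks are 1-based counts of strictly better scores:
print(nth_large(43.75, [60.0, 43.75, 20.0]))  # -> 2
```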
meta_data.py ADDED
@@ -0,0 +1,101 @@
```python
# CONSTANTS-URL
URL = "http://opencompass.openxlab.space/assets/OpenVLM_Subjective_Leaderboard.json"
VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
# CONSTANTS-CITATION
CITATION_BUTTON_TEXT = r"""@inproceedings{duan2024vlmevalkit,
    title={Vlmevalkit: An open-source toolkit for evaluating large multi-modality models},
    author={Duan, Haodong and Yang, Junming and Qiao, Yuxuan and Fang, Xinyu and Chen, Lin and Liu, Yuan and Dong, Xiaoyi and Zang, Yuhang and Zhang, Pan and Wang, Jiaqi and others},
    booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
    pages={11198--11201},
    year={2024}
}"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# CONSTANTS-TEXT
LEADERBORAD_INTRODUCTION = """# OpenVLM Subjective Leaderboard
### Welcome to the OpenVLM Subjective Leaderboard! On this leaderboard, we share the evaluation results of VLMs obtained with the open-source framework:
### [*VLMEvalKit*: A Toolkit for Evaluating Large Vision-Language Models](https://github.com/open-compass/VLMEvalKit) 🏆
### Currently, the OpenVLM Subjective Leaderboard covers {} different VLMs (including GPT-4o, Gemini, Qwen2.5-VL, InternVL2.5, etc.) and {} different multi-modal benchmarks.

This leaderboard was last updated: {}.

The OpenVLM Subjective Leaderboard only includes open-source VLMs or API models that are publicly available. To add your own model to the leaderboard, please create a PR in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to support your VLM; we will then help with the evaluation and update the leaderboard. For any questions or concerns, please feel free to contact us at [opencompass, fangxinyu, dingshengyuan]@pjlab.org.cn.
"""
# CONSTANTS-FIELDS
META_FIELDS = [
    'Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Eval Date'
]
MAIN_FIELDS = [
    'Creation_MMBench', 'MIA-Bench', 'MM-IFEval',
    'MMAlignBench', 'MMVet', 'WildVision'
]
DEFAULT_BENCH = [
    'Creation_MMBench', 'MIA-Bench', 'MM-IFEval',
    'MMAlignBench', 'MMVet', 'WildVision'
]
MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
MODEL_TYPE = ['API', 'OpenSource']

# The markdown description shown for each benchmark tab
LEADERBOARD_MD = {}

LEADERBOARD_MD['MAIN'] = f"""
## Main Evaluation Results

- Metrics:
  - Avg Score: the average score across all selected VLM benchmarks (normalized to 0 - 100, the higher the better).
  - Avg Rank: the average rank across all selected VLM benchmarks (the lower the better).
  - Avg Score & Rank are calculated over the selected benchmarks. **When results for some selected benchmarks are missing, Avg Score / Rank will be None!!!**
- Metric for each dataset:
  - Creation-MMBench: Reward Score / Visual Factuality Score
  - MMAlignBench, WildVision: Reward Score
  - MIA-Bench, MM-IFEval, MMVet: Overall Score
- By default, we present the overall evaluation results based on {len(DEFAULT_BENCH)} VLM benchmarks, sorted in descending order of Avg Score.
- The following datasets are included in the main results: {', '.join(DEFAULT_BENCH)}.
- Detailed evaluation results for each dataset (whether or not it is included in the main table) are provided in the subsequent tabs.
"""

LEADERBOARD_MD['Creation_MMBench'] = """
## Creation MMBench Evaluation Results

- Creation-MMBench is a multimodal benchmark specifically designed to evaluate the creative capabilities of MLLMs. It consists of **765 test cases**, covering **51 fine-grained tasks** across **4 categories**: *Literary Writing*, *Creative Multimodal Understanding*, *Professional Functionality Writing*, and *Common Functionality Writing*. As an MLLM benchmark, it contains a total of **1001 images** spanning more than 25 categories, with some questions incorporating up to 9 images.
- Creation-MMBench includes carefully crafted **instance-specific criteria** for each test case, enabling assessment of both general response quality and visual-factual alignment in model-generated content. It employs a pair-wise comparison approach, in which the model's output is compared with the reference answer (considering the reference answer, the input prompt, and the visual content) to obtain the assessment result. **Dual Evaluation with GPT-4o as the judge model** is the evaluation strategy for Creation-MMBench.
- VFS stands for Visual Factuality Score. The rankings in this leaderboard are arranged in descending order of each model's overall reward, with **GPT-4o-1120** providing the **reference answer** for comparison and thus serving as the baseline model.
- See more details on the [**Creation-MMBench official webpage**](https://open-compass.github.io/Creation-MMBench/).
"""

LEADERBOARD_MD['MM-IFEval'] = """
## MM-IFEval Evaluation Results

- MM-IFEval is a comprehensive multimodal instruction-following benchmark designed to rigorously assess the capabilities of Multimodal Large Language Models (MLLMs). It includes 400 high-quality questions across two levels: 300 compose-level tasks that emphasize output format and content constraints, and 100 perception-level tasks that require precise visual understanding.
- To ensure accurate evaluation, MM-IFEval employs a hybrid strategy combining rule-based verification with LLM-based judge models. For more details, see https://arxiv.org/abs/2504.07957
- Currently, we use GPT-4o (gpt-4o-2024-05-13) whenever an LLM judge model is needed.
"""

LEADERBOARD_MD['MMAlignBench'] = """
## MMAlignBench Evaluation Results

- MM-AlignBench targets the evaluation of MLLMs' alignment with human preferences. It includes 252 high-quality, human-annotated samples with diverse image types and open-ended questions. Modeled after Arena-style benchmarks, it uses GPT-4o as the judge model and Claude-Sonnet-3 as the reference model.
- For more details, see https://github.com/PhoenixZ810/OmniAlign-V
"""

LEADERBOARD_MD['MIA-Bench'] = """
## MIA-Bench Evaluation Results

- MIA-Bench contains 400 carefully crafted image-prompt pairs that stress-test an MLLM's ability to **follow layered, exacting instructions** in its responses. ([MIA-Bench: Towards Better Instruction Following Evaluation of Multimodal LLMs](https://arxiv.org/abs/2407.01509), [project page](https://machinelearning.apple.com/research/mia-bench))
- The leaderboard reports the **overall average score**. The judge model is **GPT-4o**.
"""

LEADERBOARD_MD['MMVet'] = """
## MMVet Evaluation Results

- In the MMVet evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We perform the evaluation only once, given the limited variance among the results of multiple evaluation passes originally reported.
- No specific prompt template is adopted for **any** VLM.
"""

LEADERBOARD_MD['WildVision'] = """
## WildVision Evaluation Results

- WildVision-Bench offers **500 real-world multimodal prompts** curated from the WildVision-Arena crowdsourcing platform to benchmark models **by human preference** in natural conversations.
- The leaderboard reports the **overall reward score**.
- The judge model is **GPT-4o**; the reference model is **Claude-Sonnet-3**.
"""
```
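`URL` points at the JSON payload that `load_results` fetches. The exact schema is not part of this commit; the shape below is inferred from how `app.py` and `gen_table.py` index into it, and the field values are placeholders only.

```python
# Shape of the payload fetched from URL, as inferred from gen_table.py / app.py.
# All values below are placeholders, not real leaderboard numbers.
example_struct = {
    'time': '20250315143000',          # consumed by format_timestamp
    'results': {
        'Some-VLM': {
            'META': {
                'Method': ['Some-VLM', 'https://example.com/some-vlm'],
                'Parameters': '8B',     # '' when the model size is unknown
                'Language Model': 'Some-LLM',
                'Vision Model': 'Some-ViT',
                'OpenSource': 'Yes',
                'Time': '2025/3/15',    # zero-padded into 'Eval Date'
            },
            # Score-style benchmarks store a plain number under 'Overall' ...
            'MMVet': {'Overall': 61.2},
            # ... while reward-style benchmarks store a dict per split.
            'WildVision': {'Overall': {'reward': -12.5}},
            'Creation_MMBench': {'Overall': {'reward': -12.5, 'vfs': 8.1}},
        },
    },
}
```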
requirements.txt ADDED
@@ -0,0 +1,3 @@
+gradio==4.15.0
+numpy>=1.23.4
+pandas>=1.5.3
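To try the Space locally, the usual Gradio workflow applies. A minimal sketch, assuming the pinned dependencies are installed and the results URL is reachable:

```python
# Equivalent to `python app.py`: importing app builds the Blocks demo,
# and launch() serves it (port 7860 is an arbitrary choice here).
from app import demo

demo.launch(server_name='0.0.0.0', server_port=7860)
```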