nebulae09 committed on
Commit 0f5b104 · 1 Parent(s): 2c7e4ca

init leaderboard

Files changed (6):
  1. .gitignore +2 -0
  2. README.md +26 -5
  3. app.py +215 -0
  4. gen_table.py +235 -0
  5. meta_data.py +101 -0
  6. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
*ipynb
__pycache__
README.md CHANGED
@@ -1,12 +1,33 @@
  ---
  title: Openvlm Subjective Leaderboard
- emoji: 🐠
- colorFrom: purple
+ emoji: 🌎
+ colorFrom: blue
  colorTo: green
  sdk: gradio
- sdk_version: 5.28.0
  app_file: app.py
- pinned: false
+ pinned: true
+ license: apache-2.0
+ tags:
+ - leaderboard
+ short_description: 'VLMEvalKit Evaluation Subjective Benchmark Results Collection'
+ sdk_version: 4.44.1
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ In this leaderboard, we display subjective benchmark evaluation results obtained with VLMEvalKit.
+
+ GitHub: https://github.com/open-compass/VLMEvalKit
+ Report: https://arxiv.org/abs/2407.11691
+
+ Please consider citing the report if this resource is useful to your research:
+
+ ```BibTex
+ @misc{duan2024vlmevalkitopensourcetoolkitevaluating,
+   title={VLMEvalKit: An Open-Source Toolkit for Evaluating Large Multi-Modality Models},
+   author={Haodong Duan and Junming Yang and Yuxuan Qiao and Xinyu Fang and Lin Chen and Yuan Liu and Amit Agarwal and Zhe Chen and Mo Li and Yubo Ma and Hailong Sun and Xiangyu Zhao and Junbo Cui and Xiaoyi Dong and Yuhang Zang and Pan Zhang and Jiaqi Wang and Dahua Lin and Kai Chen},
+   year={2024},
+   eprint={2407.11691},
+   archivePrefix={arXiv},
+   primaryClass={cs.CV},
+   url={https://arxiv.org/abs/2407.11691},
+ }
+ ```
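The results rendered by this Space come from a JSON file published by OpenCompass. As a rough sketch (the URL and the `time` / `results` keys are taken from `meta_data.py` and `gen_table.py` below), the data can be fetched directly:

```python
# Minimal sketch: fetch the leaderboard results that the Space renders.
# URL and top-level keys follow meta_data.py and gen_table.load_results in this commit.
import json
from urllib.request import urlopen

URL = "http://opencompass.openxlab.space/assets/OpenVLM_Subjective_Leaderboard.json"
data = json.loads(urlopen(URL).read())
print(data['time'])                # timestamp string consumed by gen_table.format_timestamp
print(list(data['results'])[:3])   # first few model names
```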
app.py ADDED
@@ -0,0 +1,215 @@
import abc
import gradio as gr

from gen_table import *
from meta_data import *

# import pandas as pd
# pd.set_option('display.max_colwidth', 0)

head_style = """
<style>
@media (min-width: 1536px)
{
    .gradio-container {
        min-width: var(--size-full) !important;
    }
}
</style>
"""

with gr.Blocks(title="OpenVLM Subjective Leaderboard", head=head_style) as demo:
    struct = load_results()
    timestamp = struct['time']
    EVAL_TIME = format_timestamp(timestamp)
    results = struct['results']
    N_MODEL = len(results)
    N_DATA = len(results['Phi-4-Vision']) - 1
    DATASETS = list(results['Phi-4-Vision'])
    DATASETS.remove('META')
    print(DATASETS)

    gr.Markdown(LEADERBORAD_INTRODUCTION.format(N_MODEL, N_DATA, EVAL_TIME))
    structs = [abc.abstractproperty() for _ in range(N_DATA)]

    with gr.Tabs(elem_classes='tab-buttons') as tabs:
        with gr.TabItem('🏅 OpenVLM Subjective Leaderboard', elem_id='main', id=0):
            gr.Markdown(LEADERBOARD_MD['MAIN'])
            _, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
            table = generate_table(results, DEFAULT_BENCH)
            table['Rank'] = list(range(1, len(table) + 1))

            type_map = check_box['type_map']
            type_map['Rank'] = 'number'

            checkbox_group = gr.CheckboxGroup(
                choices=check_box['all'],
                value=check_box['required'],
                label='Evaluation Dimension',
                interactive=True,
            )

            headers = ['Rank'] + check_box['essential'] + checkbox_group.value
            with gr.Row():
                model_name = gr.Textbox(
                    value='Input the Model Name (fuzzy, case insensitive)',
                    label='Model Name',
                    interactive=True,
                    visible=True)
                model_size = gr.CheckboxGroup(
                    choices=MODEL_SIZE,
                    value=MODEL_SIZE,
                    label='Model Size',
                    interactive=True
                )
                model_type = gr.CheckboxGroup(
                    choices=MODEL_TYPE,
                    value=MODEL_TYPE,
                    label='Model Type',
                    interactive=True
                )
            data_component = gr.components.DataFrame(
                value=table[headers],
                type='pandas',
                datatype=[type_map[x] for x in headers],
                interactive=False,
                wrap=True,
                visible=True)

            def filter_df(fields, model_name, model_size, model_type):
                filter_list = ['Avg Score', 'Avg Rank', 'OpenSource']
                headers = ['Rank'] + check_box['essential'] + fields

                new_fields = [field for field in fields if field not in filter_list]
                df = generate_table(results, new_fields)

                df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
                df = df[df['flag']]
                df.pop('flag')
                if len(df):
                    df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
                    df = df[df['flag']]
                    df.pop('flag')
                df['Rank'] = list(range(1, len(df) + 1))
                default_val = 'Input the Model Name (fuzzy, case insensitive)'
                if model_name != default_val:
                    print(model_name)
                    model_name = model_name.lower()
                    method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Method']]
                    flag = [model_name in name for name in method_names]
                    df['TEMP_FLAG'] = flag
                    df = df[df['TEMP_FLAG'] == True]
                    df.pop('TEMP_FLAG')

                comp = gr.components.DataFrame(
                    value=df[headers],
                    type='pandas',
                    datatype=[type_map[x] for x in headers],
                    interactive=False,
                    wrap=True,
                    visible=True)
                return comp

            for cbox in [checkbox_group, model_size, model_type]:
                cbox.change(fn=filter_df, inputs=[checkbox_group, model_name, model_size, model_type], outputs=data_component)
            model_name.submit(fn=filter_df, inputs=[checkbox_group, model_name, model_size, model_type], outputs=data_component)

        with gr.TabItem('🔍 About', elem_id='about', id=1):
            gr.Markdown(urlopen(VLMEVALKIT_README).read().decode())

        for i, dataset in enumerate(DATASETS):
            with gr.TabItem(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
                if dataset in LEADERBOARD_MD:
                    gr.Markdown(LEADERBOARD_MD[dataset])

                s = structs[i]
                s.table, s.check_box = BUILD_L2_DF(results, dataset)
                s.type_map = s.check_box['type_map']
                s.type_map['Rank'] = 'number'

                s.checkbox_group = gr.CheckboxGroup(
                    choices=s.check_box['all'],
                    value=s.check_box['required'],
                    label=f'{dataset} CheckBoxes',
                    interactive=True,
                )
                s.headers = ['Rank'] + s.check_box['essential'] + s.checkbox_group.value
                s.table['Rank'] = list(range(1, len(s.table) + 1))

                with gr.Row():
                    s.model_name = gr.Textbox(
                        value='Input the Model Name (fuzzy, case insensitive)',
                        label='Model Name',
                        interactive=True,
                        visible=True)
                    s.model_size = gr.CheckboxGroup(
                        choices=MODEL_SIZE,
                        value=MODEL_SIZE,
                        label='Model Size',
                        interactive=True
                    )
                    s.model_type = gr.CheckboxGroup(
                        choices=MODEL_TYPE,
                        value=MODEL_TYPE,
                        label='Model Type',
                        interactive=True
                    )
                s.data_component = gr.components.DataFrame(
                    value=s.table[s.headers],
                    type='pandas',
                    datatype=[s.type_map[x] for x in s.headers],
                    interactive=False,
                    wrap=True,
                    visible=True)
                s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)

                def filter_df_l2(dataset_name, fields, model_name, model_size, model_type):
                    s = structs[DATASETS.index(dataset_name)]
                    headers = ['Rank'] + s.check_box['essential'] + fields
                    df = cp.deepcopy(s.table)
                    df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
                    df = df[df['flag']]
                    df.pop('flag')
                    if len(df):
                        df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
                        df = df[df['flag']]
                        df.pop('flag')
                    df['Rank'] = list(range(1, len(df) + 1))
                    default_val = 'Input the Model Name (fuzzy, case insensitive)'
                    if model_name != default_val:
                        print(model_name)
                        model_name = model_name.lower()
                        method_names = [x.split('</a>')[0].split('>')[-1].lower() for x in df['Method']]
                        flag = [model_name in name for name in method_names]
                        df['TEMP_FLAG'] = flag
                        df = df[df['TEMP_FLAG'] == True]
                        df.pop('TEMP_FLAG')

                    comp = gr.components.DataFrame(
                        value=df[headers],
                        type='pandas',
                        datatype=[s.type_map[x] for x in headers],
                        interactive=False,
                        wrap=True,
                        visible=True)
                    return comp

                for cbox in [s.checkbox_group, s.model_size, s.model_type]:
                    cbox.change(
                        fn=filter_df_l2,
                        inputs=[s.dataset, s.checkbox_group, s.model_name, s.model_size, s.model_type],
                        outputs=s.data_component)
                s.model_name.submit(
                    fn=filter_df_l2,
                    inputs=[s.dataset, s.checkbox_group, s.model_name, s.model_size, s.model_type],
                    outputs=s.data_component)

    with gr.Row():
        with gr.Accordion('Citation', open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id='citation-button')

if __name__ == '__main__':
    demo.launch(server_name='0.0.0.0')
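One detail worth noting in `filter_df` / `filter_df_l2` above: the fuzzy model-name search operates on the raw HTML of the `Method` column, so the display name is first stripped out of the anchor tag before the case-insensitive substring match. A small standalone sketch of that extraction, with made-up anchor cells:

```python
# Sketch of the fuzzy model-name match used in filter_df / filter_df_l2.
# The 'Method' cells are HTML anchors (built in generate_table), so the visible
# name is recovered by splitting off the tag markup before matching.
cells = ['<a href="https://example.com/a">Phi-4-Vision</a>',
         '<a href="https://example.com/b">Qwen2.5-VL-7B-Instruct</a>']
query = 'qwen'

names = [c.split('</a>')[0].split('>')[-1].lower() for c in cells]
print(names)                        # ['phi-4-vision', 'qwen2.5-vl-7b-instruct']
print([query in n for n in names])  # [False, True]
```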
gen_table.py ADDED
@@ -0,0 +1,235 @@
import copy as cp
import json
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import DEFAULT_BENCH, META_FIELDS, URL


def listinstr(lst, s):
    assert isinstance(lst, list)
    for item in lst:
        if item in s:
            return True
    return False


def load_results():
    data = json.loads(urlopen(URL).read())
    return data


def nth_large(val, vals):
    return sum([1 for v in vals if v > val]) + 1


def format_timestamp(timestamp):
    date = timestamp[:-6]
    time = timestamp[-6:]
    date = date[:-4] + '.' + date[-4:-2] + '.' + date[-2:]
    time = time[:-4] + ':' + time[-4:-2] + ':' + time[-2:]
    return date + ' ' + time


def model_size_flag(sz, FIELDS):
    if pd.isna(sz) and 'Unknown' in FIELDS:
        return True
    if pd.isna(sz):
        return False
    if '<4B' in FIELDS and sz < 4:
        return True
    if '4B-10B' in FIELDS and sz >= 4 and sz < 10:
        return True
    if '10B-20B' in FIELDS and sz >= 10 and sz < 20:
        return True
    if '20B-40B' in FIELDS and sz >= 20 and sz < 40:
        return True
    if '>40B' in FIELDS and sz >= 40:
        return True
    return False


def model_type_flag(line, FIELDS):
    if 'OpenSource' in FIELDS and line['OpenSource'] == 'Yes':
        return True
    if 'API' in FIELDS and line['OpenSource'] == 'No':
        return True
    return False


def BUILD_L1_DF(results, fields):
    check_box = {}
    check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'Eval Date']
    # revise here to set the default datasets
    check_box['required'] = ['Avg Score', 'Avg Rank'] + DEFAULT_BENCH
    check_box['avg'] = ['Avg Score', 'Avg Rank']
    check_box['all'] = check_box['avg'] + fields
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    type_map['Language Model'] = type_map['Vision Model'] = 'html'
    type_map['OpenSource'] = 'str'
    type_map['Eval Date'] = 'str'
    check_box['type_map'] = type_map

    df = generate_table(results, fields)
    return df, check_box


def BUILD_L2_DF(results, dataset):
    res = defaultdict(list)
    sub = [v for v in results.values() if dataset in v]
    assert len(sub)
    fields = list(sub[0][dataset].keys())
    if dataset == 'Creation_MMBench':
        reward_field = [f"{x}:reward" for x in fields]
        vfs_field = [f"{x}:vfs" for x in fields]
        fields = reward_field + vfs_field
        raw_fields = list(sub[0][dataset].keys())

    print(f'fields: {fields}')
    print(res)

    non_overall_fields = [x for x in fields if 'Overall' not in x]
    overall_fields = [x for x in fields if 'Overall' in x]
    # if dataset == 'MME':
    #     non_overall_fields = [x for x in non_overall_fields if not listinstr(['Perception', 'Cognition'], x)]
    #     overall_fields = overall_fields + ['Perception', 'Cognition']
    # if dataset == 'OCRBench':
    #     non_overall_fields = [x for x in non_overall_fields if not listinstr(['Final Score'], x)]
    #     overall_fields = ['Final Score']

    for m in results:
        item = results[m]
        if dataset not in item:
            continue
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Param (B)':
                param = meta['Parameters']
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name, url = meta['Method']
                res[k].append(f'<a href="{url}">{name}</a>')
            elif k == 'Eval Date':
                eval_date = meta['Time'].split('/')
                assert len(eval_date) == 3
                eval_date = [x if len(x) > 1 else '0' + x for x in eval_date]
                eval_date = '/'.join(eval_date)
                res[k].append(eval_date)
            else:
                res[k].append(meta[k])
        fields = [x for x in fields]

        if dataset == 'Creation_MMBench':
            for d in non_overall_fields:
                original_d, data_type = d.split(':')[0], d.split(':')[-1]
                res[d].append(item[dataset][original_d][data_type])
            for d in overall_fields:
                original_d, data_type = d.split(':')[0], d.split(':')[-1]
                res[d].append(item[dataset][original_d][data_type])
                # res[d].append(f"{item[dataset][d]['reward']}/{item[dataset][d]['vfs']}")
        elif dataset in ['MMAlignBench', 'WildVision']:
            for d in non_overall_fields:
                res[d].append(item[dataset][d]['reward'])
            for d in overall_fields:
                res[d].append(item[dataset][d]['reward'])
        else:
            for d in non_overall_fields:
                res[d].append(item[dataset][d])
            for d in overall_fields:
                res[d].append(item[dataset][d])

    df = pd.DataFrame(res)
    all_fields = overall_fields + non_overall_fields
    # Use the overall fields as required fields; otherwise the first 5 non-overall fields
    required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]

    # if dataset == 'OCRBench':
    #     df = df.sort_values('Final Score')
    # elif dataset == 'COCO_VAL':
    #     df = df.sort_values('CIDEr')
    # elif dataset == 'VCR':
    #     df = df.sort_values('Overall-Jaccard')
    # else:
    #     df = df.sort_values('Overall')
    df = df.iloc[::-1]

    check_box = {}
    check_box['essential'] = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'Eval Date']
    check_box['required'] = required_fields
    check_box['all'] = all_fields
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    type_map['Language Model'] = type_map['Vision Model'] = 'html'
    type_map['OpenSource'] = 'str'
    type_map['Eval Date'] = 'str'
    check_box['type_map'] = type_map
    return df, check_box


def generate_table(results, fields):

    # def get_mmbench_v11(item):
    #     assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
    #     val = (item['MMBench_TEST_CN_V11']['Overall'] + item['MMBench_TEST_EN_V11']['Overall']) / 2
    #     val = float(f'{val:.1f}')
    #     return val

    res = defaultdict(list)
    ## item is each model's meta and results
    for i, m in enumerate(results):
        item = results[m]
        meta = item['META']
        for k in META_FIELDS:
            if k == 'Param (B)':
                param = meta['Parameters']
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name, url = meta['Method']
                res[k].append(f'<a href="{url}">{name}</a>')
                res['name'].append(name)
            elif k == 'Eval Date':
                eval_date = meta['Time'].split('/')
                assert len(eval_date) == 3
                eval_date = [x if len(x) > 1 else '0' + x for x in eval_date]
                eval_date = '/'.join(eval_date)
                res[k].append(eval_date)
            else:
                res[k].append(meta[k])
        scores, ranks = [], []
        # fields is the list of dataset names
        for d in fields:
            key_name = 'Overall'
            if d in item:
                if d in ['Creation_MMBench', 'MMAlignBench', 'WildVision']:
                    if d == 'Creation_MMBench':
                        res[d].append(f"{item[d][key_name]['reward']}/{item[d][key_name]['vfs']}")  # needs improvement?
                    else:
                        res[d].append(item[d][key_name]['reward'])
                    scores.append((item[d][key_name]['reward'] + 100) / 2)
                    ranks.append(nth_large(item[d][key_name]['reward'], [x[d][key_name]['reward'] for x in results.values() if d in x]))
                else:
                    res[d].append(item[d][key_name])
                    scores.append(item[d][key_name])
                    ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values() if d in x]))
            else:
                res[d].append(None)
                scores.append(None)
                ranks.append(None)

        res['Avg Score'].append(round(np.mean(scores), 1) if None not in scores else None)
        res['Avg Rank'].append(round(np.mean(ranks), 2) if None not in ranks else None)

    df = pd.DataFrame(res)
    valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
    valid = valid.sort_values('Avg Score')
    valid = valid.iloc[::-1]
    if len(fields):
        missing = missing.sort_values(fields[0])
        missing = missing.iloc[::-1]
    df = pd.concat([valid, missing])
    return df
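A few quick, illustrative checks of the helpers above; the timestamp value is assumed to be a `YYYYMMDDhhmmss` string, which is the layout `format_timestamp` slices:

```python
# Illustrative checks for the helpers defined above (input values are made up).
from gen_table import format_timestamp, nth_large, model_size_flag

print(format_timestamp('20250408123456'))   # '2025.04.08 12:34:56'
print(nth_large(85.0, [90.0, 85.0, 70.0]))  # 2 -> rank of 85.0 among the given scores
print(model_size_flag(7.0, ['4B-10B']))     # True -> a 7B model passes the 4B-10B filter
```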
meta_data.py ADDED
@@ -0,0 +1,101 @@
# CONSTANTS-URL
URL = "http://opencompass.openxlab.space/assets/OpenVLM_Subjective_Leaderboard.json"
VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'
# CONSTANTS-CITATION
CITATION_BUTTON_TEXT = r"""@inproceedings{duan2024vlmevalkit,
  title={Vlmevalkit: An open-source toolkit for evaluating large multi-modality models},
  author={Duan, Haodong and Yang, Junming and Qiao, Yuxuan and Fang, Xinyu and Chen, Lin and Liu, Yuan and Dong, Xiaoyi and Zang, Yuhang and Zhang, Pan and Wang, Jiaqi and others},
  booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
  pages={11198--11201},
  year={2024}
}"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# CONSTANTS-TEXT
LEADERBORAD_INTRODUCTION = """# OpenVLM Subjective Leaderboard
### Welcome to the OpenVLM Subjective Leaderboard! On this leaderboard, we share the subjective evaluation results of VLMs obtained with the open-source framework:
### [*VLMEvalKit*: A Toolkit for Evaluating Large Vision-Language Models](https://github.com/open-compass/VLMEvalKit) 🏆
### Currently, the OpenVLM Subjective Leaderboard covers {} different VLMs (including GPT-4o, Gemini, Qwen2.5-VL, InternVL2.5, etc.) and {} different multi-modal benchmarks.

This leaderboard was last updated: {}.

The OpenVLM Subjective Leaderboard only includes open-source VLMs or API models that are publicly available. To add your own model to the leaderboard, please create a PR in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to support your VLM; we will then help with the evaluation and update the leaderboard. For any questions or concerns, please feel free to contact us at [opencompass, fangxinyu, dingshengyuan]@pjlab.org.cn.
"""
# CONSTANTS-FIELDS
META_FIELDS = [
    'Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Eval Date'
]
MAIN_FIELDS = [
    'Creation_MMBench', 'MIA-Bench', 'MM-IFEval',
    'MMAlignBench', 'MMVet', 'WildVision'
]
DEFAULT_BENCH = [
    'Creation_MMBench', 'MIA-Bench', 'MM-IFEval',
    'MMAlignBench', 'MMVet', 'WildVision'
]
MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
MODEL_TYPE = ['API', 'OpenSource']

# The markdown text shown above each benchmark tab
LEADERBOARD_MD = {}

LEADERBOARD_MD['MAIN'] = f"""
## Main Evaluation Results

- Metrics:
    - Avg Score: the average score over all selected VLM benchmarks (normalized to 0-100, higher is better).
    - Avg Rank: the average rank over all selected VLM benchmarks (lower is better).
    - Avg Score & Avg Rank are calculated over the selected benchmarks. **When results for some selected benchmarks are missing, Avg Score / Avg Rank will be None!!!**
- Metric reported for each dataset:
    - Creation-MMBench: Reward Score / Visual Factuality Score
    - MMAlignBench, WildVision: Reward Score
    - MIA-Bench, MM-IFEval, MMVet: Overall Score
- By default, we present the overall evaluation results based on {len(DEFAULT_BENCH)} VLM benchmarks, sorted in descending order of Avg Score.
- The following datasets are included in the main results: {', '.join(DEFAULT_BENCH)}.
- Detailed evaluation results for each dataset (whether included in the main table or not) are provided in the subsequent tabs.
"""

LEADERBOARD_MD['Creation_MMBench'] = """
## Creation MMBench Evaluation Results

- Creation-MMBench is a multimodal benchmark specifically designed to evaluate the creative capabilities of MLLMs. It consists of **765 test cases**, covering **51 fine-grained tasks** across **4 categories**: *Literary Writing*, *Creative Multimodal Understanding*, *Professional Functionality Writing*, and *Common Functionality Writing*. It contains a total of **1001 images** spanning more than 25 categories, and some questions incorporate up to 9 images.
- Creation-MMBench includes carefully crafted **instance-specific criteria** for each test case, enabling assessment of both general response quality and visual-factual alignment in model-generated content. It employs a pair-wise comparison approach: the model's output is compared against the reference answer (taking the ground-truth answer, input prompt, and visual content into account) to obtain the assessment result. Creation-MMBench adopts a **dual evaluation** strategy with **GPT-4o** as the judge model.
- VFS stands for Visual Factuality Score. The rankings in this leaderboard are arranged in descending order of each model's overall reward, with **GPT-4o-1120** providing the **Reference Answer** for comparison and thus serving as the baseline model.
- More details are available on the [**Creation-MMBench Official WebPage**](https://open-compass.github.io/Creation-MMBench/).
"""

LEADERBOARD_MD['MM-IFEval'] = """
## MM-IFEval Evaluation Results

- MM-IFEval is a comprehensive multimodal instruction-following benchmark designed to rigorously assess the capabilities of Multimodal Large Language Models (MLLMs). It includes 400 high-quality questions across two levels: 300 compose-level tasks that emphasize output format and content constraints, and 100 perception-level tasks that require precise visual understanding.
- To ensure accurate evaluation, MM-IFEval employs a hybrid strategy combining rule-based verification with LLM-based judge models. See https://arxiv.org/abs/2504.07957 for more details.
- Currently, we use GPT-4o (gpt-4o-2024-05-13) whenever an LLM judge model is needed.
"""

LEADERBOARD_MD['MMAlignBench'] = """
## MMAlignBench Evaluation Results

- MM-AlignBench evaluates MLLMs' alignment with human preferences. It includes 252 high-quality, human-annotated samples with diverse image types and open-ended questions. Modeled after Arena-style benchmarks, it uses GPT-4o as the judge model and Claude-Sonnet-3 as the reference model.
- See https://github.com/PhoenixZ810/OmniAlign-V for more details.
"""

LEADERBOARD_MD['MIA-Bench'] = """
## MIA-Bench Evaluation Results

- MIA-Bench contains 400 carefully crafted image-prompt pairs that stress-test an MLLM's ability to **follow layered, exacting instructions** in its responses. ([MIA-Bench: Towards Better Instruction Following Evaluation of Multimodal LLMs](https://arxiv.org/abs/2407.01509), [Towards Better Instruction Following Evaluation of Multimodal LLMs](https://machinelearning.apple.com/research/mia-bench))
- The leaderboard reports the **overall average score**. The judge model is **GPT-4o**.
"""

LEADERBOARD_MD['MMVet'] = """
## MMVet Evaluation Results

- In the MMVet evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We only perform the evaluation once, given the limited variance across multiple evaluation passes reported originally.
- No specific prompt template is adopted for **ALL VLMs**.
"""

LEADERBOARD_MD['WildVision'] = """
## WildVision Evaluation Results

- WildVision-Bench offers **500 real-world multimodal prompts** curated from the WildVision-Arena crowdsourcing platform to benchmark models **by human preference** in natural conversations.
- The leaderboard reports the **overall reward score**.
- The judge model is **GPT-4o**. The reference model is **Claude-Sonnet-3**.
"""
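To make the `Avg Score` definition in `LEADERBOARD_MD['MAIN']` concrete, here is a small worked example mirroring the normalization in `gen_table.generate_table` (all benchmark values below are invented): reward-style benchmarks are mapped from [-100, 100] to 0-100 via `(reward + 100) / 2`, while overall-score benchmarks enter the average unchanged.

```python
# Worked example of the Avg Score normalization used in gen_table.generate_table.
# Reward-style benchmarks (Creation_MMBench, MMAlignBench, WildVision) report a
# reward in [-100, 100]; score-style benchmarks (MIA-Bench, MM-IFEval, MMVet)
# already report a 0-100 overall score.
import numpy as np

rewards = {'MMAlignBench': 20.5, 'WildVision': -10.0}   # invented rewards
overall = {'MIA-Bench': 82.3, 'MMVet': 66.0}            # invented overall scores

scores = [(r + 100) / 2 for r in rewards.values()] + list(overall.values())
print(scores)                     # [60.25, 45.0, 82.3, 66.0]
print(round(np.mean(scores), 1))  # 63.4 -> the Avg Score shown in the main table
```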
requirements.txt ADDED
@@ -0,0 +1,3 @@
gradio==4.15.0
numpy>=1.23.4
pandas>=1.5.3