HugSib committed on
Commit
4c1e130
1 Parent(s): fbaa735

Update Leaderboard (#1)


- feat: update leaderboard with .json from HF (228207a2bbcce7854aefb2a76eb40db06e71f267)
- fix: minor before refacto (b931cb1ee94765914a92393c2c734af6bdd8574c)
- refactor : break app.py in different files (187990b32e9c87b0aa2ac6f170e17bbdb02123e1)
- feat : metrics dropdown added to gradio (edb334d3745b23eae9493973ae7f8965bb87b3b0)

Files changed (7)
  1. .gitignore +3 -0
  2. app.py +75 -171
  3. app/__init__.py +1 -0
  4. app/utils.py +31 -0
  5. data/__init__.py +1 -0
  6. data/dataset_handler.py +64 -0
  7. data/model_handler.py +94 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ .venv
+ *.json
+ *.pyc
app.py CHANGED
@@ -1,182 +1,86 @@
- import json
- import os
-
  import gradio as gr
- import pandas as pd
- from huggingface_hub import HfApi, hf_hub_download
- from huggingface_hub.repocard import metadata_load
-
-
- def make_clickable_model(model_name, link=None):
-     if link is None:
-         link = "https://huggingface.co/" + model_name
-     # Remove user from model name
-     # return (
-     #     f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
-     # )
-     return f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
-
-
- def add_rank(df):
-     cols_to_rank = [
-         col
-         for col in df.columns
-         if col
-         not in [
-             "Model",
-             "Model Size (Million Parameters)",
-             "Memory Usage (GB, fp32)",
-             "Embedding Dimensions",
-             "Max Tokens",
-         ]
-     ]
-     if len(cols_to_rank) == 1:
-         df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
-     else:
-         df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
-         df.sort_values("Average", ascending=False, inplace=True)
-     df.insert(0, "Rank", list(range(1, len(df) + 1)))
-     df = df.round(2)
-     # Fill NaN after averaging
-     df.fillna("", inplace=True)
-     return df
-
-
- def get_vidore_data():
-     api = HfApi()
-
-     # local cache path
-     model_infos_path = "model_infos.json"
-     MODEL_INFOS = {}
-     if os.path.exists(model_infos_path):
-         with open(model_infos_path) as f:
-             MODEL_INFOS = json.load(f)
-
-     models = api.list_models(filter="vidore")
-
-     for model in models:
-         if model.modelId not in MODEL_INFOS:
-             readme_path = hf_hub_download(model.modelId, filename="README.md")
-             meta = metadata_load(readme_path)
-             try:
-                 result_path = hf_hub_download(model.modelId, filename="results.json")
-
-                 with open(result_path) as f:
-                     results = json.load(f)
-                     # keep only ndcg_at_5
-                     for dataset in results:
-                         results[dataset] = {key: value for key, value in results[dataset].items() if "ndcg_at_5" in key}
-
-                 MODEL_INFOS[model.modelId] = {"metadata": meta, "results": results}
-             except:
-                 continue
-
-     model_res = {}
-     df = None
-     if len(MODEL_INFOS) > 0:
-         for model in MODEL_INFOS.keys():
-             res = MODEL_INFOS[model]["results"]
-             dataset_res = {}
-             for dataset in res.keys():
-                 if "validation_set" == dataset:
-                     continue
-                 dataset_res[dataset] = res[dataset]["ndcg_at_5"]
-             model_res[model] = dataset_res
-
-         df = pd.DataFrame(model_res).T
-
-         # add average
-         # df["average"] = df.mean(axis=1)
-         # df = df.sort_values(by="average", ascending=False)
-         # # round to 2 decimals
-         # df = df.round(2)
-     return df
-
-
- def add_rank_and_format(df):
-     df = df.reset_index()
-     df = df.rename(columns={"index": "Model"})
-     df = add_rank(df)
-     df["Model"] = df["Model"].apply(make_clickable_model)
-     return df
-
-
- # 1. Force headers to wrap
- # 2. Force model column (maximum) width
- # 3. Prevent model column from overflowing, scroll instead
- # 4. Prevent checkbox groups from taking up too much space
-
- css = """
- table > thead {
-     white-space: normal
- }
-
- table {
-     --cell-width-1: 250px
- }
-
- table > tbody > tr > td:nth-child(2) > div {
-     overflow-x: auto
- }
-
- .filter-checkbox-group {
-     max-width: max-content;
- }
- """
-
-
- def get_refresh_function():
-     def _refresh():
-         data_task_category = get_vidore_data()
-         return add_rank_and_format(data_task_category)
-
-     return _refresh
-
-
- def get_refresh_overall_function():
-     return lambda: get_refresh_function()
-

- data = get_vidore_data()
- data = add_rank_and_format(data)

- NUM_DATASETS = len(data.columns) - 3
- NUM_SCORES = len(data) * NUM_DATASETS
- NUM_MODELS = len(data)
-
- with gr.Blocks(css=css) as block:
-     gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 📚🔍")
-     gr.Markdown("## From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
-
-     gr.Markdown(
-         f"""
-     Visual Document Retrieval Benchmark leaderboard. To submit, refer to the <a href="https://github.com/tonywu71/vidore-benchmark/" target="_blank" style="text-decoration: underline">ViDoRe GitHub repository</a>. Refer to the [ColPali paper](https://arxiv.org/abs/XXXX.XXXXX) for details on metrics, tasks and models.
-     """
-     )

-     with gr.Row():
-         datatype = ["number", "markdown"] + ["number"] * (NUM_DATASETS + 1)
-         dataframe = gr.Dataframe(data, datatype=datatype, type="pandas", height=500)

-     with gr.Row():
-         refresh_button = gr.Button("Refresh")
-         refresh_button.click(get_refresh_function(), inputs=None, outputs=dataframe, concurrency_limit=20)
-
-
-     gr.Markdown(
-         f"""
-     - **Total Datasets**: {NUM_DATASETS}
-     - **Total Scores**: {NUM_SCORES}
-     - **Total Models**: {NUM_MODELS}
-     """
-         + r"""
-     Please consider citing:

-     ```bibtex
-     INSERT LATER
-     ```
      """
-     )

- if __name__ == "__main__":
      block.queue(max_size=10).launch(debug=True)

+ from data.model_handler import ModelHandler
+ from app.utils import add_rank_and_format, get_refresh_function
  import gradio as gr

+ METRICS = ["ndcg_at_5", "recall_at_1", "recall_at_5", "mrr_at_5"]

+ def main():
+     model_handler = ModelHandler()
+     initial_metric = "ndcg_at_5"
+
+     data = model_handler.get_vidore_data(initial_metric)
+     data = add_rank_and_format(data)

+     NUM_DATASETS = len(data.columns) - 3
+     NUM_SCORES = len(data) * NUM_DATASETS
+     NUM_MODELS = len(data)

+     css = """
+     table > thead {
+         white-space: normal
+     }

+     table {
+         --cell-width-1: 250px
+     }
+
+     table > tbody > tr > td:nth-child(2) > div {
+         overflow-x: auto
+     }

+     .filter-checkbox-group {
+         max-width: max-content;
+     }
      """

+     with gr.Blocks(css=css) as block:
+         gr.Markdown("# ViDoRe: The Visual Document Retrieval Benchmark 📚🔍")
+         gr.Markdown("## From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀")
+
+         gr.Markdown(
+             """
+         Visual Document Retrieval Benchmark leaderboard. To submit, refer to the <a href="https://github.com/tonywu71/vidore-benchmark/" target="_blank" style="text-decoration: underline">ViDoRe GitHub repository</a>. Refer to the [ColPali paper](https://arxiv.org/abs/XXXX.XXXXX) for details on metrics, tasks and models.
+         """
+         )
+         #all_columns = list(data.columns)
+         #default_columns = all_columns
+
+         with gr.Row():
+             metric_dropdown = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
+             #column_checkboxes = gr.CheckboxGroup(choices=all_columns, value=default_columns, label="Select Columns to Display")
+
+         with gr.Row():
+             datatype = ["number", "markdown"] + ["number"] * (NUM_DATASETS + 1)
+             dataframe = gr.Dataframe(data, datatype=datatype, type="pandas")
+
+         with gr.Row():
+             refresh_button = gr.Button("Refresh")
+             refresh_button.click(get_refresh_function(), inputs=[metric_dropdown], outputs=dataframe, concurrency_limit=20)
+
+
+         # Automatically refresh the dataframe when the dropdown value changes
+         metric_dropdown.change(get_refresh_function(), inputs=[metric_dropdown], outputs=dataframe)
+         #column_checkboxes.change(get_refresh_function(), inputs=[metric_dropdown, column_checkboxes], outputs=dataframe)
+
+
+         gr.Markdown(
+             f"""
+         - **Total Datasets**: {NUM_DATASETS}
+         - **Total Scores**: {NUM_SCORES}
+         - **Total Models**: {NUM_MODELS}
+         """
+             + r"""
+         Please consider citing:
+
+         ```bibtex
+         INSERT LATER
+         ```
+         """
+         )

      block.queue(max_size=10).launch(debug=True)
+
+
+ if __name__ == "__main__":
+     main()
+
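The main behavioural change in app.py is that the leaderboard is no longer hard-wired to `ndcg_at_5`: the metric selected in a Gradio dropdown is passed into the refresh callback, which rebuilds the table. Below is a minimal, self-contained sketch of that event wiring; `refresh` and its dummy one-row table are stand-ins for the callback returned by `get_refresh_function()` and the real leaderboard data.

```python
import gradio as gr
import pandas as pd

METRICS = ["ndcg_at_5", "recall_at_1", "recall_at_5", "mrr_at_5"]


def refresh(metric: str) -> pd.DataFrame:
    # Stand-in for the callback returned by get_refresh_function():
    # it receives the dropdown's current value and returns a fresh table.
    return pd.DataFrame({"Model": ["dummy-model"], metric: [0.42]})


with gr.Blocks() as demo:
    metric_dropdown = gr.Dropdown(choices=METRICS, value="ndcg_at_5", label="Select Metric")
    table = gr.Dataframe(refresh("ndcg_at_5"), type="pandas")

    # Both the button click and the dropdown change feed the selected
    # metric into the callback and overwrite the dataframe component.
    refresh_button = gr.Button("Refresh")
    refresh_button.click(refresh, inputs=[metric_dropdown], outputs=table)
    metric_dropdown.change(refresh, inputs=[metric_dropdown], outputs=table)

demo.launch()
```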
app/__init__.py ADDED
@@ -0,0 +1 @@
+
app/utils.py ADDED
@@ -0,0 +1,31 @@
+ from data.model_handler import ModelHandler
+
+ def make_clickable_model(model_name, link=None):
+     if link is None:
+         desanitized_model_name = model_name.replace("_", "/")
+
+         if '/captioning' in desanitized_model_name:
+             desanitized_model_name = desanitized_model_name.replace('/captioning', '')
+         if '/ocr' in desanitized_model_name:
+             desanitized_model_name = desanitized_model_name.replace('/ocr', '')
+
+         link = "https://huggingface.co/" + desanitized_model_name
+
+     return f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
+
+
+ def add_rank_and_format(df):
+     df = df.reset_index()
+     df = df.rename(columns={"index": "Model"})
+     df = ModelHandler.add_rank(df)
+     df["Model"] = df["Model"].apply(make_clickable_model)
+     return df
+
+ def get_refresh_function():
+     def _refresh(metric):
+         model_handler = ModelHandler()
+         data_task_category = model_handler.get_vidore_data(metric)
+         df = add_rank_and_format(data_task_category)
+         return df
+
+     return _refresh
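For reference, a short usage sketch of the helpers in app/utils.py, assuming it is run from the repository root (so `app` and `data` are importable) and that network access to the Hugging Face Hub is available; the model name passed to `make_clickable_model` is a hypothetical sanitized id in which `_` replaces `/`.

```python
from app.utils import make_clickable_model, get_refresh_function

# Hypothetical sanitized name: "org_model_ocr" is linked back to
# https://huggingface.co/org/model (the "/ocr" suffix is stripped).
print(make_clickable_model("org_model_ocr"))

# Rebuild the leaderboard for another metric (downloads results from the Hub).
refresh = get_refresh_function()
df = refresh("recall_at_5")
print(df.head())
```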
data/__init__.py ADDED
@@ -0,0 +1 @@
+
data/dataset_handler.py ADDED
@@ -0,0 +1,64 @@
+ from typing import Dict
+ from huggingface_hub import get_collection
+
+
+ def get_datasets_nickname() -> Dict:
+     datasets_nickname = {}
+
+     collection = get_collection("vidore/vidore-benchmark-667173f98e70a1c0fa4db00d")
+     collection_items = collection.items
+
+     for item in collection_items:
+         dataset_name = item.item_id
+
+         if 'arxivqa' in dataset_name:
+             datasets_nickname[dataset_name] = 'ArxivQA'
+             datasets_nickname[dataset_name + '_ocr_chunk'] = 'ArxivQA'
+             datasets_nickname[dataset_name + '_captioning'] = 'ArxivQA'
+
+         elif 'docvqa' in dataset_name:
+             datasets_nickname[dataset_name] = 'DocVQA'
+             datasets_nickname[dataset_name + '_ocr_chunk'] = 'DocVQA'
+             datasets_nickname[dataset_name + '_captioning'] = 'DocVQA'
+
+         elif 'infovqa' in dataset_name:
+             datasets_nickname[dataset_name] = 'InfoVQA'
+             datasets_nickname[dataset_name + '_ocr_chunk'] = 'InfoVQA'
+             datasets_nickname[dataset_name + '_captioning'] = 'InfoVQA'
+
+         elif 'tabfquad' in dataset_name:
+             datasets_nickname[dataset_name] = 'TabFQuad'
+             datasets_nickname[dataset_name + '_ocr_chunk'] = 'TabFQuad'
+             datasets_nickname[dataset_name + '_captioning'] = 'TabFQuad'
+
+         elif 'tatdqa' in dataset_name:
+             datasets_nickname[dataset_name] = 'TATDQA'
+             datasets_nickname[dataset_name + '_ocr_chunk'] = 'TATDQA'
+             datasets_nickname[dataset_name + '_captioning'] = 'TATDQA'
+
+         elif 'shiftproject' in dataset_name:
+             datasets_nickname[dataset_name] = 'ShiftProject'
+             datasets_nickname[dataset_name + '_ocr_chunk'] = 'ShiftProject'
+             datasets_nickname[dataset_name + '_captioning'] = 'ShiftProject'
+
+         elif 'artificial_intelligence' in dataset_name:
+             datasets_nickname[dataset_name] = 'Artificial Intelligence'
+             datasets_nickname[dataset_name + '_ocr_chunk'] = 'Artificial Intelligence'
+             datasets_nickname[dataset_name + '_captioning'] = 'Artificial Intelligence'
+
+         elif 'energy' in dataset_name:
+             datasets_nickname[dataset_name] = 'Energy'
+             datasets_nickname[dataset_name + '_ocr_chunk'] = 'Energy'
+             datasets_nickname[dataset_name + '_captioning'] = 'Energy'
+
+         elif 'government_reports' in dataset_name:
+             datasets_nickname[dataset_name] = 'Government Reports'
+             datasets_nickname[dataset_name + '_ocr_chunk'] = 'Government Reports'
+             datasets_nickname[dataset_name + '_captioning'] = 'Government Reports'
+
+         elif 'healthcare' in dataset_name:
+             datasets_nickname[dataset_name] = 'Healthcare'
+             datasets_nickname[dataset_name + '_ocr_chunk'] = 'Healthcare'
+             datasets_nickname[dataset_name + '_captioning'] = 'Healthcare'
+
+     return datasets_nickname
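`get_datasets_nickname` maps every dataset id in the ViDoRe Hub collection, plus its `_ocr_chunk` and `_captioning` variants, to the short display name used as a leaderboard column header. A minimal sketch of calling it, assuming execution from the repository root and network access to the Hub:

```python
from data.dataset_handler import get_datasets_nickname

# Resolves the public ViDoRe collection on the Hub (network access required).
nicknames = get_datasets_nickname()
for dataset_id, nickname in sorted(nicknames.items())[:5]:
    print(f"{dataset_id} -> {nickname}")
```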
data/model_handler.py ADDED
@@ -0,0 +1,94 @@
+ import json
+ import os
+ from typing import Dict
+ from huggingface_hub import HfApi, hf_hub_download, metadata_load
+ import pandas as pd
+ from .dataset_handler import get_datasets_nickname
+
+ class ModelHandler:
+     def __init__(self, model_infos_path="model_infos.json"):
+         self.api = HfApi()
+         self.model_infos_path = model_infos_path
+         self.model_infos = self._load_model_infos()
+
+     def _load_model_infos(self) -> Dict:
+         if os.path.exists(self.model_infos_path):
+             with open(self.model_infos_path) as f:
+                 return json.load(f)
+         return {}
+
+     def _save_model_infos(self):
+         with open(self.model_infos_path, "w") as f:
+             json.dump(self.model_infos, f)
+
+     def get_vidore_data(self, metric="ndcg_at_5"):
+         models = self.api.list_models(filter="vidore")
+         repositories = [model.modelId for model in models]  # type: ignore
+
+         datasets_nickname = get_datasets_nickname()
+         for repo_id in repositories:
+             files = [f for f in self.api.list_repo_files(repo_id) if f.endswith('_metrics.json')]
+             if len(files) == 0:
+                 continue
+             else:
+                 for file in files:
+                     model_name = file.split('_metrics.json')[0]
+
+                     if model_name not in self.model_infos:
+                         readme_path = hf_hub_download(repo_id, filename="README.md")
+                         meta = metadata_load(readme_path)
+                         try:
+                             result_path = hf_hub_download(repo_id, filename=file)
+
+                             with open(result_path) as f:
+                                 results = json.load(f)
+
+                             for dataset in results:
+                                 results[dataset] = {key: value for key, value in results[dataset].items()}
+
+                             self.model_infos[model_name] = {"meta": meta, "results": results}
+                         except Exception as e:
+                             print(f"Error loading {model_name} - {e}")
+                             continue
+
+         #self._save_model_infos()
+
+         model_res = {}
+         if len(self.model_infos) > 0:
+             for model in self.model_infos.keys():
+                 res = self.model_infos[model]["results"]
+                 dataset_res = {}
+                 for dataset in res.keys():
+                     if "validation_set" == dataset:
+                         continue
+                     dataset_res[datasets_nickname[dataset]] = res[dataset][metric]
+                 model_res[model] = dataset_res
+
+             df = pd.DataFrame(model_res).T
+             return df
+         return pd.DataFrame()
+
+     @staticmethod
+     def add_rank(df):
+         cols_to_rank = [
+             col
+             for col in df.columns
+             if col
+             not in [
+                 "Model",
+                 "Model Size (Million Parameters)",
+                 "Memory Usage (GB, fp32)",
+                 "Embedding Dimensions",
+                 "Max Tokens",
+             ]
+         ]
+         if len(cols_to_rank) == 1:
+             df.sort_values(cols_to_rank[0], ascending=False, inplace=True)
+         else:
+             df.insert(len(df.columns) - len(cols_to_rank), "Average", df[cols_to_rank].mean(axis=1, skipna=False))
+             df.sort_values("Average", ascending=False, inplace=True)
+         df.insert(0, "Rank", list(range(1, len(df) + 1)))
+         df = df.round(2)
+         # Fill NaN after averaging
+         df.fillna("", inplace=True)
+         return df
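Putting the pieces together, a minimal sketch of how the leaderboard table is built outside of Gradio, assuming execution from the repository root and network access to the Hugging Face Hub:

```python
from app.utils import add_rank_and_format
from data.model_handler import ModelHandler

handler = ModelHandler()                            # reuses model_infos.json if present
raw = handler.get_vidore_data(metric="ndcg_at_5")   # one row per model, one column per dataset nickname
leaderboard = add_rank_and_format(raw)              # adds Rank/Average and clickable model links
print(leaderboard.head())
```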