Spaces:
Running
Running
import gradio as gr | |
from app.utils import add_rank_and_format, filter_models, get_refresh_function, deprecated_get_refresh_function | |
from data.deprecated_model_handler import DeprecatedModelHandler | |
from data.model_handler import ModelHandler | |
# Metric identifiers offered in every "Select Metric" dropdown: each metric
# family at each cut-off, grouped by family (all nDCG entries first, then all
# recall entries).
METRICS = [
    f"{family}_at_{cutoff}"
    for family in ("ndcg", "recall")
    for cutoff in (1, 5, 10, 100)
]
# Leading (non-dataset) columns of the rendered leaderboards. Every column
# after these is treated as a per-dataset score column.
_FIXED_COLUMNS = ["Rank", "Model", "Model Size (Million Parameters)", "Average"]
_DEPRECATED_FIXED_COLUMNS = ["Rank", "Model", "Average"]

# Custom CSS handed to gr.Blocks.
# The "markdown size" line used to be written as `#markdown size`, which CSS
# parses as part of the selector (`#markdown size .markdown`) and therefore
# silently disabled the font-size rule; it is now a real CSS comment.
_CSS = """
table > thead {
    white-space: normal
}
table {
    --cell-width-1: 250px
}
table > tbody > tr > td:nth-child(2) > div {
    overflow-x: auto
}
.filter-checkbox-group {
    max-width: max-content;
}
/* markdown size */
.markdown {
    font-size: 1rem;
}
"""

# NOTE(review): the markdown texts below are reproduced line for line as found;
# confirm that paragraph breaks and list nesting render as intended.
_DEPRECATED_TAG = "<span style='color:red'>[Deprecated]</span>"

_DEPRECATION_NOTICE = (
    "## <span style='color:red'>Deprecation notice: This leaderboard contains the results computed with the "
    "[vidore-benchmark](https://github.com/illuin-tech/vidore-benchmark) package, "
    "which is no longer maintained. Results should be computed using the "
    "[mteb](https://github.com/embeddings-benchmark/mteb) package as described "
    "[here](https://github.com/illuin-tech/vidore-benchmark/blob/main/README.md).</span>"
)

_MISSING_RESULTS_NOTICE = (
    "## <span style='color:red'>Missing results in the new leaderboard are being added as they are re-computed.</span>"
)

_EVALUATION_NOTE = """
**Note**: For now, all models were evaluated using the vidore-benchmark package and custom retrievers on our side.
Those numbers are not numbers obtained from the organisations that released those models.
"""

_SUBMISSION_GUIDE = """
To submit a new model to the ViDoRe leaderboard, follow these steps:
1. **Evaluate your model**:
- Follow the evaluation procedure provided in the [ViDoRe GitHub repository](https://github.com/illuin-tech/vidore-benchmark/) that uses MTEB.
2. **Format your submission file**:
- Add the generated files to [MTEB results](https://github.com/embeddings-benchmark/results) project. Check the [Colpali results](https://github.com/embeddings-benchmark/results/tree/main/results/vidore__colpali-v1.3/1b5c8929330df1a66de441a9b5409a878f0de5b0) for an example.
And you're done! Your model will appear on the leaderboard when you click refresh! Once the space
gets rebooted, it will appear on startup.
Note: For proper hyperlink redirection, please ensure that your model repository name is in
kebab-case, e.g. `my-model-name`.
"""

_COLPALI_BIBTEX = r"""@misc{faysse2024colpaliefficientdocumentretrieval,
title={ColPali: Efficient Document Retrieval with Vision Language Models},
author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo},
year={2024},
eprint={2407.01449},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2407.01449},
}
"""

_VIDORE_V2_BIBTEX = r"""@misc{macé2025vidorebenchmarkv2raising,
title={ViDoRe Benchmark V2: Raising the Bar for Visual Retrieval},
author={Quentin Macé and António Loison and Manuel Faysse},
year={2025},
eprint={2505.17166},
archivePrefix={arXiv},
primaryClass={cs.IR},
url={https://arxiv.org/abs/2505.17166},
}
"""

_CITATION_HEADER = "\nPlease consider citing:\n```bibtex\n"
_CITATION_FOOTER = "```\n"

# Per-benchmark texts shared by the current and the deprecated leaderboard tabs.
_BENCHMARK_TEXT = {
    1: {
        "title": "ViDoRe: The Visual Document Retrieval Benchmark 1 📚🔍",
        "subtitle": "From the paper - ColPali: Efficient Document Retrieval with Vision Language Models 👀",
        "intro": """
Visual Document Retrieval Benchmark 1 leaderboard. To submit results, refer to the corresponding tab.
Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics, tasks and models.
""",
        "citation": _CITATION_HEADER + _COLPALI_BIBTEX + _CITATION_FOOTER,
        "evaluation_note": False,
    },
    2: {
        "title": "ViDoRe V2: A new visual Document Retrieval Benchmark 📚🔍",
        "subtitle": "A harder dataset benchmark for visual document retrieval 👀",
        "intro": """
Visual Document Retrieval Benchmark 2 leaderboard. To submit results, refer to the corresponding tab.
Refer to the [ColPali paper](https://arxiv.org/abs/2407.01449) for details on metrics and models.
""",
        "citation": _CITATION_HEADER + _COLPALI_BIBTEX + _VIDORE_V2_BIBTEX + _CITATION_FOOTER,
        "evaluation_note": True,
    },
}


def _load_leaderboard(handler, metric: str, benchmark_version: int):
    """Render one benchmark table from `handler` and pass it through `add_rank_and_format`.

    The caller is responsible for having called `handler.get_vidore_data(metric)` first.
    """
    table = handler.render_df(metric, benchmark_version=benchmark_version)
    return add_rank_and_format(table, benchmark_version=benchmark_version)


def _header_blocks(benchmark_version: int, *, deprecated: bool):
    """Return the markdown texts shown, one component each, at the top of a leaderboard tab."""
    text = _BENCHMARK_TEXT[benchmark_version]
    subtitle = f"### {text['subtitle']}"
    if not deprecated:
        return [f"# {text['title']}", subtitle, text["intro"]]
    return [
        _DEPRECATION_NOTICE,
        _MISSING_RESULTS_NOTICE,
        f"# {_DEPRECATED_TAG} {text['title']}",
        subtitle,
        text["intro"],
    ]


def _build_leaderboard_tab(
    *,
    handler,
    refresh_factory,
    benchmark_version: int,
    data,
    fixed_columns,
    initial_metric: str,
    deprecated: bool = False,
):
    """Create the components and event wiring of one leaderboard tab.

    Must be called inside an open `gr.TabItem` context.

    Args:
        handler: Object exposing `get_vidore_data(metric)` and
            `render_df(metric, benchmark_version=...)`.
        refresh_factory: Called as `refresh_factory(handler, benchmark_version=...)`;
            its result is registered as the handler for the refresh button and
            for metric changes.
        benchmark_version: Benchmark version forwarded to the handler and helpers
            (1 or 2; selects the tab texts).
        data: Initial, already formatted table shown in the dataframe.
        fixed_columns: Leading non-dataset column names; always kept when the
            user restricts the displayed columns.
        initial_metric: Metric preselected in the dropdown.
        deprecated: Whether to prepend the deprecation notices and tag the title.
    """
    text = _BENCHMARK_TEXT[benchmark_version]

    for markdown_text in _header_blocks(benchmark_version, deprecated=deprecated):
        gr.Markdown(markdown_text)

    # Dataset columns are everything after the fixed leading columns. The
    # summary counts are derived from this list so that "Total Datasets" always
    # agrees with the checkbox choices (previously the four-fixed-column tables
    # still subtracted 3, overstating the count by one).
    dataset_columns = list(data.columns[len(fixed_columns):])
    num_datasets = len(dataset_columns)
    num_models = len(data)
    num_scores = num_models * num_datasets

    # NOTE(review): grouping of components into rows follows the conventional
    # layout (controls / table / refresh button) — confirm against the deployed UI.
    with gr.Row():
        metric_dropdown = gr.Dropdown(choices=METRICS, value=initial_metric, label="Select Metric")
        search_textbox = gr.Textbox(
            placeholder="🔍 Search Models... [press enter]",
            label="Filter Models by Name",
        )
        column_checkboxes = gr.CheckboxGroup(
            choices=dataset_columns, value=dataset_columns, label="Select Columns to Display"
        )

    with gr.Row():
        # One datatype entry per column: the first is a number, the second is
        # markdown, all remaining ones are numbers.
        datatype = ["number", "markdown"] + ["number"] * (len(data.columns) - 2)
        dataframe = gr.Dataframe(data, datatype=datatype, type="pandas")

    def update_data(metric, search_term, selected_columns):
        """Reload the table for `metric`, filter by `search_term`, keep the selected columns."""
        handler.get_vidore_data(metric)
        table = handler.render_df(metric, benchmark_version=benchmark_version)
        table = add_rank_and_format(
            table, benchmark_version=benchmark_version, selected_columns=selected_columns
        )
        table = filter_models(table, search_term)
        # With nothing selected the full table is returned unchanged.
        if selected_columns:
            table = table[[*fixed_columns, *selected_columns]]
        return table

    with gr.Row():
        refresh_button = gr.Button("Refresh")
        refresh_button.click(
            refresh_factory(handler, benchmark_version=benchmark_version),
            inputs=[metric_dropdown],
            outputs=dataframe,
            concurrency_limit=20,
        )

    if text["evaluation_note"]:
        with gr.Row():
            gr.Markdown(_EVALUATION_NOTE)

    # Automatically refresh the dataframe when the dropdown value changes
    metric_dropdown.change(
        refresh_factory(handler, benchmark_version=benchmark_version),
        inputs=[metric_dropdown],
        outputs=dataframe,
    )

    filter_inputs = [metric_dropdown, search_textbox, column_checkboxes]
    search_textbox.submit(update_data, inputs=filter_inputs, outputs=dataframe)
    column_checkboxes.change(update_data, inputs=filter_inputs, outputs=dataframe)

    summary = (
        f"\n- **Total Datasets**: {num_datasets}"
        f"\n- **Total Scores**: {num_scores}"
        f"\n- **Total Models**: {num_models}\n"
    )
    gr.Markdown(summary + text["citation"])


def main():
    """Build the ViDoRe leaderboard UI and launch it.

    Loads the current and the deprecated results for the initial metric, builds
    one tab per leaderboard (V1, V2, deprecated V1, deprecated V2) plus a
    submission-instructions tab, then starts the queued Gradio app.
    """
    initial_metric = "ndcg_at_5"

    # Get new results
    model_handler = ModelHandler()
    model_handler.get_vidore_data(initial_metric)
    data_benchmark_1 = _load_leaderboard(model_handler, initial_metric, 1)
    data_benchmark_2 = _load_leaderboard(model_handler, initial_metric, 2)

    # Get deprecated results
    deprecated_model_handler = DeprecatedModelHandler()
    deprecated_model_handler.get_vidore_data(initial_metric)
    deprecated_data_benchmark_1 = _load_leaderboard(deprecated_model_handler, initial_metric, 1)
    deprecated_data_benchmark_2 = _load_leaderboard(deprecated_model_handler, initial_metric, 2)

    with gr.Blocks(css=_CSS) as block:
        with gr.Tabs():
            with gr.TabItem("ViDoRe V1"):
                _build_leaderboard_tab(
                    handler=model_handler,
                    refresh_factory=get_refresh_function,
                    benchmark_version=1,
                    data=data_benchmark_1,
                    fixed_columns=_FIXED_COLUMNS,
                    initial_metric=initial_metric,
                )

            with gr.TabItem("ViDoRe V2"):
                _build_leaderboard_tab(
                    handler=model_handler,
                    refresh_factory=get_refresh_function,
                    benchmark_version=2,
                    data=data_benchmark_2,
                    fixed_columns=_FIXED_COLUMNS,
                    initial_metric=initial_metric,
                )

            with gr.TabItem("📚 Submit your model"):
                gr.Markdown("# How to Submit a New Model to the Leaderboard")
                gr.Markdown(_SUBMISSION_GUIDE)

            with gr.TabItem("Deprecated ViDoRe V1"):
                _build_leaderboard_tab(
                    handler=deprecated_model_handler,
                    refresh_factory=deprecated_get_refresh_function,
                    benchmark_version=1,
                    data=deprecated_data_benchmark_1,
                    fixed_columns=_DEPRECATED_FIXED_COLUMNS,
                    initial_metric=initial_metric,
                    deprecated=True,
                )

            with gr.TabItem("Deprecated ViDoRe V2"):
                _build_leaderboard_tab(
                    handler=deprecated_model_handler,
                    refresh_factory=deprecated_get_refresh_function,
                    benchmark_version=2,
                    data=deprecated_data_benchmark_2,
                    fixed_columns=_DEPRECATED_FIXED_COLUMNS,
                    initial_metric=initial_metric,
                    deprecated=True,
                )

    block.queue(max_size=10).launch(debug=True)


if __name__ == "__main__":
    main()