# CONSTANTS-URL
URL = "http://opencompass.openxlab.space/assets/OpenVLM_Subjective_Leaderboard.json"
VLMEVALKIT_README = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/README.md'

# CONSTANTS-CITATION
CITATION_BUTTON_TEXT = r"""@inproceedings{duan2024vlmevalkit,
  title={Vlmevalkit: An open-source toolkit for evaluating large multi-modality models},
  author={Duan, Haodong and Yang, Junming and Qiao, Yuxuan and Fang, Xinyu and Chen, Lin and Liu, Yuan and Dong, Xiaoyi and Zang, Yuhang and Zhang, Pan and Wang, Jiaqi and others},
  booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
  pages={11198--11201},
  year={2024}
}"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# CONSTANTS-TEXT
LEADERBORAD_INTRODUCTION = """# OpenVLM Subjective Leaderboard
### Welcome to the OpenVLM Subjective Leaderboard! On this leaderboard, we share the evaluation results of VLMs obtained with the open-source framework:
### [*VLMEvalKit*: A Toolkit for Evaluating Large Vision-Language Models](https://github.com/open-compass/VLMEvalKit) 🏆
### Currently, the OpenVLM Subjective Leaderboard covers {} different VLMs (including GPT-4o, Gemini, Qwen2.5-VL, InternVL2.5, etc.) and {} different multi-modal benchmarks. This leaderboard was last updated: {}.

The OpenVLM Subjective Leaderboard only includes open-source VLMs and API models that are publicly available. To add your own model to the leaderboard, please create a PR in [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) to support your VLM; we will then help with the evaluation and update the leaderboard. For any questions or concerns, please feel free to contact us at [opencompass, fangxinyu, dingshengyuan]@pjlab.org.cn.
"""

# CONSTANTS-FIELDS
META_FIELDS = [
    'Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Eval Date'
]
MAIN_FIELDS = [
    'Creation_MMBench', 'MIA-Bench', 'MM-IFEval', 'MMAlignBench', 'MMVet', 'WildVision'
]
DEFAULT_BENCH = [
    'Creation_MMBench', 'MIA-Bench', 'MM-IFEval', 'MMAlignBench', 'MMVet', 'WildVision'
]
MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
MODEL_TYPE = ['API', 'OpenSource']

# The README text for each benchmark
LEADERBOARD_MD = {}

LEADERBOARD_MD['MAIN'] = f"""
## Main Evaluation Results

- Metrics:
  - Avg Score: the average score across the VLM benchmarks (normalized to 0-100, the higher the better).
  - Avg Rank: the average rank across the VLM benchmarks (the lower the better).
  - Avg Score & Avg Rank are calculated over the selected benchmarks. **When results for some selected benchmarks are missing, Avg Score / Avg Rank will be None!**
- Metrics for each dataset:
  - Creation-MMBench: Reward Score / Visual Factuality Score
  - MMAlignBench, WildVision: Reward Score
  - MIA-Bench, MM-IFEval, MMVet: Overall Score
- By default, we present the overall evaluation results based on {len(DEFAULT_BENCH)} VLM benchmarks, sorted in descending order of Avg Score.
  - The following datasets are included in the main results: {', '.join(DEFAULT_BENCH)}.
  - Detailed evaluation results for each dataset (whether or not it is included in the main results) are provided in the subsequent tabs.
"""
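
# Illustrative sketch only -- this is not the code the leaderboard app actually runs. It shows
# how the Avg Score / Avg Rank described above can be derived from per-benchmark scores that are
# already normalized to 0-100, returning None when any selected benchmark result is missing.
# The `results` layout (model -> benchmark -> score) is an assumption made for this example.
def compute_avg_score_and_rank(results, selected=None):
    selected = selected or DEFAULT_BENCH
    # Avg Score: mean over the selected benchmarks; None if any selected result is missing.
    avg_score = {}
    for model, scores in results.items():
        vals = [scores.get(b) for b in selected]
        avg_score[model] = None if any(v is None for v in vals) else sum(vals) / len(vals)
    # Avg Rank: rank models per benchmark (1 = best score), then average the ranks.
    ranks = {model: [] for model in results}
    for bench in selected:
        scored = [(m, s[bench]) for m, s in results.items() if s.get(bench) is not None]
        for rank, (m, _) in enumerate(sorted(scored, key=lambda x: x[1], reverse=True), start=1):
            ranks[m].append(rank)
    avg_rank = {
        m: (sum(r) / len(r) if len(r) == len(selected) else None)
        for m, r in ranks.items()
    }
    return avg_score, avg_rank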
LEADERBOARD_MD['Creation_MMBench'] = """
## Creation-MMBench Evaluation Results

- Creation-MMBench is a multimodal benchmark specifically designed to evaluate the creative capabilities of MLLMs.
  It consists of **765 test cases** covering **51 fine-grained tasks** across **4 categories**: *Literary Writing*, *Creative Multimodal Understanding*, *Professional Functionality Writing*, and *Common Functionality Writing*. As a multimodal benchmark, it contains a total of **1001 images** spanning more than 25 different categories, with some questions incorporating up to 9 images.
- Creation-MMBench includes carefully crafted **instance-specific criteria** for each test case, enabling assessment of both general response quality and visual-factual alignment in model-generated content. It employs a pair-wise comparison approach, where the model's output is compared with the reference answer (considering the ground-truth answer, the input prompt, and the visual content) to obtain the assessment result. Creation-MMBench adopts a **dual evaluation** strategy with **GPT-4o as the judge model**.
- VFS stands for Visual Factuality Score. The rankings in this leaderboard are arranged in descending order of each model's overall reward, with **GPT-4o-1120** providing the **reference answer** for comparison and thus serving as the baseline model.
- View more details on the [**Creation-MMBench Official WebPage**](https://open-compass.github.io/Creation-MMBench/).
"""

LEADERBOARD_MD['MM-IFEval'] = """
## MM-IFEval Evaluation Results

- MM-IFEval is a comprehensive multimodal instruction-following benchmark designed to rigorously assess the capabilities of Multimodal Large Language Models (MLLMs). It includes 400 high-quality questions across two levels: 300 compose-level tasks that emphasize output format and content constraints, and 100 perception-level tasks that require precise visual understanding.
- To ensure accurate evaluation, MM-IFEval employs a hybrid strategy combining rule-based verification with LLM-based judge models. See https://arxiv.org/abs/2504.07957 for more details.
- Currently, we use GPT-4o (gpt-4o-2024-05-13) whenever an LLM judge model is needed.
"""

LEADERBOARD_MD['MMAlignBench'] = """
## MMAlignBench Evaluation Results

- MM-AlignBench is designed to evaluate MLLMs' alignment with human preferences. It includes 252 high-quality, human-annotated samples with diverse image types and open-ended questions. Modeled after Arena-style benchmarks, it uses GPT-4o as the judge model and Claude-3-Sonnet as the reference model.
- See https://github.com/PhoenixZ810/OmniAlign-V for more details.
"""

LEADERBOARD_MD['MIA-Bench'] = """
## MIA-Bench Evaluation Results

- MIA-Bench contains 400 carefully crafted image-prompt pairs that stress-test an MLLM's ability to **follow layered, exacting instructions** in its responses. ([MIA-Bench: Towards Better Instruction Following Evaluation of Multimodal LLMs](https://arxiv.org/abs/2407.01509), [Towards Better Instruction Following Evaluation of Multimodal LLMs](https://machinelearning.apple.com/research/mia-bench))
- The leaderboard reports the **overall average score**. The judge model is **GPT-4o**.
"""

LEADERBOARD_MD['MMVet'] = """
## MMVet Evaluation Results

- In the MMVet evaluation, we use GPT-4-Turbo (gpt-4-1106-preview) as the judge LLM to assign scores to the VLM outputs. We perform the evaluation only once, given the limited variance originally reported among the results of multiple evaluation passes.
- No specific prompt template is adopted for **any of the VLMs**.
""" LEADERBOARD_MD['WildVision'] = """ ## WildVision Evaluation Results - WildVision-Bench offers **500 real-world multimodal prompts** curated from the WildVision-Arena crowdsourcing platform to benchmark models **by human preference** in natural conversations. - The leaderboard lists reports the **overall reward score**. - Judge Model is **GPT-4o**. Reference Model is **Claude-Sonnet-3**. """