"""Static configuration for the Auto-Arena-of-LLMs leaderboard app.

Defines the benchmark tasks and every display string (title, subtitle,
introduction, citation, contact, ...) consumed by the leaderboard front-end.
"""

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """One benchmark column shown on the leaderboard."""

    benchmark: str  # task_key in the results json file
    metric: str  # metric_key in the results json file
    col_name: str  # column header displayed in the leaderboard


# Init: to update with your specific keys
class Tasks(Enum):
    """Closed set of evaluated tasks; each member's value is a Task record."""

    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("task_name1", "metric_name", "First task")
    task1 = Task("task_name2", "metric_name", "Second task")


# Your leaderboard name
TITLE = """

🏆 Auto Arena of LLMs

"""

# subtitle
SUB_TITLE = """

Automating LLM Evaluations with Agent Peer-battles and Committee Discussions

"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """ This leaderboard is from a completely automated large language model (LLM) evaluation framework by employing various LLM agents in peer-battles and committee discussions. You can find more details from the [project page](https://auto-arena.github.io/) and our [paper](https://arxiv.org/abs/2405.20267). """
# For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
# Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
# """

# Which evaluations are you running? how can people reproduce what you have?
# NOTE: was an f-string with no placeholders (ruff F541); plain literal is byte-identical.
LLM_BENCHMARKS_TEXT = """ ``` """

# You can find the detailed numerical results in the results Hugging Face dataset: https://huggingface.co/datasets/SeaLLMs/SeaExam-results
EVALUATION_QUEUE_TEXT = """ """

CITATION_BUTTON_LABEL = ""
CITATION_BUTTON_TEXT = r""" """

# NOTE: was an f-string with no placeholders (ruff F541); plain literal is byte-identical.
CONTACT_TEXT = """ ## Contact We are open to collaborations! If you don't see your model on the leaderboard and hope to include it, all you need to do is providing an API key and reaching out to us at: zrc.esther@gmail.com, l.bing@alibaba-inc.com, saike.zwx@alibaba-inc.com. We will also regularly maintain the leaderboard by adding mainstream popular models and altering the peer-battle questions to prevent data contamination. """