"""Static configuration for the Auto-Arena-of-LLMs leaderboard app.

Defines the benchmark tasks and every display string (title, subtitle,
introduction, citation, contact, ...) consumed by the leaderboard front-end.
"""

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """One benchmark column shown on the leaderboard."""

    benchmark: str  # task_key in the results json file
    metric: str  # metric_key in the results json file
    col_name: str  # column header displayed in the leaderboard


# Init: to update with your specific keys
class Tasks(Enum):
    """Closed set of evaluated tasks; each member's value is a Task record."""

    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("task_name1", "metric_name", "First task")
    task1 = Task("task_name2", "metric_name", "Second task")


# Your leaderboard name
TITLE = """

🏆 Auto Arena of LLMs

"""

# subtitle
SUB_TITLE = """

Automating LLM Evaluations with Agent Peer-battles and Committee Discussions

"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """ This leaderboard is from a completely automated large language model (LLM) evaluation framework by employing various LLM agents in peer-battles and committee discussions. You can find more details from the [project page](https://auto-arena.github.io/) and our [paper](https://arxiv.org/abs/2405.20267). """
# For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
# Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
# """

# Which evaluations are you running? how can people reproduce what you have?
# NOTE: was an f-string with no placeholders (ruff F541); plain literal is byte-identical.
LLM_BENCHMARKS_TEXT = """ ``` """

# You can find the detailed numerical results in the results Hugging Face dataset: https://huggingface.co/datasets/SeaLLMs/SeaExam-results
EVALUATION_QUEUE_TEXT = """ """

CITATION_BUTTON_LABEL = ""
CITATION_BUTTON_TEXT = r""" """

# NOTE: was an f-string with no placeholders (ruff F541); plain literal is byte-identical.
CONTACT_TEXT = """ ## Contact We are open to collaborations! If you don't see your model on the leaderboard and hope to include it, all you need to do is providing an API key and reaching out to us at: zrc.esther@gmail.com, l.bing@alibaba-inc.com, saike.zwx@alibaba-inc.com. We will also regularly maintain the leaderboard by adding mainstream popular models and altering the peer-battle questions to prevent data contamination. """