{
  "arc_easy": {
    "benchmark": "arc_easy",
    "metric": "accuracy",
    "display_name": "ARC-E",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
  },
  "arc_challenge": {
    "benchmark": "arc_challenge",
    "metric": "accuracy",
    "display_name": "ARC-C",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/arc"
  },
  "drop": {
    "benchmark": "drop",
    "metric": "mean",
    "display_name": "DROP",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/drop"
  },
  "winogrande": {
    "benchmark": "winogrande",
    "metric": "accuracy",
    "display_name": "WinoGrande",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/winogrande"
  },
  "gsm8k": {
    "benchmark": "gsm8k",
    "metric": "accuracy",
    "display_name": "GSM8K",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gsm8k"
  },
  "hellaswag": {
    "benchmark": "hellaswag",
    "metric": "accuracy",
    "display_name": "HellaSwag",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/hellaswag"
  },
  "humaneval": {
    "benchmark": "humaneval",
    "metric": "mean",
    "display_name": "HumanEval",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/humaneval"
  },
  "ifeval": {
    "benchmark": "ifeval",
    "metric": "final_acc",
    "display_name": "IFEval",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/ifeval"
  },
  "math": {
    "benchmark": "math",
    "metric": "accuracy",
    "display_name": "MATH",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mathematics"
  },
  "mmlu": {
    "benchmark": "mmlu",
    "metric": "accuracy",
    "display_name": "MMLU",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu"
  },
  "mmlu_pro": {
    "benchmark": "mmlu_pro",
    "metric": "accuracy",
    "display_name": "MMLU-Pro",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmlu_pro"
  },
  "gpqa_diamond": {
    "benchmark": "gpqa_diamond",
    "metric": "accuracy",
    "display_name": "GPQA-D",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gpqa"
  },
  "mmmu_multiple_choice": {
    "benchmark": "mmmu_multiple_choice",
    "metric": "accuracy",
    "display_name": "MMMU-MC",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
  },
  "mmmu_open": {
    "benchmark": "mmmu_open",
    "metric": "accuracy",
    "display_name": "MMMU-OE",
    "type": "base",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/mmmu"
  },
  "gaia": {
    "benchmark": "gaia",
    "metric": "accuracy",
    "display_name": "GAIA",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gaia"
  },
  "gdm_intercode_ctf": {
    "benchmark": "gdm_intercode_ctf",
    "metric": "accuracy",
    "display_name": "InterCode-CTF",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/intercode_ctf"
  },
  "gdm_in_house_ctf": {
    "benchmark": "gdm_in_house_ctf",
    "metric": "accuracy",
    "display_name": "In-House-CTF",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/gdm_capabilities/in_house_ctf"
  },
  "agentharm": {
    "benchmark": "agentharm",
    "metric": "avg_score",
    "display_name": "AgentHarm",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
  },
  "agentharm_benign": {
    "benchmark": "agentharm_benign",
    "metric": "avg_score",
    "display_name": "AgentHarm-Benign",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/agentharm"
  },
  "swe_bench": {
    "benchmark": "swe_bench",
    "metric": "mean",
    "display_name": "SWE-Bench",
    "type": "agentic",
    "source": "https://github.com/UKGovernmentBEIS/inspect_evals/tree/main/src/inspect_evals/swe_bench"
  }
}