{"model": "h2oGPTe Agent v1.6.8 (claude-3-5-sonnet)", "score": 65.12} | |
{"model": "Langfun Agent v2.0 (claude-3-5-sonnet, gemini-1.5-pro-002)", "score": 49.33} | |
{"model": "barcelona v0.1 (claude sonnet 3.5)", "score": 46.18} | |
{"model": "omne v0.1 (o1-preview, gpt-4o)", "score": 40.53} | |
{"model": "Trase Agent v0.2 (fine-tuned gemini, gpt-4o, o1-preview)", "score": 39.53} | |
{"model": "Multi Agent", "score": 38.87} | |
{"model": "DynaSaur (gpt-4o)", "score": 38.21} | |
{"model": "magentic-1 (o1)", "score": 38} | |
{"model": "Trase Agent v0.1 (fine-tuned gpt-4o)", "score": 35.55} | |
{"model": "sibyl system v0.2 (gpt-4o)", "score": 34.55} | |
{"model": "HuggingFaceAgents (gpt-4o)", "score": 33.33} | |
{"model": "tapeagent v0.2", "score": 33.22} | |
{"model": "little_potato (yanzw gpt-4o)", "score": 32.89} | |
{"model": "Multi-Agent Experiment v0.1 (gpt-4-turbo)", "score": 32.33} | |
{"model": "magentic-1", "score": 32.33} | |
{"model": "das_agent v0.2", "score": 32.33} | |
{"model": "cola_v0.4", "score": 31.89} | |
{"model": "modified hugging face agents + gpt-4o", "score": 31.23} | |
{"model": "das_agent", "score": 31} | |
{"model": "das_agent v0.3", "score": 30.33} | |
{"model": "cola_v0.3", "score": 30.23} | |
{"model": "gpt-4o-2024-08-06", "score": 29} | |
{"model": "replicated hugging face agents + gpt-4o", "score": 29} | |
{"model": "tapeagent v0.1", "score": 27.57} | |
{"model": "das_agent v0.4 mini (fixed)", "score": 26.91} | |
{"model": "sibyl system v0.2 (gpt-4o-2024-08-06)", "score": 26.58} | |
{"model": "das_agent v0.4 mini", "score": 25.91} | |
{"model": "mmac v1.1 (gpt4v gemini 1.5)", "score": 25.91} | |
{"model": "modified sibyl system", "score": 25.91} | |
{"model": "maac_v1", "score": 25.58} | |
{"model": "uk ai safety institute internal (gpt-4-turbo)", "score": 25} | |
{"model": "FRIDAY (gpt-4-turbo)", "score": 24.25} | |
{"model": "cola_abl", "score": 23.26} | |
{"model": "replicated hugging face agents + gpt-4o mini", "score": 22.67} | |
{"model": "tapeagent v0.2 mini", "score": 21.93} | |
{"model": "friday_without_learning (os-copilot gpt-4-turbo)", "score": 21.59} | |
{"model": "ceylon", "score": 17.06} | |
{"model": "tapeagent v0.1 mini", "score": 16.61} | |
{"model": "dip (gpt-4-turbo)", "score": 15.95} | |
{"model": "sibyl system v0.2 (gpt-4o-mini-2024-07-18)", "score": 15.61} | |
{"model": "cola_v0.2", "score": 15.28} | |
{"model": "chamomile", "score": 14.67} | |
{"model": "clarity v1", "score": 14.05} | |
{"model": "warm-up act (gpt-4-turbo)", "score": 12.96} | |
{"model": "frc v5", "score": 12} | |
{"model": "cola_v0.1", "score": 10.96} | |
{"model": "somedayv1.2", "score": 10.3} | |
{"model": "somedayv1.1", "score": 9.97} | |
{"model": "frc v4", "score": 9.33} | |
{"model": "stealth3", "score": 9.3} | |
{"model": "stealth2", "score": 8.97} | |
{"model": "someday1", "score": 8.97} | |
{"model": "frc v3", "score": 8.67} | |
{"model": "stealth", "score": 8.64} | |
{"model": "gpt-4-turbo", "score": 6.67} | |
{"model": "someday", "score": 6.31} | |
{"model": "AutoGPT4 (gpt-4)", "score": 5} | |
{"model": "gpt-4o-mini-2024-07-18", "score": 4.65} | |
{"model": "gpt-4", "score": 4} | |
{"model": "gpt-3.5-turbo", "score": 2.67} | |
{"model": "alphaagent v0.1 (gpt-4o)", "score": 2.33} |