{"model": "h2oGPTe Agent v1.6.8 (claude-3-5-sonnet)", "score": 65.12} {"model": "Langfun Agent v2.0 (claude-3-5-sonnet, gemini-1.5-pro-002)", "score": 49.33} {"model": "barcelona v0.1 (claude sonnet 3.5)", "score": 46.18} {"model": "omne v0.1 (o1-preview, gpt-4o)", "score": 40.53} {"model": "Trase Agent v0.2 (fine-tuned gemini, gpt-4o, o1-preview)", "score": 39.53} {"model": "Multi Agent", "score": 38.87} {"model": "DynaSaur (gpt-4o)", "score": 38.21} {"model": "magentic-1 (o1)", "score": 38} {"model": "Trase Agent v0.1 (fine-tuned gpt-4o)", "score": 35.55} {"model": "sibyl system v0.2 (gpt-4o)", "score": 34.55} {"model": "HuggingFaceAgents (gpt-4o)", "score": 33.33} {"model": "tapeagent v0.2", "score": 33.22} {"model": "little_potato (yanzw gpt-4o)", "score": 32.89} {"model": "Multi-Agent Experiment v0.1 (gpt-4-turbo)", "score": 32.33} {"model": "magentic-1", "score": 32.33} {"model": "das_agent v0.2", "score": 32.33} {"model": "cola_v0.4", "score": 31.89} {"model": "modified hugging face agents + gpt-4o", "score": 31.23} {"model": "das_agent", "score": 31} {"model": "das_agent v0.3", "score": 30.33} {"model": "cola_v0.3", "score": 30.23} {"model": "gpt-4o-2024-08-06", "score": 29} {"model": "replicated hugging face agents + gpt-4o", "score": 29} {"model": "tapeagent v0.1", "score": 27.57} {"model": "das_agent v0.4 mini (fixed)", "score": 26.91} {"model": "sibyl system v0.2 (gpt-4o-2024-08-06)", "score": 26.58} {"model": "das_agent v0.4 mini", "score": 25.91} {"model": "mmac v1.1 (gpt4v gemini 1.5)", "score": 25.91} {"model": "modified sibyl system", "score": 25.91} {"model": "maac_v1", "score": 25.58} {"model": "uk ai safety institute internal (gpt-4-turbo)", "score": 25} {"model": "FRIDAY (gpt-4-turbo)", "score": 24.25} {"model": "cola_abl", "score": 23.26} {"model": "replicated hugging face agents + gpt-4o mini", "score": 22.67} {"model": "tapeagent v0.2 mini", "score": 21.93} {"model": "friday_without_learning (os-copilot gpt-4-turbo)", "score": 21.59} {"model": "ceylon", "score": 17.06} {"model": "tapeagent v0.1 mini", "score": 16.61} {"model": "dip (gpt-4-turbo)", "score": 15.95} {"model": "sibyl system v0.2 (gpt-4o-mini-2024-07-18)", "score": 15.61} {"model": "cola_v0.2", "score": 15.28} {"model": "chamomile", "score": 14.67} {"model": "clarity v1", "score": 14.05} {"model": "warm-up act (gpt-4-turbo)", "score": 12.96} {"model": "frc v5", "score": 12} {"model": "cola_v0.1", "score": 10.96} {"model": "somedayv1.2", "score": 10.3} {"model": "somedayv1.1", "score": 9.97} {"model": "frc v4", "score": 9.33} {"model": "stealth3", "score": 9.3} {"model": "stealth2", "score": 8.97} {"model": "someday1", "score": 8.97} {"model": "frc v3", "score": 8.67} {"model": "stealth", "score": 8.64} {"model": "gpt-4-turbo", "score": 6.67} {"model": "someday", "score": 6.31} {"model": "AutoGPT4 (gpt-4)", "score": 5} {"model": "gpt-4o-mini-2024-07-18", "score": 4.65} {"model": "gpt-4", "score": 4} {"model": "gpt-3.5-turbo", "score": 2.67} {"model": "alphaagent v0.1 (gpt-4o)", "score": 2.33}