Commit
·
5c2a615
1
Parent(s):
064c980
Add NYT Connections benchmark
Browse files
- app.py +13 -7
- nyt_connections.jsonl +7 -0
app.py
CHANGED
@@ -244,6 +244,11 @@ with gr.Blocks() as demo:
|
|
244 |
planbench_markdown: gr.Markdown = gr.Markdown(
|
245 |
value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
|
246 |
)
|
|
|
|
|
|
|
|
|
|
|
247 |
with gr.Tab("🟡 GAIA") as gaia_tab:
|
248 |
gaia_plot: gr.Plot = gr.Plot()
|
249 |
gaia_markdown: gr.Markdown = gr.Markdown(
|
@@ -377,11 +382,6 @@ with gr.Blocks() as demo:
|
|
377 |
hhem_markdown: gr.Markdown = gr.Markdown(
|
378 |
value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
|
379 |
)
|
380 |
-
with gr.Tab("NYT Connections", visible=False):
|
381 |
-
nyt_connections_exam_plot: gr.Plot = gr.Plot()
|
382 |
-
nyt_connections_exam_markdown: gr.Markdown = gr.Markdown(
|
383 |
-
value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
|
384 |
-
)
|
385 |
with gr.Tab("USACO", visible=False):
|
386 |
usaco_plot: gr.Plot = gr.Plot()
|
387 |
usaco_markdown: gr.Markdown = gr.Markdown(
|
@@ -465,7 +465,7 @@ with gr.Blocks() as demo:
|
|
465 |
"ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
466 |
gr.State(
|
467 |
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
468 |
-
gr.State(date(2024,
|
469 |
gr.State(0), gr.State(100),
|
470 |
gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
|
471 |
outputs=arc_agi_public_eval_plot)
|
@@ -475,7 +475,7 @@ with gr.Blocks() as demo:
|
|
475 |
"ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
476 |
gr.State(
|
477 |
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
478 |
-
gr.State(date(2024,
|
479 |
gr.State(0), gr.State(100),
|
480 |
gr.State({"MTurkers": 77})],
|
481 |
outputs=arc_agi_semi_private_eval_plot)
|
@@ -639,6 +639,12 @@ with gr.Blocks() as demo:
|
|
639 |
gr.State(22.75), gr.State(100),
|
640 |
gr.State({"Human experts": 77.75})],
|
641 |
outputs=emma_plot)
|
|
|
|
|
|
|
|
|
|
|
|
|
642 |
|
643 |
if __name__ == "__main__":
|
644 |
demo.launch()
|
|
|
244 |
planbench_markdown: gr.Markdown = gr.Markdown(
|
245 |
value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
|
246 |
)
|
247 |
+
with gr.Tab("🟡 NYT Connections") as nyt_connections_tab:
|
248 |
+
nyt_connections_plot: gr.Plot = gr.Plot()
|
249 |
+
nyt_connections_markdown: gr.Markdown = gr.Markdown(
|
250 |
+
value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
|
251 |
+
)
|
252 |
with gr.Tab("🟡 GAIA") as gaia_tab:
|
253 |
gaia_plot: gr.Plot = gr.Plot()
|
254 |
gaia_markdown: gr.Markdown = gr.Markdown(
|
|
|
382 |
hhem_markdown: gr.Markdown = gr.Markdown(
|
383 |
value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
|
384 |
)
|
|
|
|
|
|
|
|
|
|
|
385 |
with gr.Tab("USACO", visible=False):
|
386 |
usaco_plot: gr.Plot = gr.Plot()
|
387 |
usaco_markdown: gr.Markdown = gr.Markdown(
|
|
|
465 |
"ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
466 |
gr.State(
|
467 |
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
468 |
+
gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
|
469 |
gr.State(0), gr.State(100),
|
470 |
gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
|
471 |
outputs=arc_agi_public_eval_plot)
|
|
|
475 |
"ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
|
476 |
gr.State(
|
477 |
"\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
|
478 |
+
gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
|
479 |
gr.State(0), gr.State(100),
|
480 |
gr.State({"MTurkers": 77})],
|
481 |
outputs=arc_agi_semi_private_eval_plot)
|
|
|
639 |
gr.State(22.75), gr.State(100),
|
640 |
gr.State({"Human experts": 77.75})],
|
641 |
outputs=emma_plot)
|
642 |
+
nyt_connections_tab.select(fn=create_simple_plot,
|
643 |
+
inputs=[gr.State("nyt_connections.jsonl"),
|
644 |
+
gr.State("NYT Connections (Extended Version, Newest 100 Puzzles) Score"),
|
645 |
+
gr.State("\"NYT Connections puzzles [...] To increase difficulty, Extended Connections adds up to four extra trick words to each puzzle.\" (Mazur, 2025)"),
|
646 |
+
gr.State(date(2024, 7, 23)), gr.State(date(2025, 2, 1))],
|
647 |
+
outputs=nyt_connections_plot)
|
648 |
|
649 |
if __name__ == "__main__":
|
650 |
demo.launch()
|
nyt_connections.jsonl
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{"model": "o1-2024-12-17", "score": 60.0}
|
2 |
+
{"model": "o3-mini-2025-01-31", "score": 42.8}
|
3 |
+
{"model": "deepseek-r1", "score": 28.7}
|
4 |
+
{"model": "o1-mini-2024-09-12", "score": 18.8}
|
5 |
+
{"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 15.2}
|
6 |
+
{"model": "qwen2.5-max", "score": 13.8}
|
7 |
+
{"model": "llama-3.1-405b-instruct", "score": 13.2}
|