kaizuberbuehler committed on
Commit
5c2a615
·
1 Parent(s): 064c980

Add NYT Connections benchmark

Browse files
Files changed (2) hide show
  1. app.py +13 -7
  2. nyt_connections.jsonl +7 -0
app.py CHANGED
@@ -244,6 +244,11 @@ with gr.Blocks() as demo:
244
  planbench_markdown: gr.Markdown = gr.Markdown(
245
  value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
246
  )
 
 
 
 
 
247
  with gr.Tab("🟡 GAIA") as gaia_tab:
248
  gaia_plot: gr.Plot = gr.Plot()
249
  gaia_markdown: gr.Markdown = gr.Markdown(
@@ -377,11 +382,6 @@ with gr.Blocks() as demo:
377
  hhem_markdown: gr.Markdown = gr.Markdown(
378
  value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
379
  )
380
- with gr.Tab("NYT Connections", visible=False):
381
- nyt_connections_exam_plot: gr.Plot = gr.Plot()
382
- nyt_connections_exam_markdown: gr.Markdown = gr.Markdown(
383
- value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
384
- )
385
  with gr.Tab("USACO", visible=False):
386
  usaco_plot: gr.Plot = gr.Plot()
387
  usaco_markdown: gr.Markdown = gr.Markdown(
@@ -465,7 +465,7 @@ with gr.Blocks() as demo:
465
  "ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
466
  gr.State(
467
  "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
468
- gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
469
  gr.State(0), gr.State(100),
470
  gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
471
  outputs=arc_agi_public_eval_plot)
@@ -475,7 +475,7 @@ with gr.Blocks() as demo:
475
  "ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
476
  gr.State(
477
  "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
478
- gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
479
  gr.State(0), gr.State(100),
480
  gr.State({"MTurkers": 77})],
481
  outputs=arc_agi_semi_private_eval_plot)
@@ -639,6 +639,12 @@ with gr.Blocks() as demo:
639
  gr.State(22.75), gr.State(100),
640
  gr.State({"Human experts": 77.75})],
641
  outputs=emma_plot)
 
 
 
 
 
 
642
 
643
  if __name__ == "__main__":
644
  demo.launch()
 
244
  planbench_markdown: gr.Markdown = gr.Markdown(
245
  value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
246
  )
247
+ with gr.Tab("🟡 NYT Connections") as nyt_connections_tab:
248
+ nyt_connections_plot: gr.Plot = gr.Plot()
249
+ nyt_connections_markdown: gr.Markdown = gr.Markdown(
250
+ value="""Source: [NYT Connections Leaderboard](https://github.com/lechmazur/nyt-connections)"""
251
+ )
252
  with gr.Tab("🟡 GAIA") as gaia_tab:
253
  gaia_plot: gr.Plot = gr.Plot()
254
  gaia_markdown: gr.Markdown = gr.Markdown(
 
382
  hhem_markdown: gr.Markdown = gr.Markdown(
383
  value="""Source: [Vectara Hallucination Leaderboard](https://github.com/vectara/hallucination-leaderboard)"""
384
  )
 
 
 
 
 
385
  with gr.Tab("USACO", visible=False):
386
  usaco_plot: gr.Plot = gr.Plot()
387
  usaco_markdown: gr.Markdown = gr.Markdown(
 
465
  "ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
466
  gr.State(
467
  "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
468
+ gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
469
  gr.State(0), gr.State(100),
470
  gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
471
  outputs=arc_agi_public_eval_plot)
 
475
  "ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
476
  gr.State(
477
  "\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
478
+ gr.State(date(2024, 6, 20)), gr.State(date(2025, 1, 1)),
479
  gr.State(0), gr.State(100),
480
  gr.State({"MTurkers": 77})],
481
  outputs=arc_agi_semi_private_eval_plot)
 
639
  gr.State(22.75), gr.State(100),
640
  gr.State({"Human experts": 77.75})],
641
  outputs=emma_plot)
642
+ nyt_connections_tab.select(fn=create_simple_plot,
643
+ inputs=[gr.State("nyt_connections.jsonl"),
644
+ gr.State("NYT Connections (Extended Version, Newest 100 Puzzles) Score"),
645
+ gr.State("\"NYT Connections puzzles [...] To increase difficulty, Extended Connections adds up to four extra trick words to each puzzle.\" (Mazur, 2025)"),
646
+ gr.State(date(2024, 7, 23)), gr.State(date(2025, 2, 1))],
647
+ outputs=nyt_connections_plot)
648
 
649
  if __name__ == "__main__":
650
  demo.launch()
nyt_connections.jsonl ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {"model": "o1-2024-12-17", "score": 60.0}
2
+ {"model": "o3-mini-2025-01-31", "score": 42.8}
3
+ {"model": "deepseek-r1", "score": 28.7}
4
+ {"model": "o1-mini-2024-09-12", "score": 18.8}
5
+ {"model": "gemini-2.0-flash-thinking-exp-01-21", "score": 15.2}
6
+ {"model": "qwen2.5-max", "score": 13.8}
7
+ {"model": "llama-3.1-405b-instruct", "score": 13.2}