benbogin commited on
Commit
507ce38
·
1 Parent(s): 6633c28

leaderboard

Browse files
Files changed (7) hide show
  1. .gitignore +2 -0
  2. README.md +16 -8
  3. ZeroEval-main/result_dirs/leaderboard.json +74 -0
  4. _header.md +6 -0
  5. app.py +114 -0
  6. constants.py +217 -0
  7. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.pyc
2
+ *.DS_Store
README.md CHANGED
@@ -1,12 +1,20 @@
1
  ---
2
- title: Super Leaderboard
3
- emoji: 🔥
4
- colorFrom: pink
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 5.15.0
8
  app_file: app.py
9
- pinned: false
 
 
 
 
 
 
 
 
 
 
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: SUPER Leaderboard
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 4.19.2
8
  app_file: app.py
9
+ pinned: true
10
+ fullWidth: true
11
+ hf_oauth: true
12
+ api: false
13
+ tags:
14
+ - leaderboard
15
+ datasets:
16
+ - allenai/super
17
+ models:
18
+ - meta-llama/Meta-Llama-3-70B-Instruct
19
+ - mistralai/Mixtral-8x22B-Instruct-v0.1
20
  ---
 
 
ZeroEval-main/result_dirs/leaderboard.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "Agent": "SWE-Agent",
4
+ "Base model": "gpt-4o-2024-08-06",
5
+ "Expert (Accuracy)": "16.3",
6
+ "Expert (Landmarks)": "36.8",
7
+ "Masked (Accuracy)": "46.1",
8
+ "Masked (Landmarks)": "74.9"
9
+ },
10
+ {
11
+ "Agent": "React",
12
+ "Base model": "gpt-4o-2024-08-06",
13
+ "Expert (Accuracy)": "12.2",
14
+ "Expert (Landmarks)": "33.6",
15
+ "Masked (Accuracy)": "37.0",
16
+ "Masked (Landmarks)": "65.7"
17
+ },
18
+ {
19
+ "Agent": "React-Super",
20
+ "Base model": "gpt-4o-2024-08-06",
21
+ "Expert (Accuracy)": "14.4",
22
+ "Expert (Landmarks)": "42.6",
23
+ "Masked (Accuracy)": "41.6",
24
+ "Masked (Landmarks)": "72.5"
25
+ },
26
+ {
27
+ "Agent": "SWE-Agent",
28
+ "Base model": "gpt-4o-mini-2024-07-18",
29
+ "Expert (Accuracy)": "3.3",
30
+ "Expert (Landmarks)": "16.1",
31
+ "Masked (Accuracy)": "27.0",
32
+ "Masked (Landmarks)": "51.8"
33
+ },
34
+ {
35
+ "Agent": "React-Super",
36
+ "Base model": "gpt-4o-mini-2024-07-18",
37
+ "Expert (Accuracy)": "5.6",
38
+ "Expert (Landmarks)": "20.6",
39
+ "Masked (Accuracy)": "31.5",
40
+ "Masked (Landmarks)": "58.3"
41
+ },
42
+ {
43
+ "Agent": "SWE-Agent",
44
+ "Base model": "Llama 3.1 70B",
45
+ "Expert (Accuracy)": "5.6",
46
+ "Expert (Landmarks)": "4.8",
47
+ "Masked (Accuracy)": "17.4",
48
+ "Masked (Landmarks)": "35.0"
49
+ },
50
+ {
51
+ "Agent": "React-Super",
52
+ "Base model": "Llama 3.1 70B",
53
+ "Expert (Accuracy)": "6.1",
54
+ "Expert (Landmarks)": "9.6",
55
+ "Masked (Accuracy)": "22.8",
56
+ "Masked (Landmarks)": "38.3"
57
+ },
58
+ {
59
+ "Agent": "SWE-Agent",
60
+ "Base model": "Mixtral-8x22B-Instruct-v0.1",
61
+ "Expert (Accuracy)": "1.1",
62
+ "Expert (Landmarks)": "0.0",
63
+ "Masked (Accuracy)": "9.5",
64
+ "Masked (Landmarks)": "26.6"
65
+ },
66
+ {
67
+ "Agent": "React-Super",
68
+ "Base model": "Mixtral-8x22B-Instruct-v0.1",
69
+ "Expert (Accuracy)": "3.3",
70
+ "Expert (Landmarks)": "3.7",
71
+ "Masked (Accuracy)": "7.0",
72
+ "Masked (Landmarks)": "13.2"
73
+ }
74
+ ]
_header.md ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <br/>
2
+
3
+ # SUPER: Evaluating Agents on Setting Up and Executing Tasks from Research Repositories
4
+ <!-- [📑 arxiv](https://arxiv.org/pdf/2409.07440) | -->
5
+ [💻 GitHub](https://github.com/allenai/super-benchmark) | [🤗 HuggingFace](https://huggingface.co/datasets/allenai/super) | Updated: **{LAST_UPDATED}**
6
+
app.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
import argparse
import json
from datetime import datetime

import gradio as gr
import pandas as pd
import pytz

from constants import *
from constants import column_names

# Placeholder; the timestamp actually shown in the header is computed at
# render time inside build_demo(), which shadows this module-level name.
LAST_UPDATED = None
# Intro section is currently disabled; the commented block is kept for
# future re-enablement.
# with open("_intro.md", "r") as f:
#     INTRO_MD = f.read()
INTRO_MD = ""

# Header markdown; contains a "{LAST_UPDATED}" placeholder that is
# substituted in build_demo().
with open("_header.md", "r") as f:
    HEADER_MD = f.read()

# Populated by data_load(): the raw JSON records and the DataFrame derived
# from them.
raw_data = None
original_df = None

26
def df_filters(mode_selection_radio, show_open_source_model_only):
    """Return the leaderboard table with a 1-based rank column prepended.

    Args:
        mode_selection_radio: currently unused placeholder (the body never
            reads it) — kept for interface compatibility with callers.
        show_open_source_model_only: currently unused placeholder as well.

    Returns:
        A copy of the module-level ``original_df`` with an unnamed rank
        column (1..len) inserted at position 0.

    Fix: the original inserted the rank column into the global
    ``original_df`` itself, so a second call raised ValueError (duplicate
    "" column) and the global was permanently mutated. Ranking a copy
    makes the function idempotent and side-effect free.
    """
    ranked = original_df.copy()
    ranked.insert(0, "", range(1, 1 + len(ranked)))
    return ranked
30
+
31
def _gstr(text):
    """Wrap *text* in a hidden gradio Text component and return it."""
    hidden_component = gr.Text(text, visible=False)
    return hidden_component
33
+
34
def _tab_leaderboard():
    """Render the leaderboard tab: a static, read-only Dataframe component.

    Fixes: removed the dead ``if True:`` scaffold (leftover from a removed
    mode toggle) and dropped ``available_models`` from the ``global``
    statement — it is never defined or used anywhere in this file.
    """
    global original_df
    default_mode = "greedy"
    default_main_df = df_filters(default_mode, False)

    # NOTE(review): datatype lists 4 entries and column_widths lists 10,
    # while the SUPER table has 6 data columns plus the rank column —
    # gradio tolerates the mismatch, but confirm the intended widths.
    leaderboard_table = gr.components.Dataframe(
        value=default_main_df,
        datatype=["number", "markdown", "markdown", "number"],
        height=1000,
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
        column_widths=[50, 150, 150, 100, 120, 120, 100, 100, 110, 100],
        wrap=True,
    )
52
+
53
def _tab_submit():
    """Render the submission-instructions tab as a single Markdown block."""
    markdown_text = """
Please create an issue on our [Github](https://github.com/allenai/super-benchmark) repository with output of trajectories of your model and results. We will update the leaderboard accordingly.
"""
    heading = "## 🚀 Submit Your Results\n\n"
    gr.Markdown(heading + markdown_text, elem_classes="markdown-text")
59
+
60
+
61
+
62
def build_demo():
    """Assemble and return the gradio Blocks app: header plus two tabs
    (leaderboard, submission instructions)."""
    global original_df

    with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
        # Render-time timestamp in US/Pacific.
        # NOTE(review): this local assignment shadows the module-level
        # LAST_UPDATED (which therefore stays None).
        LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
        # Substitute the placeholder baked into _header.md.
        header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
        gr.Markdown(header_md_text, elem_classes="markdown-text")

        with gr.Tabs(elem_classes="tab-buttons") as tabs:
            with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
                _tab_leaderboard()
            with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
                _tab_submit()

    return demo
78
+
79
+
80
+
81
def data_load(result_file):
    """Load leaderboard records from *result_file* into the module globals.

    Reads a JSON list of row dicts, coerces numeric-looking string values
    (e.g. "16.3") to float so sorting is numeric, builds a DataFrame sorted
    by "Expert (Accuracy)" descending, and stores the results in the globals
    ``raw_data`` and ``original_df``.

    Args:
        result_file: path to a JSON file containing a list of row dicts.

    Fixes: the bare ``except:`` (which also swallowed KeyboardInterrupt /
    SystemExit) is narrowed to the two exceptions float() can raise, and the
    unused locals ``column_names_main``, ``main_ordered_columns`` and
    ``click_url`` are removed.
    """
    global raw_data, original_df
    print(f"Loading {result_file}")

    with open(result_file, "r") as f:
        raw_data = json.load(f)

    # Coerce numeric strings to floats; non-numeric fields (agent name,
    # base-model name) are left untouched.
    for row in raw_data:
        for key, value in row.items():
            try:
                row[key] = float(value)
            except (TypeError, ValueError):
                pass

    original_df = pd.DataFrame(raw_data)
    original_df.sort_values(by="Expert (Accuracy)", ascending=False, inplace=True)
103
+
104
+
105
if __name__ == "__main__":
    # CLI entry point: parse flags, load the results table, serve the app.
    parser = argparse.ArgumentParser()
    # --share: ask gradio to create a public share link.
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--result_file", help="Path to results table", default="ZeroEval-main/result_dirs/leaderboard.json")

    args = parser.parse_args()
    data_load(args.result_file)

    demo = build_demo()
    # NOTE(review): height/width are not documented launch() parameters in
    # recent gradio releases — confirm they are still accepted by 4.19.2.
    demo.launch(share=args.share, height=3000, width="100%")
constants.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path
from collections import OrderedDict

# Display value for "unbounded K".
DEFAULT_K = "∞"


# NOTE(review): these column names describe a puzzle benchmark (apparently
# carried over from ZeroEval) and do not match the SUPER leaderboard columns
# below; app.py copies this mapping but never applies it — confirm whether
# it can be removed.
column_names = OrderedDict({
    "Model": "Model",
    "Mode": "Mode",
    "Puzzle Acc": "Puzzle Acc",
    "Cell Acc": "Cell Acc",
    "No answer": "No answer",
    "Easy Puzzle Acc": "Easy Puzzle Acc",
    "Hard Puzzle Acc": "Hard Puzzle Acc",
})


# Remark text under the main leaderboard (currently empty).
LEADERBOARD_REMARKS_MAIN = """
"""

# NOTE(review): "Puzzle Acc" does not exist in the SUPER results;
# app.py sorts by "Expert (Accuracy)" instead — likely another ZeroEval
# leftover; confirm before relying on it.
RANKING_COLUMN = "Puzzle Acc"

# Display order of the SUPER leaderboard columns; keys match
# ZeroEval-main/result_dirs/leaderboard.json.
ORDERED_COLUMN_NAMES = [
    "Agent",
    "Base model",
    "Expert (Accuracy)",
    "Expert (Landmarks)",
    "Masked (Accuracy)",
    "Masked (Landmarks)"
]
31
+
32
+
33
# JS hook passed to gr.Blocks(js=...): forces the light theme by appending
# ?__theme=light to the URL and reloading when it is not already set.
js_light = """
function refresh() {
    const url = new URL(window.location);

    if (url.searchParams.get('__theme') !== 'light') {
        url.searchParams.set('__theme', 'light');
        window.location.href = url.href;
    }

}
"""
44
+
45
# Global stylesheet injected via gr.Blocks(css=...).
# NOTE(review): inside the string, the "#"-prefixed lines under
# #length-margin-radio are not valid CSS comments (CSS uses /* */),
# "font-decoration" is not a standard CSS property, and
# .markdown-text-tiny is defined twice (10pt then 12pt; the later rule
# wins) — confirm intent before cleaning up.
css = """



code {
font-size: large;
}
footer {visibility: hidden}
.top-left-LP{
    margin-top: 6px;
    margin-left: 5px;
}
.no_margin{
    margin-top: 0px;
    margin-left: 0px;
    margin-right: 0px;
    margin-bottom: 0px;
    padding-top: 0px;
    padding-left: 0px;
    padding-right: 0px;
    padding-bottom: 0px;
}
.markdown-text{font-size: 14pt}
.markdown-text-tiny{font-size: 10pt}
.markdown-text-small{font-size: 13pt}
.markdown-text-tiny{font-size: 12pt}
.markdown-text-tiny-red{
    font-size: 12pt;
    color: red;
    background-color: yellow;
    font-color: red;
    font-weight: bold;
}
th {
    text-align: center;
    font-size: 17px; /* Adjust the font size as needed */
}
td {
    font-size: 15px; /* Adjust the font size as needed */
    text-align: center;
}

.sample_button{
    border: 2px solid #000000;
    border-radius: 10px;
    padding: 10px;
    font-size: 17pt;
    font-weight: bold;
    margin: 5px;
    background-color: #D8BFD8;
}

.chat-common{
    height: auto;
    max-height: 400px;
    min-height: 100px;
}
.chat-specific{
    height: auto;
    max-height: 600px;
    min-height: 200px;
}
#od-benchmark-tab-table-button{
    font-size: 15pt;
    font-weight: bold;
}

.btn_boderline{
    border: 1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}

.btn_boderline_next{
    border: 0.1px solid #000000;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}

.btn_boderline_gray{
    border: 0.5px solid gray;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: italic;
}
.btn_boderline_selected{
    border: 2px solid purple;
    background-color: #f2f2f2;
    border-radius: 5px;
    padding: 5px;
    margin: 5px;
    font-size: 15pt;
    font-weight: bold;
}
.accordion-label button span{
    font-size: 14pt;
    font-weight: bold;
}

#show-task-categorized span{
    font-size: 13pt;
    font-weight: bold;
}

#show-open-source-models span{
    font-size: 13pt;
    font-weight: bold;
}

#select-models span{
    font-size: 10pt;
}

#select-tasks span{
    font-size: 10pt;
}


.markdown-text-details{
    margin: 10px;
    padding: 10px;
}


button.selected[role="tab"][aria-selected="true"] {
    font-size: 18px; /* or any other size you prefer */
    font-weight: bold;
}

#od-benchmark-tab-table-ablation-button {
    font-size: larger; /* Adjust the font size as needed */
}


.plotly-plot{
    height: auto;
    max-height: 600px;
    min-height: 600px;
}

#length-margin-radio{
    font-size: 10pt;
    # padding: 0px;
    # margin: 1px;
}

#show-task-categorized{
    font-size: 12pt;
    font-decoration: bold;
}

#show-open-source-models{
    font-size: 12pt;
    font-decoration: bold;
}

.box_md{
    border: 1px solid #000000;
    border-radius: 10px;
    padding: 10px;
    font-size: 12pt;
    margin: 5px;
}
"""
217
+
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ gradio[oauth]==4.19.2
2
+ datasets
3
+ tabulate
4
+ pandas
5
+ pytz