Spaces:
Running
Running
leaderboard
Browse files- .gitignore +2 -0
- README.md +16 -8
- ZeroEval-main/result_dirs/leaderboard.json +74 -0
- _header.md +6 -0
- app.py +114 -0
- constants.py +217 -0
- requirements.txt +3 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
*.pyc
|
2 |
+
*.DS_Store
|
README.md
CHANGED
@@ -1,12 +1,20 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version:
|
8 |
app_file: app.py
|
9 |
-
pinned:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: SUPER Leaderboard
|
3 |
+
emoji: 🤖
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 4.19.2
|
8 |
app_file: app.py
|
9 |
+
pinned: true
|
10 |
+
fullWidth: true
|
11 |
+
hf_oauth: true
|
12 |
+
api: false
|
13 |
+
tags:
|
14 |
+
- leaderboard
|
15 |
+
datasets:
|
16 |
+
- https://huggingface.co/datasets/allenai/super
|
17 |
+
models:
|
18 |
+
- meta-llama/Meta-Llama-3-70B-Instruct
|
19 |
+
- mistralai/Mixtral-8x22B-Instruct-v0.1
|
20 |
---
|
|
|
|
ZeroEval-main/result_dirs/leaderboard.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"Agent": "SWE-Agent",
|
4 |
+
"Base model": "gpt-4o-2024-08-06",
|
5 |
+
"Expert (Accuracy)": "16.3",
|
6 |
+
"Expert (Landmarks)": "36.8",
|
7 |
+
"Masked (Accuracy)": "46.1",
|
8 |
+
"Masked (Landmarks)": "74.9"
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"Agent": "React",
|
12 |
+
"Base model": "gpt-4o-2024-08-06",
|
13 |
+
"Expert (Accuracy)": "12.2",
|
14 |
+
"Expert (Landmarks)": "33.6",
|
15 |
+
"Masked (Accuracy)": "37.0",
|
16 |
+
"Masked (Landmarks)": "65.7"
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"Agent": "React-Super",
|
20 |
+
"Base model": "gpt-4o-2024-08-06",
|
21 |
+
"Expert (Accuracy)": "14.4",
|
22 |
+
"Expert (Landmarks)": "42.6",
|
23 |
+
"Masked (Accuracy)": "41.6",
|
24 |
+
"Masked (Landmarks)": "72.5"
|
25 |
+
},
|
26 |
+
{
|
27 |
+
"Agent": "SWE-Agent",
|
28 |
+
"Base model": "gpt-4o-mini-2024-07-18",
|
29 |
+
"Expert (Accuracy)": "3.3",
|
30 |
+
"Expert (Landmarks)": "16.1",
|
31 |
+
"Masked (Accuracy)": "27.0",
|
32 |
+
"Masked (Landmarks)": "51.8"
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"Agent": "React-Super",
|
36 |
+
"Base model": "gpt-4o-mini-2024-07-18",
|
37 |
+
"Expert (Accuracy)": "5.6",
|
38 |
+
"Expert (Landmarks)": "20.6",
|
39 |
+
"Masked (Accuracy)": "31.5",
|
40 |
+
"Masked (Landmarks)": "58.3"
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"Agent": "SWE-Agent",
|
44 |
+
"Base model": "Llama 3.1 70B",
|
45 |
+
"Expert (Accuracy)": "5.6",
|
46 |
+
"Expert (Landmarks)": "4.8",
|
47 |
+
"Masked (Accuracy)": "17.4",
|
48 |
+
"Masked (Landmarks)": "35.0"
|
49 |
+
},
|
50 |
+
{
|
51 |
+
"Agent": "React-Super",
|
52 |
+
"Base model": "Llama 3.1 70B",
|
53 |
+
"Expert (Accuracy)": "6.1",
|
54 |
+
"Expert (Landmarks)": "9.6",
|
55 |
+
"Masked (Accuracy)": "22.8",
|
56 |
+
"Masked (Landmarks)": "38.3"
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"Agent": "SWE-Agent",
|
60 |
+
"Base model": "Mixtral-8x22B-Instruct-v0.1",
|
61 |
+
"Expert (Accuracy)": "1.1",
|
62 |
+
"Expert (Landmarks)": "0.0",
|
63 |
+
"Masked (Accuracy)": "9.5",
|
64 |
+
"Masked (Landmarks)": "26.6"
|
65 |
+
},
|
66 |
+
{
|
67 |
+
"Agent": "React-Super",
|
68 |
+
"Base model": "Mixtral-8x22B-Instruct-v0.1",
|
69 |
+
"Expert (Accuracy)": "3.3",
|
70 |
+
"Expert (Landmarks)": "3.7",
|
71 |
+
"Masked (Accuracy)": "7.0",
|
72 |
+
"Masked (Landmarks)": "13.2"
|
73 |
+
}
|
74 |
+
]
|
_header.md
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<br/>
|
2 |
+
|
3 |
+
# SUPER: Evaluating Agents on Setting Up and Executing Tasks from Research Repositories
|
4 |
+
<!-- [📑 arxiv](https://arxiv.org/pdf/2409.07440) | -->
|
5 |
+
[💻 GitHub](https://github.com/allenai/super-benchmark) | [🤗 HuggingFace](https://huggingface.co/datasets/allenai/super) | Updated: **{LAST_UPDATED}**
|
6 |
+
|
app.py
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""A gradio app that renders a static leaderboard. This is used for Hugging Face Space."""
|
2 |
+
import argparse
|
3 |
+
import json
|
4 |
+
from datetime import datetime
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
import pandas as pd
|
8 |
+
import pytz
|
9 |
+
|
10 |
+
from constants import *
|
11 |
+
from constants import column_names
|
12 |
+
|
13 |
+
# get the last updated time from the elo_ranks.all.jsonl file
|
14 |
+
LAST_UPDATED = None
|
15 |
+
# with open("_intro.md", "r") as f:
|
16 |
+
# INTRO_MD = f.read()
|
17 |
+
INTRO_MD = ""
|
18 |
+
|
19 |
+
with open("_header.md", "r") as f:
|
20 |
+
HEADER_MD = f.read()
|
21 |
+
|
22 |
+
raw_data = None
|
23 |
+
original_df = None
|
24 |
+
|
25 |
+
|
26 |
+
def df_filters(mode_selection_radio, show_open_source_model_only):
|
27 |
+
global original_df
|
28 |
+
original_df.insert(0, "", range(1, 1 + len(original_df)))
|
29 |
+
return original_df.copy()
|
30 |
+
|
31 |
+
def _gstr(text):
    """Wrap *text* in an invisible gr.Text component (hidden string holder)."""
    hidden = gr.Text(text, visible=False)
    return hidden
|
33 |
+
|
34 |
+
def _tab_leaderboard():
    """Render the main leaderboard tab as a read-only Gradio Dataframe.

    Reads the global ``original_df`` (via ``df_filters``) that
    ``data_load`` populated at startup.
    """
    global original_df, available_models
    # The original wrapped this in a dead `if True:` block (leftover
    # scaffolding from a removed mode toggle) — removed for clarity.
    default_mode = "greedy"
    default_main_df = df_filters(default_mode, False)

    leaderboard_table = gr.components.Dataframe(
        value=default_main_df,
        datatype=["number", "markdown", "markdown", "number"],
        height=1000,
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
        column_widths=[50, 150, 150, 100, 120, 120, 100, 100, 110, 100],
        wrap=True,
    )
|
52 |
+
|
53 |
+
def _tab_submit():
    """Render the "Submit Your Results" tab: instructions only, no form."""
    instructions = """
Please create an issue on our [Github](https://github.com/allenai/super-benchmark) repository with output of trajectories of your model and results. We will update the leaderboard accordingly.
"""

    gr.Markdown("## 🚀 Submit Your Results\n\n" + instructions, elem_classes="markdown-text")
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
def build_demo():
    """Build the Gradio Blocks app: header plus leaderboard/submit tabs.

    Returns:
        gr.Blocks: The assembled (not yet launched) demo.
    """
    global original_df

    with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
        # Render timestamp in US/Pacific; note this local assignment shadows
        # the module-level LAST_UPDATED rather than updating it.
        LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
        header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
        gr.Markdown(header_md_text, elem_classes="markdown-text")

        with gr.Tabs(elem_classes="tab-buttons") as tabs:
            with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
                _tab_leaderboard()
            # NOTE(review): id jumps from 0 to 3 — presumably tabs 1-2 were
            # removed at some point; harmless but worth confirming.
            with gr.TabItem("🚀 Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
                _tab_submit()

    return demo
|
78 |
+
|
79 |
+
|
80 |
+
|
81 |
+
def data_load(result_file):
    """Load leaderboard results from *result_file* into module globals.

    Populates ``raw_data`` (list of row dicts with numeric strings converted
    to float) and ``original_df`` (DataFrame sorted by "Expert (Accuracy)"
    descending).

    Args:
        result_file: Path to a JSON file containing a list of result rows.
    """
    global raw_data, original_df
    print(f"Loading {result_file}")
    # Dropped unused locals from the original (column_names_main,
    # main_ordered_columns, click_url) — nothing read them.
    with open(result_file, "r") as f:
        raw_data = json.load(f)
    # Floatify each cell where possible so numeric columns sort numerically.
    for row in raw_data:
        for key, value in row.items():
            try:
                row[key] = float(value)
            except (TypeError, ValueError):
                # Non-numeric cells (agent/model names) stay as strings;
                # the bare `except:` here previously swallowed everything.
                pass
    original_df = pd.DataFrame(raw_data)

    original_df.sort_values(by="Expert (Accuracy)", ascending=False, inplace=True)
|
103 |
+
|
104 |
+
|
105 |
+
if __name__ == "__main__":
|
106 |
+
parser = argparse.ArgumentParser()
|
107 |
+
parser.add_argument("--share", action="store_true")
|
108 |
+
parser.add_argument("--result_file", help="Path to results table", default="ZeroEval-main/result_dirs/leaderboard.json")
|
109 |
+
|
110 |
+
args = parser.parse_args()
|
111 |
+
data_load(args.result_file)
|
112 |
+
|
113 |
+
demo = build_demo()
|
114 |
+
demo.launch(share=args.share, height=3000, width="100%")
|
constants.py
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
from collections import OrderedDict

# Presumably a sentinel for "unbounded K" in sampling displays — unused in
# the visible code; TODO confirm it is still needed.
DEFAULT_K = "∞"


# NOTE(review): these columns (Puzzle Acc, Cell Acc, ...) do not appear in
# leaderboard.json or in ORDERED_COLUMN_NAMES below — they look inherited
# from a different (puzzle) leaderboard; confirm whether still needed.
column_names = OrderedDict({
    "Model": "Model",
    "Mode": "Mode",
    "Puzzle Acc": "Puzzle Acc",
    "Cell Acc": "Cell Acc",
    "No answer": "No answer",
    "Easy Puzzle Acc": "Easy Puzzle Acc",
    "Hard Puzzle Acc": "Hard Puzzle Acc",
})


# Extra remarks shown with the main leaderboard (currently empty).
LEADERBOARD_REMARKS_MAIN = """
"""

# NOTE(review): app.py actually sorts by "Expert (Accuracy)", not this.
RANKING_COLUMN = "Puzzle Acc"

# Display order of leaderboard columns; matches keys in leaderboard.json.
ORDERED_COLUMN_NAMES = [
    "Agent",
    "Base model",
    "Expert (Accuracy)",
    "Expert (Landmarks)",
    "Masked (Accuracy)",
    "Masked (Landmarks)"
]
|
31 |
+
|
32 |
+
|
33 |
+
js_light = """
|
34 |
+
function refresh() {
|
35 |
+
const url = new URL(window.location);
|
36 |
+
|
37 |
+
if (url.searchParams.get('__theme') !== 'light') {
|
38 |
+
url.searchParams.set('__theme', 'light');
|
39 |
+
window.location.href = url.href;
|
40 |
+
}
|
41 |
+
|
42 |
+
}
|
43 |
+
"""
|
44 |
+
|
45 |
+
css = """
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
code {
|
50 |
+
font-size: large;
|
51 |
+
}
|
52 |
+
footer {visibility: hidden}
|
53 |
+
.top-left-LP{
|
54 |
+
margin-top: 6px;
|
55 |
+
margin-left: 5px;
|
56 |
+
}
|
57 |
+
.no_margin{
|
58 |
+
margin-top: 0px;
|
59 |
+
margin-left: 0px;
|
60 |
+
margin-right: 0px;
|
61 |
+
margin-bottom: 0px;
|
62 |
+
padding-top: 0px;
|
63 |
+
padding-left: 0px;
|
64 |
+
padding-right: 0px;
|
65 |
+
padding-bottom: 0px;
|
66 |
+
}
|
67 |
+
.markdown-text{font-size: 14pt}
|
68 |
+
.markdown-text-tiny{font-size: 10pt}
|
69 |
+
.markdown-text-small{font-size: 13pt}
|
70 |
+
.markdown-text-tiny{font-size: 12pt}
|
71 |
+
.markdown-text-tiny-red{
|
72 |
+
font-size: 12pt;
|
73 |
+
color: red;
|
74 |
+
background-color: yellow;
|
75 |
+
font-color: red;
|
76 |
+
font-weight: bold;
|
77 |
+
}
|
78 |
+
th {
|
79 |
+
text-align: center;
|
80 |
+
font-size: 17px; /* Adjust the font size as needed */
|
81 |
+
}
|
82 |
+
td {
|
83 |
+
font-size: 15px; /* Adjust the font size as needed */
|
84 |
+
text-align: center;
|
85 |
+
}
|
86 |
+
|
87 |
+
.sample_button{
|
88 |
+
border: 2px solid #000000;
|
89 |
+
border-radius: 10px;
|
90 |
+
padding: 10px;
|
91 |
+
font-size: 17pt;
|
92 |
+
font-weight: bold;
|
93 |
+
margin: 5px;
|
94 |
+
background-color: #D8BFD8;
|
95 |
+
}
|
96 |
+
|
97 |
+
.chat-common{
|
98 |
+
height: auto;
|
99 |
+
max-height: 400px;
|
100 |
+
min-height: 100px;
|
101 |
+
}
|
102 |
+
.chat-specific{
|
103 |
+
height: auto;
|
104 |
+
max-height: 600px;
|
105 |
+
min-height: 200px;
|
106 |
+
}
|
107 |
+
#od-benchmark-tab-table-button{
|
108 |
+
font-size: 15pt;
|
109 |
+
font-weight: bold;
|
110 |
+
}
|
111 |
+
|
112 |
+
.btn_boderline{
|
113 |
+
border: 1px solid #000000;
|
114 |
+
border-radius: 5px;
|
115 |
+
padding: 5px;
|
116 |
+
margin: 5px;
|
117 |
+
font-size: 15pt;
|
118 |
+
font-weight: bold;
|
119 |
+
}
|
120 |
+
|
121 |
+
.btn_boderline_next{
|
122 |
+
border: 0.1px solid #000000;
|
123 |
+
border-radius: 5px;
|
124 |
+
padding: 5px;
|
125 |
+
margin: 5px;
|
126 |
+
font-size: 15pt;
|
127 |
+
font-weight: bold;
|
128 |
+
}
|
129 |
+
|
130 |
+
.btn_boderline_gray{
|
131 |
+
border: 0.5px solid gray;
|
132 |
+
border-radius: 5px;
|
133 |
+
padding: 5px;
|
134 |
+
margin: 5px;
|
135 |
+
font-size: 15pt;
|
136 |
+
font-weight: italic;
|
137 |
+
}
|
138 |
+
.btn_boderline_selected{
|
139 |
+
border: 2px solid purple;
|
140 |
+
background-color: #f2f2f2;
|
141 |
+
border-radius: 5px;
|
142 |
+
padding: 5px;
|
143 |
+
margin: 5px;
|
144 |
+
font-size: 15pt;
|
145 |
+
font-weight: bold;
|
146 |
+
}
|
147 |
+
.accordion-label button span{
|
148 |
+
font-size: 14pt;
|
149 |
+
font-weight: bold;
|
150 |
+
}
|
151 |
+
|
152 |
+
#show-task-categorized span{
|
153 |
+
font-size: 13pt;
|
154 |
+
font-weight: bold;
|
155 |
+
}
|
156 |
+
|
157 |
+
#show-open-source-models span{
|
158 |
+
font-size: 13pt;
|
159 |
+
font-weight: bold;
|
160 |
+
}
|
161 |
+
|
162 |
+
#select-models span{
|
163 |
+
font-size: 10pt;
|
164 |
+
}
|
165 |
+
|
166 |
+
#select-tasks span{
|
167 |
+
font-size: 10pt;
|
168 |
+
}
|
169 |
+
|
170 |
+
|
171 |
+
.markdown-text-details{
|
172 |
+
margin: 10px;
|
173 |
+
padding: 10px;
|
174 |
+
}
|
175 |
+
|
176 |
+
|
177 |
+
button.selected[role="tab"][aria-selected="true"] {
|
178 |
+
font-size: 18px; /* or any other size you prefer */
|
179 |
+
font-weight: bold;
|
180 |
+
}
|
181 |
+
|
182 |
+
#od-benchmark-tab-table-ablation-button {
|
183 |
+
font-size: larger; /* Adjust the font size as needed */
|
184 |
+
}
|
185 |
+
|
186 |
+
|
187 |
+
.plotly-plot{
|
188 |
+
height: auto;
|
189 |
+
max-height: 600px;
|
190 |
+
min-height: 600px;
|
191 |
+
}
|
192 |
+
|
193 |
+
#length-margin-radio{
|
194 |
+
font-size: 10pt;
|
195 |
+
# padding: 0px;
|
196 |
+
# margin: 1px;
|
197 |
+
}
|
198 |
+
|
199 |
+
#show-task-categorized{
|
200 |
+
font-size: 12pt;
|
201 |
+
font-decoration: bold;
|
202 |
+
}
|
203 |
+
|
204 |
+
#show-open-source-models{
|
205 |
+
font-size: 12pt;
|
206 |
+
font-decoration: bold;
|
207 |
+
}
|
208 |
+
|
209 |
+
.box_md{
|
210 |
+
border: 1px solid #000000;
|
211 |
+
border-radius: 10px;
|
212 |
+
padding: 10px;
|
213 |
+
font-size: 12pt;
|
214 |
+
margin: 5px;
|
215 |
+
}
|
216 |
+
"""
|
217 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
gradio[oauth]==4.19.2
|
2 |
+
datasets
|
3 |
+
tabulate
|