| """A gradio app that renders a static leaderboard. This is used for Hugging Face Space.""" | |
| import ast | |
| import argparse | |
| import glob | |
| import pickle | |
| import plotly | |
| import gradio as gr | |
| import numpy as np | |
| import pandas as pd | |
| import gradio as gr | |
| import pandas as pd | |
| from pathlib import Path | |
| import json | |
| from constants import * | |
| from datetime import datetime, timezone | |
| # from datasets import Dataset, load_dataset, concatenate_datasets | |
| import os, uuid | |
| from utils_display import model_info | |
| from constants import column_names, LEADERBOARD_REMARKS, DEFAULT_K, LEADERBOARD_REMARKS_MAIN | |
| import pytz | |
| from data_utils import post_processing, get_random_item | |
# LAST_UPDATED is filled in with the current US/Pacific time when the demo is built (see build_demo).
LAST_UPDATED = None
| # with open("_intro.md", "r") as f: | |
| # INTRO_MD = f.read() | |
| INTRO_MD = "" | |
| with open("_about_us.md", "r") as f: | |
| ABOUT_MD = f.read() | |
| with open("_header.md", "r") as f: | |
| HEADER_MD = f.read() | |
| with open("_metrics.md", "r") as f: | |
| METRICS_MD = f.read() | |
| raw_data = None | |
| original_df = None | |
| # available_models = [] # to be filled in later | |
| available_models = list(model_info.keys()) | |
def df_filters(mode_selection_radio, show_open_source_model_only):
    """Filter the global leaderboard DataFrame by decoding mode and prepend a rank column."""
    global original_df
    # remove the rows where the model name contains the special marker character "โ"
    original_df = original_df[~original_df["Model"].str.contains("โ")]
    modes = {
        "greedy": ["greedy"],
        "sampling (Temp=0.5)": ["sampling"],
        "all": ["greedy", "sampling"],
    }
    # filter the df by the selected decoding mode
    default_main_df = original_df[original_df["Mode"].isin(modes[mode_selection_radio])].copy()
    default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
    return default_main_df
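# For illustration: calling `df_filters("greedy", False)` directly returns the greedy-only
# rows of `original_df` with a 1-based rank column inserted at the front; the same function
# is also wired up below as the change callback for the mode radio button.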
def _gstr(text):
    return gr.Text(text, visible=False)
def _tab_leaderboard():
    """Build the Leaderboard tab: a decoding-mode selector and the main results table."""
    global original_df, available_models
    # with gr.TabItem("Main", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
    if True:
        default_main_df = original_df.copy()
        # default_main_df.insert(0, "", range(1, 1 + len(default_main_df)))
        # default_main_df_no_task = default_main_df.copy()
        default_mode = "greedy"
        default_main_df = df_filters(default_mode, False)
        with gr.Row():
            with gr.Column(scale=5):
                mode_selection_radio = gr.Radio(["greedy", "all"], show_label=False, elem_id="rank-column-radio", value=default_mode)
        # with gr.Row():
        #     with gr.Column(scale=2):
        leaderboard_table = gr.components.Dataframe(
            value=default_main_df,
            datatype=["number", "markdown", "markdown", "number"],
            # max_rows=None,
            height=6000,
            elem_id="leaderboard-table",
            interactive=False,
            visible=True,
            column_widths=[50, 260, 100, 100, 120, 120, 100, 100, 110, 100],
            wrap=True,
            # min_width=60,
        )
        # checkbox_show_task_categorized.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
        # show_open_source_model_only.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
        # rank_column_radio.change(fn=length_margin_change, inputs=[length_margin_choices, gr.Text("main", visible=False), checkbox_show_task_categorized, show_open_source_model_only, rank_column_radio], outputs=[leaderboard_table])
        mode_selection_radio.change(fn=df_filters, inputs=[mode_selection_radio, _gstr("")], outputs=[leaderboard_table])
def sample_explore_item(model_name, size_H, size_W):
    """Sample one puzzle/response pair and format it as markdown for the Explore tab."""
    # print(model_name, size_H, size_W)
    explore_item = get_random_item(model_name, size_H, size_W)
    if explore_item is None:
        return "No item found", "No item found", "No item found", "No item found", "No item found"
    model_name = explore_item['Model']
    example_id = explore_item['id']
    puzzle_md = f"### 🦓 Puzzle [{example_id}]:\n\n" + explore_item['puzzle'].replace("## Clues:", "### **Clues:**").replace("\n", "<br>")
    model_reasoning_md = f"### Reasoning of {model_name}:\n\n {explore_item['reasoning']}"
    model_prediction_md = f"### Answer of {model_name}:\n\n**Json format:** {str(explore_item['solution']).replace('___', 'null')}" + \
        "\n\n**Table format:**\n" + explore_item['solution_table_md']
    puzzle_solved = explore_item['correct_cells'] == explore_item['total_cells']
    cell_acc = explore_item["correct_cells"] / explore_item["total_cells"] * 100
    model_eval_md = f"### Evaluation:\n\n **Total Cells**: {explore_item['total_cells']} | **Correct Cells**: {explore_item['correct_cells']} | **Puzzle solved**: {puzzle_solved} | **Cell Acc**: {cell_acc:.2f}%"
    truth_solution_md = f"### Truth Solution:\n\n{explore_item['truth_solution_table']}"
    return puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md, truth_solution_md
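# Note: the five returned strings must stay in the same order as the `outputs` list
# wired to `explore_button.click` in `_tab_explore` below.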
def _tab_explore():
    """Build the Explore tab: pick a model and puzzle size, then sample an example."""
    global raw_data
    model_names = [item["Model"] for item in raw_data]
    # deduplicate and preserve the order
    model_names = list(dict.fromkeys(model_names))
    with gr.Row():
        model_selection = gr.Dropdown(choices=["random"] + model_names, label="Model: ", elem_id="select-models", value="random", interactive=True)
        size_H_selection = gr.Dropdown(choices=["random"] + [f"{i}" for i in range(2, 7)], label="Num of Houses", elem_id="select-H", value="random", interactive=True)
        size_W_selection = gr.Dropdown(choices=["random"] + [f"{i}" for i in range(2, 7)], label="Num of Features", elem_id="select-W", value="random", interactive=True)
        with gr.Column(scale=1):
            # greedy_or_sample = gr.Radio(["greedy", "sampling"], show_label=False, elem_id="greedy-or-sample", value="greedy", interactive=True)
            gr.Markdown("### Click below to sample a puzzle. ⬇️")
            explore_button = gr.Button("🦓 Sample a Zebra Puzzle!", elem_id="explore-button")
    puzzle_md = gr.Markdown("### 🦓 Puzzle: \n\nTo be loaded", elem_id="puzzle-md", elem_classes="box_md")
    model_reasoning_md = gr.Markdown("### Reasoning: \n\nTo be loaded", elem_id="model-reasoning-md", elem_classes="box_md")
    model_prediction_md = gr.Markdown("### Answer: \n\nTo be loaded", elem_id="model-prediction-md", elem_classes="box_md")
    truth_solution_md = gr.Markdown("### Truth Solution: \n\nTo be loaded", elem_id="truth-solution-md", elem_classes="box_md")
    model_eval_md = gr.Markdown("### Evaluation: \n\nTo be loaded", elem_id="model-eval-md", elem_classes="box_md")
    explore_button.click(fn=sample_explore_item,
                         inputs=[model_selection, size_H_selection, size_W_selection],
                         outputs=[puzzle_md, model_reasoning_md, model_prediction_md, model_eval_md, truth_solution_md])
def _tab_submit():
    markdown_text = """
Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to tell us about your model. We can then test it for you and report the results here on the leaderboard.
If you would like to run local testing, please read our evaluation code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py)
and apply for access to the [private dataset](https://huggingface.co/datasets/WildEval/ZebraLogic) that contains the ground-truth solutions.
"""
    gr.Markdown("## Submit Your Results\n\n" + markdown_text, elem_classes="markdown-text")
def build_demo():
    global original_df, available_models
    with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
        gr.HTML(BANNER, elem_id="banner")
        # stamp the page with the current US/Pacific time as the last-updated time
        LAST_UPDATED = datetime.now(pytz.timezone('US/Pacific')).strftime("%Y-%m-%d %H:%M:%S")
        header_md_text = HEADER_MD.replace("{LAST_UPDATED}", str(LAST_UPDATED))
        gr.Markdown(header_md_text, elem_classes="markdown-text")
        with gr.Tabs(elem_classes="tab-buttons") as tabs:
            with gr.TabItem("Leaderboard", elem_id="od-benchmark-tab-table", id=0):
                _tab_leaderboard()
            with gr.TabItem("Explore", elem_id="od-benchmark-tab-table", id=1):
                _tab_explore()
            with gr.TabItem("Submit Your Results", elem_id="od-benchmark-tab-table", id=3):
                _tab_submit()
            with gr.TabItem("About Us", elem_id="od-benchmark-tab-table", id=4):
                gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
        with gr.Row():
            with gr.Accordion("Citation", open=False, elem_classes="accordion-label"):
                gr.Textbox(
                    value=CITATION_TEXT,
                    lines=7,
                    label="Copy the BibTeX snippet to cite this source",
                    elem_id="citation-button",
                    show_copy_button=True)
                # ).style(show_copy_button=True)
    return demo
def data_load(result_file):
    """Load the summary JSON into `raw_data` and build the leaderboard DataFrame."""
    global raw_data, original_df
    print(f"Loading {result_file}")
    column_names_main = column_names.copy()
    # column_names_main.update({})
    main_ordered_columns = ORDERED_COLUMN_NAMES
    click_url = True
    # read the json records from the result_file
    with open(result_file, "r") as f:
        raw_data = json.load(f)
    # convert numeric-looking values to floats, where possible
    for d in raw_data:
        for k, v in d.items():
            try:
                d[k] = float(v)
            except (TypeError, ValueError):
                pass
    original_df = pd.DataFrame(raw_data)
    # keep only models evaluated on the full set of 1000 puzzles
    original_df = original_df[original_df["Total Puzzles"] == 1000]
    original_df = post_processing(original_df, column_names_main, ordered_columns=main_ordered_columns, click_url=click_url, rank_column=RANKING_COLUMN)
    # print(original_df.columns)
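# For reference (an assumption inferred from how the fields are used in this file, not a
# documented schema): each record in the summary JSON is expected to carry at least
# "Model", "Mode", and "Total Puzzles", plus the metric columns that `column_names` /
# ORDERED_COLUMN_NAMES from constants.py map onto the leaderboard table.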
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action="store_true")
    parser.add_argument("--result_file", help="Path to results table", default="ZeroEval-main/result_dirs/zebra-grid.summary.json")
    args = parser.parse_args()

    data_load(args.result_file)
    print(original_df)
    demo = build_demo()
    demo.launch(share=args.share, height=3000, width="100%")
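# Example invocation (assuming this script is saved as app.py and the default
# summary file exists locally):
#   python app.py --result_file ZeroEval-main/result_dirs/zebra-grid.summary.json --share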