leaderboard

Running on CPU Upgrade

App Files Files Community

nan committed on May 10, 2024

Commit

36c5a0c

1 Parent(s): f30cbcc

feat: implement the submission part

Browse files

Files changed (4) hide show

app.py +53 -9
src/about.py +5 -20
src/populate.py +54 -32
utils.py +16 -0

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
@@ -13,13 +14,14 @@ from src.display.utils import (
     LONG_DOC_BENCHMARK_COLS,
     COLS_QA,
     COLS_LONG_DOC,
     TYPES,
     AutoEvalColumnQA,
     fields
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_leaderboard_df
-from utils import update_table, update_metric, update_table_long_doc
 from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
@@ -75,11 +77,11 @@ def update_metric_long_doc(
     return update_metric(raw_data_qa, 'long_doc', metric, domains, langs, reranking_model, query)
-# (
-#     finished_eval_queue_df,
-#     running_eval_queue_df,
-#     pending_eval_queue_df,
-# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 demo = gr.Blocks(css=custom_css)
@@ -305,8 +307,50 @@ with demo:
                 queue=True
             )
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)

     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
+    EVALUATION_QUEUE_TEXT
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     LONG_DOC_BENCHMARK_COLS,
     COLS_QA,
     COLS_LONG_DOC,
+    EVAL_COLS,
     TYPES,
     AutoEvalColumnQA,
     fields
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_leaderboard_df, get_evaluation_queue_df
+from utils import update_table, update_metric, update_table_long_doc, upload_file
 from src.benchmarks import DOMAIN_COLS_QA, LANG_COLS_QA, DOMAIN_COLS_LONG_DOC, LANG_COLS_LONG_DOC, metric_list
     return update_metric(raw_data_qa, 'long_doc', metric, domains, langs, reranking_model, query)
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 demo = gr.Blocks(css=custom_css)
                 queue=True
             )
+        with gr.TabItem("🚀Submit here!", elem_id="submit-tab-table", id=2):
+            with gr.Column():
+                with gr.Row():
+                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                with gr.Row():
+                    with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
+                        with gr.Row():
+                            finished_eval_table = gr.components.Dataframe(
+                                value=finished_eval_queue_df,
+                                row_count=5,
+                            )
+                with gr.Row():
+                    with gr.Accordion(
+                            f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                            open=False,
+                    ):
+                        with gr.Row():
+                            running_eval_table = gr.components.Dataframe(
+                                value=running_eval_queue_df,
+                                row_count=5,
+                            )
+                with gr.Row():
+                    with gr.Accordion(
+                            f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                            open=False,
+                    ):
+                        with gr.Row():
+                            pending_eval_table = gr.components.Dataframe(
+                                value=pending_eval_queue_df,
+                                row_count=5,
+                            )
+                with gr.Row():
+                    gr.Markdown("## ✉️Submit your model here!", elem_classes="markdown-text")
+                # with gr.Row():
+                #     with gr.Column():
+                #         model_name_textbox = gr.Textbox(label="Model name")
+                #     with gr.Column():
+                #         model_url = gr.Textbox(label="Model URL")
+                    file_output = gr.File()
+                    upload_button = gr.UploadButton("Click to submit evaluation", file_count="multiple")
+                    upload_button.upload(upload_file, upload_button, file_output)
+        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
+        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)

src/about.py CHANGED Viewed

@@ -57,26 +57,11 @@ To reproduce our results, here is the commands you can run:
 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.

 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
+### 1)
+### 2)
+### 3)
+### 4)
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.

src/populate.py CHANGED Viewed

@@ -38,35 +38,57 @@ def get_leaderboard_df(raw_data: List[FullEvalResult], cols: list, benchmark_col
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requests"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
-    all_evals = []
-    for entry in entries:
-        if ".json" in entry:
-            file_path = os.path.join(save_path, entry)
-            with open(file_path) as fp:
-                data = json.load(fp)
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-            all_evals.append(data)
-        elif ".md" not in entry:
-            # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
-    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
-    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
-    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
-    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
-    df_running = pd.DataFrame.from_records(running_list, columns=cols)
-    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]

 def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """Return placeholder tables for the three evaluation queues.

     The frames are hard-coded sample data; nothing is read from disk yet.

     Args:
         save_path: Directory that holds the evaluation request files.
             Currently unused.
         cols: Columns the caller wants to display. Currently unused; every
             frame has the fixed columns "Retrieval Model", "Submitted Time"
             and "Status".

     Returns:
         A ``(finished, running, pending)`` tuple of DataFrames, in that order.
     """
     # TODO: restore the real implementation: walk `save_path` (request JSON
     # files, including one level of sub-folders), bucket the records by their
     # "status" field (PENDING/RERUN -> pending, RUNNING -> running,
     # FINISHED*/PENDING_NEW_EVAL -> finished) and restrict each frame to `cols`.
     sample_rows = {
         "Retrieval Model": ["bge-m3", "jina-embeddings-v2"],
         "Submitted Time": ["2024-05-01 12:34:20", "2024-05-02 12:34:20"],
     }
     row_count = len(sample_rows["Retrieval Model"])

     def _placeholder(status: str) -> pd.DataFrame:
         # Same sample submissions in every queue; only the status differs.
         return pd.DataFrame({**sample_rows, "Status": [status] * row_count})

     return _placeholder("FINISHED"), _placeholder("RUNNING"), _placeholder("PENDING")

utils.py CHANGED Viewed

@@ -1,4 +1,9 @@
 import pandas as pd
 from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS_QA, COLS_LONG_DOC, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
 from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
@@ -124,3 +129,14 @@ def update_metric(
             reranking_model,
             query
         )

 import pandas as pd
+import os
+from src.display.formatting import styled_error, styled_message, styled_warning
+from huggingface_hub import HfApi
 from src.display.utils import AutoEvalColumnQA, AutoEvalColumnLongDoc, COLS_QA, COLS_LONG_DOC, QA_BENCHMARK_COLS, LONG_DOC_BENCHMARK_COLS
 from src.benchmarks import BENCHMARK_COLS_QA, BENCHMARK_COLS_LONG_DOC, BenchmarksQA, BenchmarksLongDoc
             reranking_model,
             query
         )
def upload_file(files):
    """Return the local paths of the files handed over by an upload event.

    The actual upload is not implemented yet: the paths are only printed and
    returned unchanged.

    Args:
        files: What the upload event delivered. May be ``None``, a single
            item, or a list/tuple of items. Each item is either a path
            (``str`` / ``os.PathLike``) or an object exposing its path as a
            ``name`` attribute.

    Returns:
        A list with one path per received file; empty when nothing was
        received.
    """
    import os  # local import keeps this function self-contained

    if files is None:
        return []
    # A single-file upload delivers one item rather than a list.
    if not isinstance(files, (list, tuple)):
        files = [files]
    file_paths = [
        os.fspath(item) if isinstance(item, (str, os.PathLike)) else item.name
        for item in files
    ]
    print(f"file uploaded: {file_paths}")
    # TODO: push every path to the Hub (HfApi.upload_file) and remove the
    # local copy afterwards; read the token from configuration instead of
    # hard-coding it.
    return file_paths