Update about page

- app.py +1 -1
- src/about.py +39 -7
- src/display/css_html_js.py +1 -0
app.py CHANGED

@@ -85,7 +85,7 @@ with demo:
         with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF, "agentic")
 
-        with gr.TabItem("
+        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
 
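For context, the tab added here follows Gradio's standard Blocks/Tabs pattern: a `TabItem` labelled "About" whose body renders `LLM_BENCHMARKS_TEXT` as Markdown. Below is a minimal, self-contained sketch of that wiring; the placeholder text and the simplified neighbouring tab are illustrative stand-ins, not the repo's actual code.

```python
import gradio as gr

# Illustrative stand-in for the constant defined in src/about.py.
LLM_BENCHMARKS_TEXT = "# About\nBenchmark descriptions go here."

demo = gr.Blocks()
with demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
            gr.Markdown("The agentic leaderboard table would be initialized here.")

        # New tab from this commit: renders the about text as Markdown.
        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```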
src/about.py CHANGED

@@ -53,16 +53,48 @@ This leaderboard presents the performance of selected LLM models on a set of tasks
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-
-The following benchmarks are included:
+# Vector State of Evaluation Leaderboard
 
-
+## Overview
+The **Vector State of Evaluation Leaderboard** presents the performance of selected LLM models on a variety of tasks. These tasks are divided into two categories:
 
-
+- **Base Tasks**: ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond, MMMU-Multiple-Choice, MMMU-Open-Ended
+- **Agentic Tasks**: GAIA, GDM-InterCode-CTF
 
-
-
-
+Users can compare models side by side to see how they perform on both base-level understanding tasks and more advanced, “agentic” tasks.
+
+## Vector Institute
+The **Vector Institute** is dedicated to advancing the fields of artificial intelligence and machine learning through cutting-edge research, collaborative projects, and open-source contributions. This leaderboard is part of Vector’s broader effort to promote transparency and progress in AI research.
+
+## Models
+We evaluate a variety of **Large Language Models (LLMs)** across the included benchmarks. Each model:
+- Is tested on the same set of tasks.
+- Is evaluated with standardized prompts and methodologies.
+- Generates performance metrics (accuracy, F1, etc.) for comparison.
+
+Our goal is to provide clear, reproducible metrics that shed light on how each model handles different task complexities and reasoning requirements.
+
+## Benchmarks
+Here is a closer look at each benchmark included in the leaderboard:
+
+### Base Benchmarks
+- **ARC-Easy / ARC-Challenge**: A set of multiple-choice science questions designed to measure a model’s scientific and commonsense reasoning.
+- **DROP**: A reading comprehension benchmark emphasizing discrete reasoning steps.
+- **WinoGrande**: A commonsense reasoning challenge focused on co-reference resolution.
+- **GSM8K**: Grade-school math word problems testing arithmetic and multi-step reasoning.
+- **HellaSwag**: A commonsense inference task centered on action completion.
+- **HumanEval**: Evaluates code generation and reasoning in a programming context.
+- **IFEval**: An instruction-following benchmark that tests whether models comply with verifiable formatting and content constraints.
+- **MATH**: High-school competition math problems requiring detailed, multi-step solutions.
+- **MMLU / MMLU-Pro**: Multi-subject multiple-choice tests covering advanced high school and collegiate-level knowledge.
+- **GPQA-Diamond**: Graduate-level science questions designed to require deep reasoning rather than simple retrieval.
+- **MMMU (Multiple-Choice / Open-Ended)**: A multi-discipline, multimodal suite testing both structured and open-form responses.
+
+### Agentic Benchmarks
+- **GAIA**: Evaluates more autonomous, “agentic” behavior, including planning, tool use, and multi-step problem-solving.
+- **GDM-InterCode-CTF**: A capture-the-flag style challenge in which models solve security tasks by writing and executing code interactively.
+
+---
 """
 
 EVALUATION_QUEUE_TEXT = """
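One implementation note on the block above: `LLM_BENCHMARKS_TEXT` is declared as an f-string, so any literal braces added to this Markdown later would have to be doubled. The sketch below illustrates the pitfall with a hypothetical interpolated value; the `BENCHMARK_COUNT` name is not from the repo.

```python
# Hypothetical example only: shows why the f-string prefix matters for this constant.
BENCHMARK_COUNT = 16  # 14 base tasks + 2 agentic tasks, per the lists above

LLM_BENCHMARKS_TEXT = f"""
# Vector State of Evaluation Leaderboard

This leaderboard tracks {BENCHMARK_COUNT} benchmarks.
Literal braces must be escaped by doubling: {{ and }}.
"""

print(LLM_BENCHMARKS_TEXT)
```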
src/display/css_html_js.py CHANGED

@@ -94,6 +94,7 @@ custom_css = """
 #box-filter > .form{
     border: 0
 }
+
 """
 
 get_window_url_params = """
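For context, `custom_css` (the string touched above) is a module-level stylesheet that is typically passed to the Blocks constructor, while `get_window_url_params` is a JavaScript snippet used elsewhere in the app. A minimal sketch of the CSS wiring, assuming `src/` is importable as a package:

```python
import gradio as gr

from src.display.css_html_js import custom_css  # the string edited in this commit

# The custom CSS is injected once, when the Blocks app is constructed.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.Markdown("Leaderboard UI goes here.")

if __name__ == "__main__":
    demo.launch()
```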