Update about page

- app.py +1 -1
- src/about.py +39 -7
- src/display/css_html_js.py +1 -0
app.py CHANGED

@@ -85,7 +85,7 @@ with demo:
         with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF, "agentic")
 
-        with gr.TabItem("
+        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
 
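For context, the tab added here follows Gradio's standard Blocks/Tabs pattern: a `TabItem` labelled "About" whose body renders `LLM_BENCHMARKS_TEXT` as Markdown. Below is a minimal, self-contained sketch of that wiring; the placeholder text and the simplified neighbouring tab are illustrative stand-ins, not the repo's actual code.

```python
import gradio as gr

# Illustrative stand-in for the constant defined in src/about.py.
LLM_BENCHMARKS_TEXT = "# About\nBenchmark descriptions go here."

demo = gr.Blocks()
with demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
            gr.Markdown("The agentic leaderboard table would be initialized here.")

        # New tab from this commit: renders the about text as Markdown.
        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    demo.launch()
```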
src/about.py CHANGED

@@ -53,16 +53,48 @@ This leaderboard presents the performance of selected LLM models on a set of tasks
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
-
-The following benchmarks are included:
+# Vector State of Evaluation Leaderboard
 
-
+## Overview
+The **Vector State of Evaluation Leaderboard** presents the performance of selected LLM models on a variety of tasks. These tasks are divided into two categories:
 
-
+- **Base Tasks**: ARC-Easy, ARC-Challenge, DROP, WinoGrande, GSM8K, HellaSwag, HumanEval, IFEval, MATH, MMLU, MMLU-Pro, GPQA-Diamond, MMMU-Multiple-Choice, MMMU-Open-Ended
+- **Agentic Tasks**: GAIA, GDM-InterCode-CTF
 
-
-
-
+Users can compare models side by side to see how they perform on both base-level understanding tasks and more advanced, “agentic” tasks.
+
+## Vector Institute
+The **Vector Institute** is dedicated to advancing the fields of artificial intelligence and machine learning through cutting-edge research, collaborative projects, and open-source contributions. This leaderboard is part of Vector’s broader effort to promote transparency and progress in AI research.
+
+## Models
+We evaluate a variety of **Large Language Models (LLMs)** across the included benchmarks. Each model:
+- Is tested on the same set of tasks.
+- Is evaluated with standardized prompts and methodologies.
+- Generates performance metrics (accuracy, F1, etc.) for comparison.
+
+Our goal is to provide clear, reproducible metrics that shed light on how each model handles different task complexities and reasoning requirements.
+
+## Benchmarks
+Here is a closer look at each benchmark included in the leaderboard:
+
+### Base Benchmarks
+- **ARC-Easy / ARC-Challenge**: A set of multiple-choice science questions designed to measure a model’s scientific and commonsense reasoning.
+- **DROP**: A reading comprehension benchmark emphasizing discrete reasoning steps.
+- **WinoGrande**: A commonsense reasoning challenge focused on co-reference resolution.
+- **GSM8K**: Grade-school math word problems testing arithmetic and multi-step reasoning.
+- **HellaSwag**: A commonsense inference task centered on action completion.
+- **HumanEval**: Evaluates code generation and reasoning in a programming context.
+- **IFEval**: An instruction-following benchmark that tests whether models comply with verifiable formatting and content constraints.
+- **MATH**: High-school competition math problems requiring detailed, multi-step solutions.
+- **MMLU / MMLU-Pro**: Multi-subject multiple-choice tests covering advanced high school and collegiate-level knowledge.
+- **GPQA-Diamond**: Graduate-level science questions designed to require deep reasoning rather than simple retrieval.
+- **MMMU (Multiple-Choice / Open-Ended)**: A multi-discipline, multimodal suite testing both structured and open-form responses.
+
+### Agentic Benchmarks
+- **GAIA**: Evaluates more autonomous, “agentic” behavior, including planning, tool use, and multi-step problem-solving.
+- **GDM-InterCode-CTF**: A capture-the-flag style challenge in which models solve security tasks by writing and executing code interactively.
+
+---
 """
 
 EVALUATION_QUEUE_TEXT = """
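One implementation note on the block above: `LLM_BENCHMARKS_TEXT` is declared as an f-string, so any literal braces added to this Markdown later would have to be doubled. The sketch below illustrates the pitfall with a hypothetical interpolated value; the `BENCHMARK_COUNT` name is not from the repo.

```python
# Hypothetical example only: shows why the f-string prefix matters for this constant.
BENCHMARK_COUNT = 16  # 14 base tasks + 2 agentic tasks, per the lists above

LLM_BENCHMARKS_TEXT = f"""
# Vector State of Evaluation Leaderboard

This leaderboard tracks {BENCHMARK_COUNT} benchmarks.
Literal braces must be escaped by doubling: {{ and }}.
"""

print(LLM_BENCHMARKS_TEXT)
```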
src/display/css_html_js.py CHANGED

@@ -94,6 +94,7 @@ custom_css = """
 #box-filter > .form{
     border: 0
 }
+
 """
 
 get_window_url_params = """
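For context, `custom_css` (the string touched above) is a module-level stylesheet that is typically passed to the Blocks constructor, while `get_window_url_params` is a JavaScript snippet used elsewhere in the app. A minimal sketch of the CSS wiring, assuming `src/` is importable as a package:

```python
import gradio as gr

from src.display.css_html_js import custom_css  # the string edited in this commit

# The custom CSS is injected once, when the Blocks app is constructed.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.Markdown("Leaderboard UI goes here.")

if __name__ == "__main__":
    demo.launch()
```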