Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
loading_from_contents
#766
by
clefourrier
HF staff
- opened
- README.md +1 -2
- app.py +129 -79
- requirements.txt +1 -1
- src/display/about.py +2 -2
- src/display/utils.py +1 -0
- src/envs.py +4 -18
- src/leaderboard/filter_models.py +122 -110
- src/leaderboard/read_evals.py +0 -261
- src/populate.py +8 -7
- src/scripts/update_all_request_files.py +0 -129
- src/submission/check_validity.py +1 -1
- src/submission/submit.py +3 -30
- src/tools/collections.py +0 -76
- src/{scripts → tools}/create_request_file.py +0 -0
- src/tools/model_backlinks.py +2 -2
- src/tools/plots.py +7 -13
README.md
CHANGED
@@ -8,14 +8,13 @@ sdk_version: 4.9.0
|
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
11 |
-
duplicated_from: HuggingFaceH4/open_llm_leaderboard
|
12 |
fullWidth: true
|
13 |
startup_duration_timeout: 1h
|
14 |
space_ci:
|
15 |
private: true
|
16 |
secrets:
|
17 |
- HF_TOKEN
|
18 |
-
-
|
19 |
tags:
|
20 |
- leaderboard
|
21 |
short_description: Track, rank and evaluate open LLMs and chatbots
|
|
|
8 |
app_file: app.py
|
9 |
pinned: true
|
10 |
license: apache-2.0
|
|
|
11 |
fullWidth: true
|
12 |
startup_duration_timeout: 1h
|
13 |
space_ci:
|
14 |
private: true
|
15 |
secrets:
|
16 |
- HF_TOKEN
|
17 |
+
- WEBHOOK_SECRET
|
18 |
tags:
|
19 |
- leaderboard
|
20 |
short_description: Track, rank and evaluate open LLMs and chatbots
|
app.py
CHANGED
@@ -2,10 +2,9 @@ import os
|
|
2 |
import logging
|
3 |
import time
|
4 |
import gradio as gr
|
5 |
-
|
6 |
-
from huggingface_hub import snapshot_download
|
7 |
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
8 |
-
from gradio_space_ci import enable_space_ci
|
9 |
|
10 |
from src.display.about import (
|
11 |
CITATION_BUTTON_LABEL,
|
@@ -30,32 +29,27 @@ from src.display.utils import (
|
|
30 |
)
|
31 |
from src.envs import (
|
32 |
API,
|
33 |
-
DYNAMIC_INFO_FILE_PATH,
|
34 |
-
DYNAMIC_INFO_PATH,
|
35 |
-
DYNAMIC_INFO_REPO,
|
36 |
EVAL_REQUESTS_PATH,
|
37 |
-
|
38 |
-
|
39 |
-
IS_PUBLIC,
|
40 |
QUEUE_REPO,
|
41 |
REPO_ID,
|
42 |
-
|
43 |
)
|
44 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
45 |
-
from src.scripts.update_all_request_files import update_dynamic_files
|
46 |
from src.submission.submit import add_new_eval
|
47 |
-
from src.tools.collections import update_collections
|
48 |
from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
|
49 |
|
50 |
# Configure logging
|
51 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
52 |
|
53 |
-
# Start ephemeral Spaces on PRs (see config in README.md)
|
54 |
-
enable_space_ci()
|
55 |
|
|
|
|
|
|
|
56 |
|
57 |
def restart_space():
|
58 |
-
API.restart_space(repo_id=REPO_ID, token=
|
59 |
|
60 |
|
61 |
def time_diff_wrapper(func):
|
@@ -94,54 +88,90 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
|
|
94 |
attempt += 1
|
95 |
raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
|
96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
-
def init_space(
|
99 |
"""Initializes the application space, loading only necessary data."""
|
100 |
-
if
|
101 |
# These downloads only occur on full initialization
|
102 |
try:
|
103 |
download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
|
104 |
-
download_dataset(DYNAMIC_INFO_REPO, DYNAMIC_INFO_PATH)
|
105 |
-
download_dataset(RESULTS_REPO, EVAL_RESULTS_PATH)
|
106 |
except Exception:
|
107 |
restart_space()
|
108 |
|
109 |
-
# Always
|
110 |
-
|
111 |
-
results_path=EVAL_RESULTS_PATH,
|
112 |
-
requests_path=EVAL_REQUESTS_PATH,
|
113 |
-
dynamic_path=DYNAMIC_INFO_FILE_PATH,
|
114 |
-
cols=COLS,
|
115 |
-
benchmark_cols=BENCHMARK_COLS,
|
116 |
-
)
|
117 |
-
|
118 |
-
if full_init:
|
119 |
-
# Collection update only happens on full initialization
|
120 |
-
update_collections(original_df)
|
121 |
-
|
122 |
-
leaderboard_df = original_df.copy()
|
123 |
|
124 |
# Evaluation queue DataFrame retrieval is independent of initialization detail level
|
125 |
-
eval_queue_dfs =
|
126 |
|
127 |
-
return leaderboard_df,
|
128 |
|
129 |
|
130 |
-
# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
|
131 |
-
# This controls whether a full initialization should be performed.
|
132 |
-
do_full_init = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
|
133 |
-
|
134 |
# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
|
135 |
# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
|
136 |
-
leaderboard_df,
|
137 |
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
|
138 |
|
139 |
|
140 |
# Data processing for plots now only on demand in the respective Gradio tab
|
141 |
def load_and_create_plots():
|
142 |
-
plot_df = create_plot_df(create_scores_df(
|
143 |
return plot_df
|
144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
|
146 |
demo = gr.Blocks(css=custom_css)
|
147 |
with demo:
|
@@ -150,37 +180,7 @@ with demo:
|
|
150 |
|
151 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
152 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
153 |
-
leaderboard =
|
154 |
-
value=leaderboard_df,
|
155 |
-
datatype=[c.type for c in fields(AutoEvalColumn)],
|
156 |
-
select_columns=SelectColumns(
|
157 |
-
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
158 |
-
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
|
159 |
-
label="Select Columns to Display:",
|
160 |
-
),
|
161 |
-
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
|
162 |
-
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
163 |
-
filter_columns=[
|
164 |
-
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
165 |
-
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
166 |
-
ColumnFilter(
|
167 |
-
AutoEvalColumn.params.name,
|
168 |
-
type="slider",
|
169 |
-
min=0.01,
|
170 |
-
max=150,
|
171 |
-
label="Select the number of parameters (B)",
|
172 |
-
),
|
173 |
-
ColumnFilter(
|
174 |
-
AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
|
175 |
-
),
|
176 |
-
ColumnFilter(
|
177 |
-
AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
|
178 |
-
),
|
179 |
-
ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
|
180 |
-
ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
|
181 |
-
],
|
182 |
-
bool_checkboxgroup_label="Hide models",
|
183 |
-
)
|
184 |
|
185 |
with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
|
186 |
with gr.Row():
|
@@ -219,7 +219,6 @@ with demo:
|
|
219 |
with gr.Column():
|
220 |
model_name_textbox = gr.Textbox(label="Model name")
|
221 |
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
222 |
-
private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
|
223 |
model_type = gr.Dropdown(
|
224 |
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
225 |
label="Model type",
|
@@ -290,7 +289,6 @@ with demo:
|
|
290 |
base_model_name_textbox,
|
291 |
revision_name_textbox,
|
292 |
precision,
|
293 |
-
private,
|
294 |
weight_type,
|
295 |
model_type,
|
296 |
],
|
@@ -307,9 +305,61 @@ with demo:
|
|
307 |
show_copy_button=True,
|
308 |
)
|
309 |
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
|
315 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import logging
|
3 |
import time
|
4 |
import gradio as gr
|
5 |
+
import datasets
|
6 |
+
from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
|
7 |
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
|
|
8 |
|
9 |
from src.display.about import (
|
10 |
CITATION_BUTTON_LABEL,
|
|
|
29 |
)
|
30 |
from src.envs import (
|
31 |
API,
|
|
|
|
|
|
|
32 |
EVAL_REQUESTS_PATH,
|
33 |
+
AGGREGATED_REPO,
|
34 |
+
HF_TOKEN,
|
|
|
35 |
QUEUE_REPO,
|
36 |
REPO_ID,
|
37 |
+
HF_HOME,
|
38 |
)
|
39 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
|
|
40 |
from src.submission.submit import add_new_eval
|
|
|
41 |
from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
|
42 |
|
43 |
# Configure logging
|
44 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
45 |
|
|
|
|
|
46 |
|
47 |
+
# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
|
48 |
+
# This controls whether a full initialization should be performed.
|
49 |
+
DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
|
50 |
|
51 |
def restart_space():
|
52 |
+
API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
|
53 |
|
54 |
|
55 |
def time_diff_wrapper(func):
|
|
|
88 |
attempt += 1
|
89 |
raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
|
90 |
|
91 |
+
def get_latest_data_leaderboard():
|
92 |
+
leaderboard_dataset = datasets.load_dataset(
|
93 |
+
AGGREGATED_REPO,
|
94 |
+
"default",
|
95 |
+
split="train",
|
96 |
+
cache_dir=HF_HOME,
|
97 |
+
download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
|
98 |
+
verification_mode="no_checks"
|
99 |
+
)
|
100 |
+
|
101 |
+
leaderboard_df = get_leaderboard_df(
|
102 |
+
leaderboard_dataset=leaderboard_dataset,
|
103 |
+
cols=COLS,
|
104 |
+
benchmark_cols=BENCHMARK_COLS,
|
105 |
+
)
|
106 |
+
|
107 |
+
return leaderboard_df
|
108 |
+
|
109 |
+
def get_latest_data_queue():
|
110 |
+
eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
111 |
+
return eval_queue_dfs
|
112 |
|
113 |
+
def init_space():
|
114 |
"""Initializes the application space, loading only necessary data."""
|
115 |
+
if DO_FULL_INIT:
|
116 |
# These downloads only occur on full initialization
|
117 |
try:
|
118 |
download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
|
|
|
|
|
119 |
except Exception:
|
120 |
restart_space()
|
121 |
|
122 |
+
# Always reload the leaderboard DataFrame (the underlying dataset is reused from the local cache when it exists)
|
123 |
+
leaderboard_df = get_latest_data_leaderboard()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
|
125 |
# Evaluation queue DataFrame retrieval is independent of initialization detail level
|
126 |
+
eval_queue_dfs = get_latest_data_queue()
|
127 |
|
128 |
+
return leaderboard_df, eval_queue_dfs
|
129 |
|
130 |
|
|
|
|
|
|
|
|
|
131 |
# Calls the init_space function, which takes no arguments and reads the module-level `DO_FULL_INIT` flag itself.
|
132 |
# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `DO_FULL_INIT` flag.
|
133 |
+
leaderboard_df, eval_queue_dfs = init_space()
|
134 |
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
|
135 |
|
136 |
|
137 |
# Data processing for plots now only on demand in the respective Gradio tab
|
138 |
def load_and_create_plots():
|
139 |
+
plot_df = create_plot_df(create_scores_df(leaderboard_df))
|
140 |
return plot_df
|
141 |
|
142 |
+
def init_leaderboard(dataframe):
|
143 |
+
return Leaderboard(
|
144 |
+
value = dataframe,
|
145 |
+
datatype=[c.type for c in fields(AutoEvalColumn)],
|
146 |
+
select_columns=SelectColumns(
|
147 |
+
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
148 |
+
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
|
149 |
+
label="Select Columns to Display:",
|
150 |
+
),
|
151 |
+
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
|
152 |
+
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
153 |
+
filter_columns=[
|
154 |
+
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
155 |
+
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
156 |
+
ColumnFilter(
|
157 |
+
AutoEvalColumn.params.name,
|
158 |
+
type="slider",
|
159 |
+
min=0.01,
|
160 |
+
max=150,
|
161 |
+
label="Select the number of parameters (B)",
|
162 |
+
),
|
163 |
+
ColumnFilter(
|
164 |
+
AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
|
165 |
+
),
|
166 |
+
ColumnFilter(
|
167 |
+
AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
|
168 |
+
),
|
169 |
+
ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
|
170 |
+
ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
|
171 |
+
],
|
172 |
+
bool_checkboxgroup_label="Hide models",
|
173 |
+
)
|
174 |
+
|
175 |
|
176 |
demo = gr.Blocks(css=custom_css)
|
177 |
with demo:
|
|
|
180 |
|
181 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
182 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
183 |
+
leaderboard = init_leaderboard(leaderboard_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
|
185 |
with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
|
186 |
with gr.Row():
|
|
|
219 |
with gr.Column():
|
220 |
model_name_textbox = gr.Textbox(label="Model name")
|
221 |
revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
|
|
|
222 |
model_type = gr.Dropdown(
|
223 |
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
|
224 |
label="Model type",
|
|
|
289 |
base_model_name_textbox,
|
290 |
revision_name_textbox,
|
291 |
precision,
|
|
|
292 |
weight_type,
|
293 |
model_type,
|
294 |
],
|
|
|
305 |
show_copy_button=True,
|
306 |
)
|
307 |
|
308 |
+
demo.load(fn=get_latest_data_leaderboard, inputs=None, outputs=[leaderboard])
|
309 |
+
demo.load(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
|
310 |
+
|
311 |
+
demo.queue(default_concurrency_limit=40)
|
312 |
+
|
313 |
+
# Start ephemeral Spaces on PRs (see config in README.md)
|
314 |
+
from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
|
315 |
+
|
316 |
+
def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
|
317 |
+
# Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
|
318 |
+
# Compared to the original, this one does not monkeypatch Gradio, which allows us to define more webhooks.
|
319 |
+
# ht to Lucain!
|
320 |
+
if SPACE_ID is None:
|
321 |
+
print("Not in a Space: Space CI disabled.")
|
322 |
+
return WebhooksServer(ui=demo)
|
323 |
+
|
324 |
+
if IS_EPHEMERAL_SPACE:
|
325 |
+
print("In an ephemeral Space: Space CI disabled.")
|
326 |
+
return WebhooksServer(ui=demo)
|
327 |
+
|
328 |
+
card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
|
329 |
+
config = card.data.get("space_ci", {})
|
330 |
+
print(f"Enabling Space CI with config from README: {config}")
|
331 |
+
|
332 |
+
return configure_space_ci(
|
333 |
+
blocks=ui,
|
334 |
+
trusted_authors=config.get("trusted_authors"),
|
335 |
+
private=config.get("private", "auto"),
|
336 |
+
variables=config.get("variables", "auto"),
|
337 |
+
secrets=config.get("secrets"),
|
338 |
+
hardware=config.get("hardware"),
|
339 |
+
storage=config.get("storage"),
|
340 |
+
)
|
341 |
|
342 |
+
# Create webhooks server (with CI url if in Space and not ephemeral)
|
343 |
+
webhooks_server = enable_space_ci_and_return_server(ui=demo)
|
344 |
+
|
345 |
+
# Add webhooks
|
346 |
+
@webhooks_server.add_webhook
|
347 |
+
async def update_leaderboard(payload: WebhookPayload) -> None:
|
348 |
+
"""Redownloads the leaderboard dataset each time it updates"""
|
349 |
+
if payload.repo.type == "dataset" and payload.event.action == "update":
|
350 |
+
datasets.load_dataset(
|
351 |
+
AGGREGATED_REPO,
|
352 |
+
"default",
|
353 |
+
split="train",
|
354 |
+
cache_dir=HF_HOME,
|
355 |
+
download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
|
356 |
+
verification_mode="no_checks"
|
357 |
+
)
|
358 |
+
|
359 |
+
@webhooks_server.add_webhook
|
360 |
+
async def update_queue(payload: WebhookPayload) -> None:
|
361 |
+
"""Redownloads the queue dataset each time it updates"""
|
362 |
+
if payload.repo.type == "dataset" and payload.event.action == "update":
|
363 |
+
download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
|
364 |
+
|
365 |
+
webhooks_server.launch()
|
requirements.txt
CHANGED
@@ -15,4 +15,4 @@ transformers==4.41.1
|
|
15 |
tokenizers>=0.15.0
|
16 |
gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected] # CI !!!
|
17 |
gradio==4.20.0
|
18 |
-
gradio_leaderboard==0.0.
|
|
|
15 |
tokenizers>=0.15.0
|
16 |
gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected] # CI !!!
|
17 |
gradio==4.20.0
|
18 |
+
gradio_leaderboard==0.0.9
|
src/display/about.py
CHANGED
@@ -81,7 +81,7 @@ To get more information about quantization, see:
|
|
81 |
- 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
|
82 |
|
83 |
### Useful links
|
84 |
-
- [Community resources](https://huggingface.co/spaces/
|
85 |
- [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
|
86 |
|
87 |
### Other cool leaderboards:
|
@@ -217,7 +217,7 @@ CITATION_BUTTON_TEXT = r"""
|
|
217 |
title = {Open LLM Leaderboard},
|
218 |
year = {2023},
|
219 |
publisher = {Hugging Face},
|
220 |
-
howpublished = "\url{https://huggingface.co/spaces/
|
221 |
}
|
222 |
@software{eval-harness,
|
223 |
author = {Gao, Leo and
|
|
|
81 |
- 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
|
82 |
|
83 |
### Useful links
|
84 |
+
- [Community resources](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/174)
|
85 |
- [Collection of best models](https://huggingface.co/collections/open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03)
|
86 |
|
87 |
### Other cool leaderboards:
|
|
|
217 |
title = {Open LLM Leaderboard},
|
218 |
year = {2023},
|
219 |
publisher = {Hugging Face},
|
220 |
+
howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
|
221 |
}
|
222 |
@software{eval-harness,
|
223 |
author = {Gao, Leo and
|
src/display/utils.py
CHANGED
@@ -93,6 +93,7 @@ auto_eval_column_dict.append(
|
|
93 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
94 |
auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
|
95 |
auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
|
|
|
96 |
# Dummy column for the search bar (hidden by the custom CSS)
|
97 |
auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
|
98 |
|
|
|
93 |
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
94 |
auto_eval_column_dict.append(["not_flagged", ColumnContent, ColumnContent("Flagged", "bool", False, hidden=True)])
|
95 |
auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
|
96 |
+
auto_eval_column_dict.append(["date", ColumnContent, ColumnContent("date", "bool", False, hidden=True)])
|
97 |
# Dummy column for the search bar (hidden by the custom CSS)
|
98 |
auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname", "str", False, dummy=True)])
|
99 |
|
src/envs.py
CHANGED
@@ -2,17 +2,11 @@ import os
|
|
2 |
from huggingface_hub import HfApi
|
3 |
|
4 |
# clone / pull the lmeh eval data
|
5 |
-
|
6 |
|
7 |
-
REPO_ID = "
|
8 |
QUEUE_REPO = "open-llm-leaderboard/requests"
|
9 |
-
|
10 |
-
RESULTS_REPO = "open-llm-leaderboard/results"
|
11 |
-
|
12 |
-
PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
|
13 |
-
PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
|
14 |
-
|
15 |
-
IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
|
16 |
|
17 |
HF_HOME = os.getenv("HF_HOME", ".")
|
18 |
|
@@ -27,18 +21,10 @@ else:
|
|
27 |
print("Write access confirmed for HF_HOME")
|
28 |
|
29 |
EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
|
30 |
-
EVAL_RESULTS_PATH = os.path.join(HF_HOME, "eval-results")
|
31 |
-
DYNAMIC_INFO_PATH = os.path.join(HF_HOME, "dynamic-info")
|
32 |
-
DYNAMIC_INFO_FILE_PATH = os.path.join(DYNAMIC_INFO_PATH, "model_infos.json")
|
33 |
-
|
34 |
-
EVAL_REQUESTS_PATH_PRIVATE = "eval-queue-private"
|
35 |
-
EVAL_RESULTS_PATH_PRIVATE = "eval-results-private"
|
36 |
-
|
37 |
-
PATH_TO_COLLECTION = "open-llm-leaderboard/llm-leaderboard-best-models-652d6c7965a4619fb5c27a03"
|
38 |
|
39 |
# Rate limit variables
|
40 |
RATE_LIMIT_PERIOD = 7
|
41 |
RATE_LIMIT_QUOTA = 5
|
42 |
HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
|
43 |
|
44 |
-
API = HfApi(token=
|
|
|
2 |
from huggingface_hub import HfApi
|
3 |
|
4 |
# clone / pull the lmeh eval data
|
5 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
6 |
|
7 |
+
REPO_ID = "open-llm-leaderboard/open_llm_leaderboard"
|
8 |
QUEUE_REPO = "open-llm-leaderboard/requests"
|
9 |
+
AGGREGATED_REPO = "open-llm-leaderboard/contents"
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
HF_HOME = os.getenv("HF_HOME", ".")
|
12 |
|
|
|
21 |
print("Write access confirmed for HF_HOME")
|
22 |
|
23 |
EVAL_REQUESTS_PATH = os.path.join(HF_HOME, "eval-queue")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
# Rate limit variables
|
26 |
RATE_LIMIT_PERIOD = 7
|
27 |
RATE_LIMIT_QUOTA = 5
|
28 |
HAS_HIGHER_RATE_LIMIT = ["TheBloke"]
|
29 |
|
30 |
+
API = HfApi(token=HF_TOKEN)
|
src/leaderboard/filter_models.py
CHANGED
@@ -5,120 +5,120 @@ from src.display.utils import AutoEvalColumn
|
|
5 |
# Models which have been flagged by users as being problematic for a reason or another
|
6 |
# (Model name to forum discussion link)
|
7 |
FLAGGED_MODELS = {
|
8 |
-
"merged": "https://huggingface.co/spaces/
|
9 |
-
"Voicelab/trurl-2-13b": "https://huggingface.co/spaces/
|
10 |
-
"deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/
|
11 |
-
"Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/
|
12 |
-
"Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/
|
13 |
-
"TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/
|
14 |
-
"gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/
|
15 |
-
"AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/
|
16 |
-
"AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/
|
17 |
-
"AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/
|
18 |
-
"fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/
|
19 |
-
"jan-hq/trinity-v1": "https://huggingface.co/spaces/
|
20 |
-
"rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/
|
21 |
-
"rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/
|
22 |
-
"GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/
|
23 |
-
"GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/
|
24 |
-
"GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/
|
25 |
-
"viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/
|
26 |
-
"GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/
|
27 |
-
"janai-hq/trinity-v1": "https://huggingface.co/spaces/
|
28 |
-
"ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/
|
29 |
-
"fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/
|
30 |
-
"mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/
|
31 |
-
"mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/
|
32 |
-
"Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/
|
33 |
-
"GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/
|
34 |
-
"quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/
|
35 |
-
"quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/
|
36 |
-
"quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/
|
37 |
-
"mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/
|
38 |
-
"cookinai/BruinHermes": "https://huggingface.co/spaces/
|
39 |
-
"jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/
|
40 |
-
"v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/
|
41 |
-
"v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/
|
42 |
-
"rwitz2/pee": "https://huggingface.co/spaces/
|
43 |
-
"zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/
|
44 |
-
"dillfrescott/trinity-medium": "https://huggingface.co/spaces/
|
45 |
-
"udkai/Garrulus": "https://huggingface.co/spaces/
|
46 |
"dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
|
47 |
-
"eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/
|
48 |
-
"abideen/NexoNimbus-7B": "https://huggingface.co/spaces/
|
49 |
-
"alnrg2arg/test2_3": "https://huggingface.co/spaces/
|
50 |
-
"nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/
|
51 |
-
"CultriX/MergeTrix-7B": "https://huggingface.co/spaces/
|
52 |
-
"liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/
|
53 |
# Merges not indicated
|
54 |
-
"gagan3012/MetaModelv2": "https://huggingface.co/spaces/
|
55 |
-
"gagan3012/MetaModelv3": "https://huggingface.co/spaces/
|
56 |
-
"kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/
|
57 |
-
"kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/
|
58 |
-
"kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/
|
59 |
-
"kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/
|
60 |
-
"fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/
|
61 |
-
"perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/
|
62 |
-
"rwitz/go-bruins-v2": "https://huggingface.co/spaces/
|
63 |
-
"rwitz/go-bruins": "https://huggingface.co/spaces/
|
64 |
-
"Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/
|
65 |
-
"aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/
|
66 |
-
"NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/
|
67 |
-
"Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/
|
68 |
-
"OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/
|
69 |
-
"perlthoughts/Falkor-7b": "https://huggingface.co/spaces/
|
70 |
-
"v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/
|
71 |
-
"Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/
|
72 |
-
"DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/
|
73 |
-
"PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/
|
74 |
-
"Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/
|
75 |
-
"Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/
|
76 |
-
"perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/
|
77 |
-
"elinas/chronos007-70b": "https://huggingface.co/spaces/
|
78 |
-
"Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/
|
79 |
-
"Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/
|
80 |
-
"diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/
|
81 |
-
"Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/
|
82 |
-
"Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/
|
83 |
-
"Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/
|
84 |
-
"garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/
|
85 |
-
"Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/
|
86 |
-
"uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/
|
87 |
-
"DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/
|
88 |
-
"cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/
|
89 |
-
"DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/
|
90 |
-
"DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/
|
91 |
-
"gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/
|
92 |
-
"udkai/Turdus": "https://huggingface.co/spaces/
|
93 |
-
"kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/
|
94 |
-
"kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/
|
95 |
-
"Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/
|
96 |
-
"mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/
|
97 |
-
"Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/
|
98 |
-
"ryandt/MusingCaterpillar": "https://huggingface.co/spaces/
|
99 |
-
"Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/
|
100 |
-
"SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/
|
101 |
-
"bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/
|
102 |
-
"cookinai/OpenCM-14": "https://huggingface.co/spaces/
|
103 |
-
"bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/
|
104 |
-
"jan-hq/supermario-v2": "https://huggingface.co/spaces/
|
105 |
# MoErges
|
106 |
-
"cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/
|
107 |
-
"cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/
|
108 |
-
"gagan3012/MetaModel_moe": "https://huggingface.co/spaces/
|
109 |
-
"macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/
|
110 |
-
"cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/
|
111 |
-
"macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/
|
112 |
-
"macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/
|
113 |
-
"macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/
|
114 |
-
"cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/
|
115 |
-
"macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/
|
116 |
-
"macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/
|
117 |
# Other - contamination mostly
|
118 |
-
"DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/
|
119 |
-
"CultriX/MistralTrix-v1": "https://huggingface.co/spaces/
|
120 |
-
"Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/
|
121 |
-
"Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/
|
122 |
}
|
123 |
|
124 |
# Models which have been requested by orgs to not be submitted on the leaderboard
|
@@ -167,6 +167,18 @@ def remove_forbidden_models(leaderboard_data: list[dict]):
|
|
167 |
leaderboard_data.pop(ix)
|
168 |
return leaderboard_data
|
169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
170 |
|
171 |
def filter_models_flags(leaderboard_data: list[dict]):
|
172 |
leaderboard_data = remove_forbidden_models(leaderboard_data)
|
|
|
5 |
# Models which have been flagged by users as being problematic for a reason or another
|
6 |
# (Model name to forum discussion link)
|
7 |
FLAGGED_MODELS = {
|
8 |
+
"merged": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
9 |
+
"Voicelab/trurl-2-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/202",
|
10 |
+
"deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/207",
|
11 |
+
"Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/213",
|
12 |
+
"Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/236",
|
13 |
+
"TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/237",
|
14 |
+
"gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/215",
|
15 |
+
"AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
|
16 |
+
"AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
|
17 |
+
"AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/287",
|
18 |
+
"fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/444",
|
19 |
+
"jan-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
20 |
+
"rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
21 |
+
"rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
22 |
+
"GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
23 |
+
"GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
24 |
+
"GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
25 |
+
"viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
26 |
+
"GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
27 |
+
"janai-hq/trinity-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
28 |
+
"ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
29 |
+
"fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
30 |
+
"mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
31 |
+
"mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
32 |
+
"Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
33 |
+
"GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
34 |
+
"quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
35 |
+
"quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
36 |
+
"quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
37 |
+
"mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
38 |
+
"cookinai/BruinHermes": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
39 |
+
"jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
40 |
+
"v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
41 |
+
"v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
42 |
+
"rwitz2/pee": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
43 |
+
"zyh3826 / GML-Mistral-merged-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/503",
|
44 |
+
"dillfrescott/trinity-medium": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/474",
|
45 |
+
"udkai/Garrulus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/526",
|
46 |
"dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
|
47 |
+
"eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
|
48 |
+
"abideen/NexoNimbus-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
|
49 |
+
"alnrg2arg/test2_3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
|
50 |
+
"nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
|
51 |
+
"CultriX/MergeTrix-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
|
52 |
+
"liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/548",
|
53 |
# Merges not indicated
|
54 |
+
"gagan3012/MetaModelv2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
55 |
+
"gagan3012/MetaModelv3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
56 |
+
"kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
57 |
+
"kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
58 |
+
"kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
59 |
+
"kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
60 |
+
"fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
61 |
+
"perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
62 |
+
"rwitz/go-bruins-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
63 |
+
"rwitz/go-bruins": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
64 |
+
"Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
65 |
+
"aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
66 |
+
"NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
67 |
+
"Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
68 |
+
"OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
69 |
+
"perlthoughts/Falkor-7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
70 |
+
"v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
71 |
+
"Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
72 |
+
"DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
73 |
+
"PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
74 |
+
"Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
75 |
+
"Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
76 |
+
"perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
77 |
+
"elinas/chronos007-70b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
78 |
+
"Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
79 |
+
"Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
80 |
+
"diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
81 |
+
"Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
82 |
+
"Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
83 |
+
"Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
84 |
+
"garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
85 |
+
"Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
86 |
+
"uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/510",
|
87 |
+
"DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
|
88 |
+
"cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
|
89 |
+
"DopeorNope/SOLARC-MOE-10.7Bx6 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
|
90 |
+
"DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
|
91 |
+
"gagan3012/MetaModelv2 ": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/511",
|
92 |
+
"udkai/Turdus": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
93 |
+
"kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
94 |
+
"kodonho/SolarM-SakuraSolar-SLERP": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
95 |
+
"Yhyu13/LMCocktail-10.7B-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
96 |
+
"mlabonne/NeuralMarcoro14-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
97 |
+
"Neuronovo/neuronovo-7B-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
98 |
+
"ryandt/MusingCaterpillar": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
99 |
+
"Neuronovo/neuronovo-7B-v0.3": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
100 |
+
"SanjiWatsuki/Lelantos-DPO-7B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
101 |
+
"bardsai/jaskier-7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
102 |
+
"cookinai/OpenCM-14": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
103 |
+
"bardsai/jaskier-7b-dpo-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
104 |
+
"jan-hq/supermario-v2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
105 |
# MoErges
|
106 |
+
"cloudyu/Yi-34Bx2-MoE-60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
107 |
+
"cloudyu/Mixtral_34Bx2_MoE_60B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
108 |
+
"gagan3012/MetaModel_moe": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
109 |
+
"macadeliccc/SOLAR-math-2x10.7b-v0.2": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
110 |
+
"cloudyu/Mixtral_7Bx2_MoE": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
111 |
+
"macadeliccc/SOLAR-math-2x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
112 |
+
"macadeliccc/Orca-SOLAR-4x10.7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
113 |
+
"macadeliccc/piccolo-8x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
114 |
+
"cloudyu/Mixtral_7Bx4_MOE_24B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
115 |
+
"macadeliccc/laser-dolphin-mixtral-2x7b-dpo": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
116 |
+
"macadeliccc/polyglot-math-4x7b": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/540",
|
117 |
# Other - contamination mostly
|
118 |
+
"DopeorNope/COKAL-v1-70B": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/566",
|
119 |
+
"CultriX/MistralTrix-v1": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/556",
|
120 |
+
"Contamination/contaminated_proof_7b_v1.0": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
|
121 |
+
"Contamination/contaminated_proof_7b_v1.0_safetensor": "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/664",
|
122 |
}
|
123 |
|
124 |
# Models which have been requested by orgs to not be submitted on the leaderboard
|
|
|
167 |
leaderboard_data.pop(ix)
|
168 |
return leaderboard_data
|
169 |
|
170 |
+
"""
|
171 |
+
def remove_forbidden_models(leaderboard_data):
|
172 |
+
#Removes models from the leaderboard based on the DO_NOT_SUBMIT list.
|
173 |
+
indices_to_remove = []
|
174 |
+
for ix, row in leaderboard_data.iterrows():
|
175 |
+
if row[AutoEvalColumn.fullname.name] in DO_NOT_SUBMIT_MODELS:
|
176 |
+
indices_to_remove.append(ix)
|
177 |
+
|
178 |
+
# Remove the models from the list
|
179 |
+
return leaderboard_data.drop(indices_to_remove)
|
180 |
+
"""
|
181 |
+
|
182 |
|
183 |
def filter_models_flags(leaderboard_data: list[dict]):
|
184 |
leaderboard_data = remove_forbidden_models(leaderboard_data)
|
src/leaderboard/read_evals.py
DELETED
@@ -1,261 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
from pathlib import Path
|
3 |
-
from json import JSONDecodeError
|
4 |
-
import logging
|
5 |
-
import math
|
6 |
-
|
7 |
-
from dataclasses import dataclass, field
|
8 |
-
from typing import Optional, Dict, List
|
9 |
-
|
10 |
-
from tqdm import tqdm
|
11 |
-
from tqdm.contrib.logging import logging_redirect_tqdm
|
12 |
-
|
13 |
-
import numpy as np
|
14 |
-
|
15 |
-
from src.display.formatting import make_clickable_model
|
16 |
-
from src.display.utils import AutoEvalColumn, ModelType, Precision, Tasks, WeightType, parse_datetime
|
17 |
-
|
18 |
-
# Configure logging
|
19 |
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
20 |
-
|
21 |
-
|
22 |
-
@dataclass
class EvalResult:
    """Evaluation results and display metadata for one model at one precision.

    Also see src.display.utils.AutoEvalColumn for what will be displayed.
    """

    eval_name: str  # org_model_precision (uid)
    full_model: str  # org/model (path on hub)
    org: Optional[str]
    model: str
    revision: str  # commit hash, "" if main
    results: Dict[str, float]
    precision: Precision = Precision.Unknown
    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
    weight_type: WeightType = WeightType.Original
    architecture: str = "Unknown"  # From config file
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    date: str = ""  # submission date of request file
    still_on_hub: bool = True
    is_merge: bool = False
    not_flagged: bool = False
    status: str = "FINISHED"
    # List of tags, initialized to a new empty list for each instance to avoid the pitfalls of mutable default arguments.
    tags: List[str] = field(default_factory=list)

    @classmethod
    def init_from_json_file(cls, json_filepath: str) -> "EvalResult":
        """Build an EvalResult from a results JSON file.

        The model name, dtype and revision are read from the file's
        ``config_general`` section; the scores come from `extract_results`.

        Parameters:
        - json_filepath (str): Path of the results file to read.

        Returns:
        - EvalResult: A new instance; request-file and dynamic info are left
          at their defaults.
        """
        with open(json_filepath, "r") as fp:
            data = json.load(fp)

        config = data.get("config_general", {})
        precision = Precision.from_str(config.get("model_dtype", "unknown"))

        # "org/model" -> [org, model]; a bare "model" has no org.
        org_and_model = config.get("model_name", "").split("/", 1)
        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}_{precision.value.name}"
        else:
            org, model = org_and_model
            result_key = f"{org}_{model}_{precision.value.name}"
        full_model = "/".join(org_and_model)

        results = cls.extract_results(data)

        return cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=results,
            precision=precision,
            revision=config.get("model_sha", ""),
        )

    @staticmethod
    def extract_results(data: Dict) -> Dict[str, float]:
        """
        Extract and process benchmark results from a given dict.

        Parameters:
        - data (Dict): A dictionary containing benchmark data. This dictionary must
          include 'versions' and 'results' keys with respective sub-data.

        Returns:
        - Dict[str, float]: A dictionary where keys are benchmark names and values
          are the processed average scores as percentages.

        Notes:
        - The method specifically checks for certain benchmark names to skip outdated entries.
        - Handles NaN values by setting the corresponding benchmark result to 0.0.
        - Averages scores across metrics for benchmarks found in the data, in a percentage format.
        - Benchmarks with no entry, or with an entry lacking the metric, are left out.
        """
        results = {}
        for task in Tasks:
            task = task.value

            # We skip old mmlu entries. The check has to skip the *task*, so it
            # is expressed as one condition on the outer loop rather than a
            # `continue` inside a nested loop (which would skip nothing).
            if task.benchmark == "hendrycksTest" and any(
                data["versions"].get(mmlu_k) == 0
                for mmlu_k in ("harness|hendrycksTest-abstract_algebra|5", "hendrycksTest-abstract_algebra")
            ):
                continue

            # We gather all scores of a given metric (mostly for mmlu)
            scores = [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
            if not scores or any(score is None for score in scores):
                continue

            # Some benchmark values are NaNs, mostly truthfulQA: report 0.0 and
            # move on to the next task so the value is not replaced by a NaN mean.
            if any(math.isnan(float(score)) for score in scores):
                results[task.benchmark] = 0.0
                continue

            results[task.benchmark] = np.mean(np.array(scores)) * 100.0

        return results

    def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current model and updates info with it.

        Any failure (no matching file, unreadable file, bad JSON, unknown
        weight type, ...) is logged and recorded as ``status = "FAILED"``
        instead of being raised.
        """
        # Bound before the `try` so the FileNotFoundError handler can always
        # mention it, even when the lookup itself is what failed.
        request_file = None
        try:
            request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
            if request_file is None:
                logging.warning(f"No request file for {self.org}/{self.model}")
                self.status = "FAILED"
                return

            with open(request_file, "r") as f:
                request = json.load(f)

            self.model_type = ModelType.from_str(request.get("model_type", "Unknown"))
            self.weight_type = WeightType[request.get("weight_type", "Original")]
            self.num_params = int(request.get("params", 0))  # Ensuring type safety
            self.date = request.get("submitted_time", "")
            self.architecture = request.get("architectures", "Unknown")
            self.status = request.get("status", "FAILED")

        except FileNotFoundError:
            self.status = "FAILED"
            logging.error(f"Request file: {request_file} not found for {self.org}/{self.model}")
        except JSONDecodeError:
            self.status = "FAILED"
            logging.error(f"Error decoding JSON from the request file for {self.org}/{self.model}")
        except KeyError as e:
            self.status = "FAILED"
            logging.error(f"Key error {e} in processing request file for {self.org}/{self.model}")
        except Exception as e:  # Catch-all for any other unexpected exceptions
            self.status = "FAILED"
            logging.error(f"Unexpected error {e} for {self.org}/{self.model}")

    def update_with_dynamic_file_dict(self, file_dict):
        """Update license, likes, hub presence and tags from the provided dictionary, using defaults for missing keys."""
        self.license = file_dict.get("license", "?")
        self.likes = int(file_dict.get("likes", 0))  # Ensure likes is treated as an integer
        self.still_on_hub = file_dict.get("still_on_hub", False)  # Default to False if key is missing
        self.tags = file_dict.get("tags", [])

        # A model counts as flagged as soon as one of its tags contains "flagged".
        self.not_flagged = not any("flagged" in tag for tag in self.tags)

    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display.

        Raises KeyError when a benchmark listed in Tasks has no entry in
        ``self.results``.
        """
        # The average is taken over every task, not only over the ones present.
        average = sum(v for v in self.results.values() if v is not None) / len(Tasks)
        tags = self.tags or []
        data_dict = {
            "eval_name": self.eval_name,  # not a column, just a save name,
            AutoEvalColumn.precision.name: self.precision.value.name,
            AutoEvalColumn.model_type.name: self.model_type.value.name,
            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.fullname.name: self.full_model,
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
            # Both columns hold True when the model is NOT a merge / NOT a MoE.
            AutoEvalColumn.merged.name: "merge" not in tags,
            AutoEvalColumn.moe.name: not ("moe" in tags or "moe" in self.full_model.lower()),
            AutoEvalColumn.not_flagged.name: self.not_flagged,
        }

        for task in Tasks:
            data_dict[task.value.col_name] = self.results[task.value.benchmark]

        return data_dict
|
196 |
-
|
197 |
-
|
198 |
-
def get_request_file_for_model(requests_path, model_name, precision):
    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED.

    Args:
        requests_path: Directory searched for request files.
        model_name: Model id; interpolated into the glob pattern
            ``{model_name}_eval_request_*.json``.
        precision: Precision name. Only the part after the last ``.`` is
            compared, so ``"Precision.float16"`` and ``"float16"`` are
            equivalent.

    Returns:
        The path (as ``str``) of a request file whose ``status`` is
        ``"FINISHED"`` and whose ``precision`` matches, or ``None`` when no
        file qualifies. Files are visited in descending name order and the
        last qualifying one is kept.

    Raises:
        KeyError: If a request file lacks the ``status`` or ``precision`` key.
    """
    requests_path = Path(requests_path)
    wanted_precision = precision.split(".")[-1]

    # Visit candidates in descending name order.
    candidates = sorted(requests_path.glob(f"{model_name}_eval_request_*.json"), reverse=True)

    # The loop target must not share a name with the result: otherwise the
    # function hands back whichever file was visited last, matching or not.
    selected = None
    for candidate in candidates:
        with candidate.open("r") as f:
            req_content = json.load(f)
        if req_content["status"] == "FINISHED" and req_content["precision"] == wanted_precision:
            selected = str(candidate)

    # None (not a path, not "") signals that nothing matched.
    return selected
|
219 |
-
|
220 |
-
|
221 |
-
def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
    """From the path of the results folder root, extract all needed info for results.

    Args:
        results_path: Root folder searched recursively for ``results_*.json`` files.
        requests_path: Folder passed to ``EvalResult.update_with_request_file``.
        dynamic_path: JSON file mapping a full model name to its dynamic info.

    Returns:
        One EvalResult per ``eval_name`` whose status is ``"FINISHED"`` and
        whose ``to_dict()`` succeeds.
    """
    with open(dynamic_path) as f:
        dynamic_data = json.load(f)

    # Process files in the order of the timestamp embedded in their name, so
    # that later files update the scores read from earlier ones.
    model_files = sorted(
        Path(results_path).rglob("results_*.json"),
        key=lambda file: parse_datetime(file.stem.removeprefix("results_")),
    )

    eval_results = {}
    # Wrap model_files iteration with tqdm for progress display
    for model_result_filepath in tqdm(model_files, desc="Processing model files"):
        # Creation of result
        eval_result = EvalResult.init_from_json_file(model_result_filepath)
        with logging_redirect_tqdm():
            eval_result.update_with_request_file(requests_path)

        if eval_result.full_model in dynamic_data:
            eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
            # Hardcoding because of gating problem
            if any(org in eval_result.full_model for org in ["meta-llama/", "google/", "tiiuae/"]):
                eval_result.still_on_hub = True

        # Store results of same eval together
        eval_name = eval_result.eval_name
        if eval_name in eval_results:
            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
        else:
            eval_results[eval_name] = eval_result

    results = []
    for k, v in eval_results.items():
        try:
            if v.status == "FINISHED":
                v.to_dict()  # we test if the dict version is complete
                results.append(v)
        except KeyError as e:
            # not all eval values present
            logging.error(f"Error while checking model {k} {v.date} json, no key: {e}")

    return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/populate.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1 |
import pathlib
|
2 |
import pandas as pd
|
|
|
3 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
4 |
from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
|
5 |
from src.leaderboard.filter_models import filter_models_flags
|
6 |
-
from src.leaderboard.read_evals import get_raw_eval_results
|
7 |
from src.display.utils import load_json_data
|
8 |
|
9 |
|
@@ -39,14 +39,15 @@ def get_evaluation_queue_df(save_path, cols):
|
|
39 |
return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
|
40 |
|
41 |
|
42 |
-
def get_leaderboard_df(
|
43 |
"""Retrieve and process leaderboard data."""
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
47 |
|
48 |
-
df = pd.DataFrame.from_records(
|
49 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
50 |
df = df[cols].round(decimals=2)
|
51 |
df = df[has_no_nan_values(df, benchmark_cols)]
|
52 |
-
return
|
|
|
1 |
import pathlib
|
2 |
import pandas as pd
|
3 |
+
from datasets import Dataset
|
4 |
from src.display.formatting import has_no_nan_values, make_clickable_model
|
5 |
from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
|
6 |
from src.leaderboard.filter_models import filter_models_flags
|
|
|
7 |
from src.display.utils import load_json_data
|
8 |
|
9 |
|
|
|
39 |
return tuple(pd.DataFrame(status_dfs[status], columns=cols) for status in ["FINISHED", "RUNNING", "PENDING"])
|
40 |
|
41 |
|
42 |
+
def get_leaderboard_df(leaderboard_dataset: "Dataset", cols: list, benchmark_cols: list):
    """Retrieve and process leaderboard data.

    Args:
        leaderboard_dataset: Dataset exposing ``num_rows`` and a ``to_dict()``
            that maps each column name to its list of values.
        cols: Columns kept in the returned frame, in this order.
        benchmark_cols: Columns handed to ``has_no_nan_values`` to drop
            incomplete rows.

    Returns:
        A DataFrame sorted by descending average, rounded to 2 decimals and
        restricted to ``cols``. When there is nothing to display, an empty
        DataFrame with the ``cols`` columns.
    """
    all_data_json = leaderboard_dataset.to_dict()
    num_items = leaderboard_dataset.num_rows
    # Turn the column-oriented dict into one dict per row.
    all_data_json_list = [{k: column[ix] for k, column in all_data_json.items()} for ix in range(num_items)]

    if all_data_json_list:
        # NOTE(review): the return value is discarded, so this relies on
        # filter_models_flags editing the list in place -- confirm.
        filter_models_flags(all_data_json_list)

    if not all_data_json_list:
        # A frame built from no records has no columns, so the sort below
        # would raise KeyError; return an empty frame of the expected shape.
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame.from_records(all_data_json_list)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    df = df[cols].round(decimals=2)
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df
|
src/scripts/update_all_request_files.py
DELETED
@@ -1,129 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import os
|
3 |
-
import time
|
4 |
-
|
5 |
-
from huggingface_hub import snapshot_download
|
6 |
-
|
7 |
-
from src.envs import API, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_REPO, EVAL_REQUESTS_PATH, H4_TOKEN
|
8 |
-
from src.submission.check_validity import check_model_card, get_model_tags, is_model_on_hub
|
9 |
-
|
10 |
-
|
11 |
-
def update_one_model(model_id, data, models_on_the_hub):
|
12 |
-
# Model no longer on the hub at all
|
13 |
-
if model_id not in models_on_the_hub:
|
14 |
-
data["still_on_hub"] = False
|
15 |
-
data["likes"] = 0
|
16 |
-
data["downloads"] = 0
|
17 |
-
data["created_at"] = ""
|
18 |
-
data["tags"] = []
|
19 |
-
return data
|
20 |
-
|
21 |
-
# Grabbing model parameters
|
22 |
-
model_cfg = models_on_the_hub[model_id]
|
23 |
-
data["likes"] = model_cfg.likes
|
24 |
-
data["downloads"] = model_cfg.downloads
|
25 |
-
data["created_at"] = str(model_cfg.created_at)
|
26 |
-
data["license"] = model_cfg.card_data.license if model_cfg.card_data is not None else ""
|
27 |
-
|
28 |
-
# Grabbing model details
|
29 |
-
model_name = model_id
|
30 |
-
if model_cfg.card_data is not None and model_cfg.card_data.base_model is not None:
|
31 |
-
if isinstance(model_cfg.card_data.base_model, str):
|
32 |
-
model_name = model_cfg.card_data.base_model # for adapters, we look at the parent model
|
33 |
-
still_on_hub, _, _ = is_model_on_hub(
|
34 |
-
model_name=model_name,
|
35 |
-
revision=data.get("revision"),
|
36 |
-
trust_remote_code=True,
|
37 |
-
test_tokenizer=False,
|
38 |
-
token=H4_TOKEN,
|
39 |
-
)
|
40 |
-
# If the model doesn't have a model card or a license, we consider it's deleted
|
41 |
-
if still_on_hub:
|
42 |
-
try:
|
43 |
-
status, _, model_card = check_model_card(model_id)
|
44 |
-
if status is False:
|
45 |
-
still_on_hub = False
|
46 |
-
except Exception:
|
47 |
-
model_card = None
|
48 |
-
still_on_hub = False
|
49 |
-
data["still_on_hub"] = still_on_hub
|
50 |
-
|
51 |
-
tags = get_model_tags(model_card, model_id) if still_on_hub else []
|
52 |
-
|
53 |
-
data["tags"] = tags
|
54 |
-
return data
|
55 |
-
|
56 |
-
|
57 |
-
def update_models(file_path, models_on_the_hub):
|
58 |
-
"""
|
59 |
-
Search through all JSON files in the specified root folder and its subfolders,
|
60 |
-
and update the likes key in JSON dict from value of input dict
|
61 |
-
"""
|
62 |
-
seen_models = []
|
63 |
-
with open(file_path, "r") as f:
|
64 |
-
model_infos = json.load(f)
|
65 |
-
for model_id in model_infos.keys():
|
66 |
-
seen_models.append(model_id)
|
67 |
-
model_infos[model_id] = update_one_model(
|
68 |
-
model_id=model_id, data=model_infos[model_id], models_on_the_hub=models_on_the_hub
|
69 |
-
)
|
70 |
-
|
71 |
-
# If new requests files have been created since we started all this
|
72 |
-
# we grab them
|
73 |
-
all_models = []
|
74 |
-
try:
|
75 |
-
for ix, (root, _, files) in enumerate(os.walk(EVAL_REQUESTS_PATH)):
|
76 |
-
if ix == 0:
|
77 |
-
continue
|
78 |
-
for file in files:
|
79 |
-
if "eval_request" in file:
|
80 |
-
path = root.split("/")[-1] + "/" + file.split("_eval_request")[0]
|
81 |
-
all_models.append(path)
|
82 |
-
except Exception as e:
|
83 |
-
print(e)
|
84 |
-
pass
|
85 |
-
|
86 |
-
for model_id in all_models:
|
87 |
-
if model_id not in seen_models:
|
88 |
-
model_infos[model_id] = update_one_model(model_id=model_id, data={}, models_on_the_hub=models_on_the_hub)
|
89 |
-
|
90 |
-
with open(file_path, "w") as f:
|
91 |
-
json.dump(model_infos, f, indent=2)
|
92 |
-
|
93 |
-
|
94 |
-
def update_dynamic_files():
|
95 |
-
"""This will only update metadata for models already linked in the repo, not add missing ones."""
|
96 |
-
snapshot_download(
|
97 |
-
repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
98 |
-
)
|
99 |
-
|
100 |
-
print("UPDATE_DYNAMIC: Loaded snapshot")
|
101 |
-
# Get models
|
102 |
-
start = time.time()
|
103 |
-
|
104 |
-
models = list(
|
105 |
-
API.list_models(
|
106 |
-
# filter=ModelFilter(task="text-generation"),
|
107 |
-
full=False,
|
108 |
-
cardData=True,
|
109 |
-
fetch_config=True,
|
110 |
-
)
|
111 |
-
)
|
112 |
-
id_to_model = {model.id: model for model in models}
|
113 |
-
|
114 |
-
print(f"UPDATE_DYNAMIC: Downloaded list of models in {time.time() - start:.2f} seconds")
|
115 |
-
|
116 |
-
start = time.time()
|
117 |
-
|
118 |
-
update_models(DYNAMIC_INFO_FILE_PATH, id_to_model)
|
119 |
-
|
120 |
-
print(f"UPDATE_DYNAMIC: updated in {time.time() - start:.2f} seconds")
|
121 |
-
|
122 |
-
API.upload_file(
|
123 |
-
path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
|
124 |
-
path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
|
125 |
-
repo_id=DYNAMIC_INFO_REPO,
|
126 |
-
repo_type="dataset",
|
127 |
-
commit_message="Daily request file update.",
|
128 |
-
)
|
129 |
-
print("UPDATE_DYNAMIC: pushed to hub")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/submission/check_validity.py
CHANGED
@@ -13,7 +13,7 @@ from src.envs import HAS_HIGHER_RATE_LIMIT
|
|
13 |
|
14 |
|
15 |
# ht to @Wauplin, thank you for the snippet!
|
16 |
-
# See https://huggingface.co/spaces/
|
17 |
def check_model_card(repo_id: str) -> tuple[bool, str]:
|
18 |
# Returns operation status, and error message
|
19 |
try:
|
|
|
13 |
|
14 |
|
15 |
# ht to @Wauplin, thank you for the snippet!
|
16 |
+
# See https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/discussions/317
|
17 |
def check_model_card(repo_id: str) -> tuple[bool, str]:
|
18 |
# Returns operation status, and error message
|
19 |
try:
|
src/submission/submit.py
CHANGED
@@ -2,16 +2,11 @@ import json
|
|
2 |
import os
|
3 |
from datetime import datetime, timezone
|
4 |
|
5 |
-
from huggingface_hub import snapshot_download
|
6 |
-
|
7 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
8 |
from src.envs import (
|
9 |
API,
|
10 |
-
DYNAMIC_INFO_FILE_PATH,
|
11 |
-
DYNAMIC_INFO_PATH,
|
12 |
-
DYNAMIC_INFO_REPO,
|
13 |
EVAL_REQUESTS_PATH,
|
14 |
-
|
15 |
QUEUE_REPO,
|
16 |
RATE_LIMIT_PERIOD,
|
17 |
RATE_LIMIT_QUOTA,
|
@@ -35,7 +30,6 @@ def add_new_eval(
|
|
35 |
base_model: str,
|
36 |
revision: str,
|
37 |
precision: str,
|
38 |
-
private: bool,
|
39 |
weight_type: str,
|
40 |
model_type: str,
|
41 |
):
|
@@ -80,7 +74,7 @@ def add_new_eval(
|
|
80 |
# Is the model on the hub?
|
81 |
if weight_type in ["Delta", "Adapter"]:
|
82 |
base_model_on_hub, error, _ = is_model_on_hub(
|
83 |
-
model_name=base_model, revision=revision, token=
|
84 |
)
|
85 |
if not base_model_on_hub:
|
86 |
return styled_error(f'Base model "{base_model}" {error}')
|
@@ -126,7 +120,6 @@ def add_new_eval(
|
|
126 |
"model": model,
|
127 |
"base_model": base_model,
|
128 |
"revision": model_info.sha, # force to use the exact model commit
|
129 |
-
"private": private,
|
130 |
"precision": precision,
|
131 |
"params": model_size,
|
132 |
"architectures": architecture,
|
@@ -154,7 +147,7 @@ def add_new_eval(
|
|
154 |
print("Creating eval file")
|
155 |
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
156 |
os.makedirs(OUT_DIR, exist_ok=True)
|
157 |
-
out_path = f"{OUT_DIR}/{model_path}
|
158 |
|
159 |
with open(out_path, "w") as f:
|
160 |
f.write(json.dumps(eval_entry))
|
@@ -168,26 +161,6 @@ def add_new_eval(
|
|
168 |
commit_message=f"Add {model} to eval queue",
|
169 |
)
|
170 |
|
171 |
-
# We want to grab the latest version of the submission file to not accidentally overwrite it
|
172 |
-
snapshot_download(
|
173 |
-
repo_id=DYNAMIC_INFO_REPO, local_dir=DYNAMIC_INFO_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
|
174 |
-
)
|
175 |
-
|
176 |
-
with open(DYNAMIC_INFO_FILE_PATH) as f:
|
177 |
-
all_supplementary_info = json.load(f)
|
178 |
-
|
179 |
-
all_supplementary_info[model] = supplementary_info
|
180 |
-
with open(DYNAMIC_INFO_FILE_PATH, "w") as f:
|
181 |
-
json.dump(all_supplementary_info, f, indent=2)
|
182 |
-
|
183 |
-
API.upload_file(
|
184 |
-
path_or_fileobj=DYNAMIC_INFO_FILE_PATH,
|
185 |
-
path_in_repo=DYNAMIC_INFO_FILE_PATH.split("/")[-1],
|
186 |
-
repo_id=DYNAMIC_INFO_REPO,
|
187 |
-
repo_type="dataset",
|
188 |
-
commit_message=f"Add {model} to dynamic info queue",
|
189 |
-
)
|
190 |
-
|
191 |
# Remove the local file
|
192 |
os.remove(out_path)
|
193 |
|
|
|
2 |
import os
|
3 |
from datetime import datetime, timezone
|
4 |
|
|
|
|
|
5 |
from src.display.formatting import styled_error, styled_message, styled_warning
|
6 |
from src.envs import (
|
7 |
API,
|
|
|
|
|
|
|
8 |
EVAL_REQUESTS_PATH,
|
9 |
+
HF_TOKEN,
|
10 |
QUEUE_REPO,
|
11 |
RATE_LIMIT_PERIOD,
|
12 |
RATE_LIMIT_QUOTA,
|
|
|
30 |
base_model: str,
|
31 |
revision: str,
|
32 |
precision: str,
|
|
|
33 |
weight_type: str,
|
34 |
model_type: str,
|
35 |
):
|
|
|
74 |
# Is the model on the hub?
|
75 |
if weight_type in ["Delta", "Adapter"]:
|
76 |
base_model_on_hub, error, _ = is_model_on_hub(
|
77 |
+
model_name=base_model, revision=revision, token=HF_TOKEN, test_tokenizer=True
|
78 |
)
|
79 |
if not base_model_on_hub:
|
80 |
return styled_error(f'Base model "{base_model}" {error}')
|
|
|
120 |
"model": model,
|
121 |
"base_model": base_model,
|
122 |
"revision": model_info.sha, # force to use the exact model commit
|
|
|
123 |
"precision": precision,
|
124 |
"params": model_size,
|
125 |
"architectures": architecture,
|
|
|
147 |
print("Creating eval file")
|
148 |
OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
149 |
os.makedirs(OUT_DIR, exist_ok=True)
|
150 |
+
out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
|
151 |
|
152 |
with open(out_path, "w") as f:
|
153 |
f.write(json.dumps(eval_entry))
|
|
|
161 |
commit_message=f"Add {model} to eval queue",
|
162 |
)
|
163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
# Remove the local file
|
165 |
os.remove(out_path)
|
166 |
|
src/tools/collections.py
DELETED
@@ -1,76 +0,0 @@
|
|
1 |
-
import pandas as pd
|
2 |
-
from huggingface_hub import add_collection_item, delete_collection_item, get_collection, update_collection_item
|
3 |
-
from huggingface_hub.utils._errors import HfHubHTTPError
|
4 |
-
from pandas import DataFrame
|
5 |
-
|
6 |
-
from src.display.utils import AutoEvalColumn, ModelType
|
7 |
-
from src.envs import H4_TOKEN, PATH_TO_COLLECTION
|
8 |
-
|
9 |
-
# Specific intervals for the collections
|
10 |
-
intervals = {
|
11 |
-
"1B": pd.Interval(0, 1.5, closed="right"),
|
12 |
-
"3B": pd.Interval(2.5, 3.5, closed="neither"),
|
13 |
-
"7B": pd.Interval(6, 8, closed="neither"),
|
14 |
-
"13B": pd.Interval(10, 14, closed="neither"),
|
15 |
-
"30B": pd.Interval(25, 35, closed="neither"),
|
16 |
-
"65B": pd.Interval(60, 70, closed="neither"),
|
17 |
-
}
|
18 |
-
|
19 |
-
|
20 |
-
def _filter_by_type_and_size(df, model_type, size_interval):
|
21 |
-
"""Filter DataFrame by model type and parameter size interval."""
|
22 |
-
type_emoji = model_type.value.symbol[0]
|
23 |
-
filtered_df = df[df[AutoEvalColumn.model_type_symbol.name] == type_emoji]
|
24 |
-
params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
|
25 |
-
mask = params_column.apply(lambda x: x in size_interval)
|
26 |
-
return filtered_df.loc[mask]
|
27 |
-
|
28 |
-
|
29 |
-
def _add_models_to_collection(collection, models, model_type, size):
|
30 |
-
"""Add best models to the collection and update positions."""
|
31 |
-
cur_len_collection = len(collection.items)
|
32 |
-
for ix, model in enumerate(models, start=1):
|
33 |
-
try:
|
34 |
-
collection = add_collection_item(
|
35 |
-
PATH_TO_COLLECTION,
|
36 |
-
item_id=model,
|
37 |
-
item_type="model",
|
38 |
-
exists_ok=True,
|
39 |
-
note=f"Best {model_type.to_str(' ')} model of around {size} on the leaderboard today!",
|
40 |
-
token=H4_TOKEN,
|
41 |
-
)
|
42 |
-
# Ensure position is correct if item was added
|
43 |
-
if len(collection.items) > cur_len_collection:
|
44 |
-
item_object_id = collection.items[-1].item_object_id
|
45 |
-
update_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_object_id, position=ix)
|
46 |
-
cur_len_collection = len(collection.items)
|
47 |
-
break # assuming we only add the top model
|
48 |
-
except HfHubHTTPError:
|
49 |
-
continue
|
50 |
-
|
51 |
-
|
52 |
-
def update_collections(df: DataFrame):
|
53 |
-
"""Update collections by filtering and adding the best models."""
|
54 |
-
collection = get_collection(collection_slug=PATH_TO_COLLECTION, token=H4_TOKEN)
|
55 |
-
cur_best_models = []
|
56 |
-
|
57 |
-
for model_type in ModelType:
|
58 |
-
if not model_type.value.name:
|
59 |
-
continue
|
60 |
-
for size, interval in intervals.items():
|
61 |
-
filtered_df = _filter_by_type_and_size(df, model_type, interval)
|
62 |
-
best_models = list(
|
63 |
-
filtered_df.sort_values(AutoEvalColumn.average.name, ascending=False)[AutoEvalColumn.fullname.name][:10]
|
64 |
-
)
|
65 |
-
print(model_type.value.symbol, size, best_models)
|
66 |
-
_add_models_to_collection(collection, best_models, model_type, size)
|
67 |
-
cur_best_models.extend(best_models)
|
68 |
-
|
69 |
-
# Cleanup
|
70 |
-
existing_models = {item.item_id for item in collection.items}
|
71 |
-
to_remove = existing_models - set(cur_best_models)
|
72 |
-
for item_id in to_remove:
|
73 |
-
try:
|
74 |
-
delete_collection_item(collection_slug=PATH_TO_COLLECTION, item_object_id=item_id, token=H4_TOKEN)
|
75 |
-
except HfHubHTTPError:
|
76 |
-
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/{scripts → tools}/create_request_file.py
RENAMED
File without changes
|
src/tools/model_backlinks.py
CHANGED
@@ -630,7 +630,7 @@ models = [
|
|
630 |
"WizardLM/WizardMath-7B-V1.0",
|
631 |
"Norquinal/llama-2-7b-claude-chat",
|
632 |
"TheTravellingEngineer/llama2-7b-chat-hf-dpo",
|
633 |
-
"
|
634 |
"joehuangx/spatial-vicuna-7b-v1.5-LoRA",
|
635 |
"conceptofmind/LLongMA-2-13b-16k",
|
636 |
"tianyil1/denas-llama2",
|
@@ -1039,7 +1039,7 @@ models = [
|
|
1039 |
"bhenrym14/airoboros-33b-gpt4-1.4.1-PI-8192-fp16",
|
1040 |
"EleutherAI/gpt-neo-2.7B",
|
1041 |
"danielhanchen/open_llama_3b_600bt_preview",
|
1042 |
-
"
|
1043 |
"pythainlp/wangchanglm-7.5B-sft-en-sharded",
|
1044 |
"beaugogh/pythia-1.4b-deduped-sharegpt",
|
1045 |
"HWERI/pythia-1.4b-deduped-sharegpt",
|
|
|
630 |
"WizardLM/WizardMath-7B-V1.0",
|
631 |
"Norquinal/llama-2-7b-claude-chat",
|
632 |
"TheTravellingEngineer/llama2-7b-chat-hf-dpo",
|
633 |
+
"open-llm-leaderboard/starchat-beta",
|
634 |
"joehuangx/spatial-vicuna-7b-v1.5-LoRA",
|
635 |
"conceptofmind/LLongMA-2-13b-16k",
|
636 |
"tianyil1/denas-llama2",
|
|
|
1039 |
"bhenrym14/airoboros-33b-gpt4-1.4.1-PI-8192-fp16",
|
1040 |
"EleutherAI/gpt-neo-2.7B",
|
1041 |
"danielhanchen/open_llama_3b_600bt_preview",
|
1042 |
+
"open-llm-leaderboard/starchat-alpha",
|
1043 |
"pythainlp/wangchanglm-7.5B-sft-en-sharded",
|
1044 |
"beaugogh/pythia-1.4b-deduped-sharegpt",
|
1045 |
"HWERI/pythia-1.4b-deduped-sharegpt",
|
src/tools/plots.py
CHANGED
@@ -6,10 +6,9 @@ from plotly.graph_objs import Figure
|
|
6 |
from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
|
7 |
from src.display.utils import human_baseline_row as HUMAN_BASELINE
|
8 |
from src.leaderboard.filter_models import FLAGGED_MODELS
|
9 |
-
from src.leaderboard.read_evals import EvalResult
|
10 |
|
11 |
|
12 |
-
def create_scores_df(
|
13 |
"""
|
14 |
Generates a DataFrame containing the maximum scores until each date.
|
15 |
|
@@ -17,8 +16,7 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
|
|
17 |
:return: A new DataFrame containing the maximum scores until each date for every metric.
|
18 |
"""
|
19 |
# Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
|
20 |
-
results_df = pd.
|
21 |
-
# results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
|
22 |
results_df.sort_values(by="date", inplace=True)
|
23 |
|
24 |
# Step 2: Initialize the scores dictionary
|
@@ -30,22 +28,18 @@ def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame:
|
|
30 |
last_date = ""
|
31 |
column = task.col_name
|
32 |
for _, row in results_df.iterrows():
|
33 |
-
current_model = row[
|
34 |
# We ignore models that are flagged/no longer on the hub/not finished
|
35 |
to_ignore = (
|
36 |
-
not row[
|
37 |
-
or not row[
|
38 |
or current_model in FLAGGED_MODELS
|
39 |
-
or row["status"] != "FINISHED"
|
40 |
)
|
41 |
if to_ignore:
|
42 |
continue
|
43 |
|
44 |
-
current_date = row[
|
45 |
-
|
46 |
-
current_score = np.mean(list(row["results"].values()))
|
47 |
-
else:
|
48 |
-
current_score = row["results"][task.benchmark]
|
49 |
|
50 |
if current_score > current_max:
|
51 |
if current_date == last_date and len(scores[column]) > 0:
|
|
|
6 |
from src.display.utils import BENCHMARK_COLS, AutoEvalColumn, Task, Tasks
|
7 |
from src.display.utils import human_baseline_row as HUMAN_BASELINE
|
8 |
from src.leaderboard.filter_models import FLAGGED_MODELS
|
|
|
9 |
|
10 |
|
11 |
+
def create_scores_df(results_df: list[dict]) -> pd.DataFrame:
|
12 |
"""
|
13 |
Generates a DataFrame containing the maximum scores until each date.
|
14 |
|
|
|
16 |
:return: A new DataFrame containing the maximum scores until each date for every metric.
|
17 |
"""
|
18 |
# Step 1: Ensure 'date' is in datetime format and sort the DataFrame by it
|
19 |
+
results_df["date"] = pd.to_datetime(results_df["date"], format="mixed", utc=True)
|
|
|
20 |
results_df.sort_values(by="date", inplace=True)
|
21 |
|
22 |
# Step 2: Initialize the scores dictionary
|
|
|
28 |
last_date = ""
|
29 |
column = task.col_name
|
30 |
for _, row in results_df.iterrows():
|
31 |
+
current_model = row[AutoEvalColumn.fullname.name]
|
32 |
# We ignore models that are flagged/no longer on the hub/not finished
|
33 |
to_ignore = (
|
34 |
+
not row[AutoEvalColumn.still_on_hub.name]
|
35 |
+
or not row[AutoEvalColumn.not_flagged.name]
|
36 |
or current_model in FLAGGED_MODELS
|
|
|
37 |
)
|
38 |
if to_ignore:
|
39 |
continue
|
40 |
|
41 |
+
current_date = row[AutoEvalColumn.date.name]
|
42 |
+
current_score = row[task.col_name]
|
|
|
|
|
|
|
43 |
|
44 |
if current_score > current_max:
|
45 |
if current_date == last_date and len(scores[column]) > 0:
|