Commit 319bc4a · Parent: 582aadb
Update Space

Files changed:
- README.md +2 -1
- app.py +49 -49
- requirements.txt +3 -5
- src/about.py +4 -2
- src/display/utils.py +1 -1
- src/envs.py +3 -3
- src/leaderboard/read_evals.py +60 -62
- src/populate.py +0 -2
README.md
CHANGED
@@ -1,5 +1,5 @@
 ---
-title:
+title: Alignment
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
@@ -7,6 +7,7 @@ sdk: gradio
 app_file: app.py
 pinned: true
 license: apache-2.0
+short_description: alignment leader board
 ---
 
 # Start the configuration
app.py
CHANGED
@@ -33,13 +33,13 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 ### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
@@ -51,11 +51,11 @@ except Exception:
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+# (
+#     finished_eval_queue_df,
+#     running_eval_queue_df,
+#     pending_eval_queue_df,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -106,41 +106,41 @@ with demo:
     with gr.Row():
         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
-    with gr.Column():
-        with gr.Accordion(
-            f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-            open=False,
-        ):
-            with gr.Row():
-                finished_eval_table = gr.components.Dataframe(
-                    value=finished_eval_queue_df,
-                    headers=EVAL_COLS,
-                    datatype=EVAL_TYPES,
-                    row_count=5,
-                )
-        with gr.Accordion(
-            f"🚀 Running Evaluation Queue ({len(running_eval_queue_df)})",
-            open=False,
-        ):
-            with gr.Row():
-                running_eval_table = gr.components.Dataframe(
-                    value=running_eval_queue_df,
-                    headers=EVAL_COLS,
-                    datatype=EVAL_TYPES,
-                    row_count=5,
-                )
-
-        with gr.Accordion(
-            f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-            open=False,
-        ):
-            with gr.Row():
-                pending_eval_table = gr.components.Dataframe(
-                    value=pending_eval_queue_df,
-                    headers=EVAL_COLS,
-                    datatype=EVAL_TYPES,
-                    row_count=5,
-                )
+    # with gr.Column():
+    #     with gr.Accordion(
+    #         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+    #         open=False,
+    #     ):
+    #         with gr.Row():
+    #             finished_eval_table = gr.components.Dataframe(
+    #                 value=finished_eval_queue_df,
+    #                 headers=EVAL_COLS,
+    #                 datatype=EVAL_TYPES,
+    #                 row_count=5,
+    #             )
+    #     with gr.Accordion(
+    #         f"🚀 Running Evaluation Queue ({len(running_eval_queue_df)})",
+    #         open=False,
+    #     ):
+    #         with gr.Row():
+    #             running_eval_table = gr.components.Dataframe(
+    #                 value=running_eval_queue_df,
+    #                 headers=EVAL_COLS,
+    #                 datatype=EVAL_TYPES,
+    #                 row_count=5,
+    #             )
+
+    #     with gr.Accordion(
+    #         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+    #         open=False,
+    #     ):
+    #         with gr.Row():
+    #             pending_eval_table = gr.components.Dataframe(
+    #                 value=pending_eval_queue_df,
+    #                 headers=EVAL_COLS,
+    #                 datatype=EVAL_TYPES,
+    #                 row_count=5,
+    #             )
     with gr.Row():
         gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
 
@@ -199,6 +199,6 @@ with demo:
     )
 
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=
+scheduler.add_job(restart_space, "interval", seconds=24*3600)
 scheduler.start()
-demo.queue(default_concurrency_limit=
+demo.queue(default_concurrency_limit=1).launch()
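The net effect of the app.py changes is that the request-queue download and the evaluation-queue accordions are commented out, the Space now restarts itself once a day, and the Gradio queue is limited to a single concurrent worker. A minimal standalone sketch of that scheduler-plus-queue pattern, not the Space's full app.py; restart_space here only prints so the sketch runs without an HF token, whereas the real app calls API.restart_space(repo_id=REPO_ID):

import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler

def restart_space():
    # Stand-in for the Space's restart hook; the real app uses huggingface_hub.HfApi.restart_space.
    print("restart requested")

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=24 * 3600)  # once a day, as set in this commit
scheduler.start()

demo = gr.Interface(fn=lambda text: text, inputs="text", outputs="text")
demo.queue(default_concurrency_limit=1).launch()  # one concurrent worker, as set in this commit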
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
-
+apscheduler
 black
 datasets
 gradio
@@ -6,11 +6,9 @@ gradio[oauth]
 gradio_leaderboard==0.0.13
 gradio_client
 huggingface-hub>=0.18.0
-matplotlib
 numpy
 pandas
 python-dateutil
 tqdm
-
-
-sentencepiece
+sentencepiece
+transformers
src/about.py
CHANGED
@@ -12,8 +12,10 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("
-    task1 = Task("
+    task0 = Task("task_name", "safty", "safty")
+    task1 = Task("task_name2", "fairness", "fairness")
+    task2 = Task("task_name3", "socail-norm", "socail-norm")
+
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
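For context, the Tasks enum edited above follows the leaderboard template's Task dataclass, whose three fields are the task key in the results JSON, the metric key, and the display column name; the field names below come from the template and are an assumption, since they are not shown in this diff. A short self-contained sketch of how the three new entries drive leaderboard columns:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # task key in the results json
    metric: str     # metric key in the results json
    col_name: str   # column name shown in the leaderboard

class Tasks(Enum):
    # spellings kept exactly as committed ("safty", "socail-norm")
    task0 = Task("task_name", "safty", "safty")
    task1 = Task("task_name2", "fairness", "fairness")
    task2 = Task("task_name3", "socail-norm", "socail-norm")

# Each enum member's value becomes one leaderboard column, which is why
# src/display/utils.py iterates over Tasks below.
for task in Tasks:
    print(task.name, "->", task.value.benchmark, task.value.metric, task.value.col_name)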
src/display/utils.py
CHANGED
@@ -26,7 +26,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avrage", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
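The "Avrage" string added here is a display header, and it is also what populate.py later reads back as AutoEvalColumn.average.name when sorting. A hedged sketch of that pattern, assuming the template's usual ColumnContent dataclass and make_dataclass construction, which are not shown in this diff:

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avrage", "number", True)])

# The [field_name, type, default] triples become attributes of a generated
# dataclass, so other modules can refer to columns as AutoEvalColumn.<field>.name.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.average.name)  # -> "Avrage" (spelling as committed)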
src/envs.py
CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "
+OWNER = "fatmerajabi11" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/
+REPO_ID = f"{OWNER}/alignment"
 QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/
+RESULTS_REPO = f"{OWNER}/alignment_results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
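These values are consumed elsewhere in the Space: RESULTS_REPO is the dataset that snapshot_download pulls into the local results path, and REPO_ID is what restart_space targets. A hedged illustration of that usage; the local_dir below is illustrative and not taken from this commit:

import os
from huggingface_hub import HfApi, snapshot_download

TOKEN = os.environ.get("HF_TOKEN")
OWNER = "fatmerajabi11"
REPO_ID = f"{OWNER}/alignment"
RESULTS_REPO = f"{OWNER}/alignment_results"

API = HfApi(token=TOKEN)

# Pull the results dataset locally before building the leaderboard dataframe.
snapshot_download(repo_id=RESULTS_REPO, local_dir="./eval-results", repo_type="dataset", token=TOKEN)

# Restart the Space itself (used by the daily scheduler job in app.py).
API.restart_space(repo_id=REPO_ID)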
src/leaderboard/read_evals.py
CHANGED
@@ -1,15 +1,16 @@
 import glob
 import json
-import math
+# import math
 import os
 from dataclasses import dataclass
+import shutil
 
 import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
+# from src.submission.check_validity import is_model_on_hub
 
 
 @dataclass
@@ -57,14 +58,14 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
+        # still_on_hub, _, model_config = is_model_on_hub(
+        #     full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+        # )
         architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
+        # if model_config is not None:
+        #     architectures = getattr(model_config, "architectures", None)
+        #     if architectures:
+        #         architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -78,7 +79,7 @@ class EvalResult:
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
-
+
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -87,25 +88,26 @@ class EvalResult:
             results=results,
             precision=precision,
             revision= config.get("model_sha", ""),
-            still_on_hub
+            # it use still_on_hub param where it comment in upper lines
+            still_on_hub=False,
             architecture=architecture
         )
 
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+    # def update_with_request_file(self, requests_path):
+    #     """Finds the relevant request file for the current model and updates info with it"""
+    #     request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+
+    #     try:
+    #         with open(request_file, "r") as f:
+    #             request = json.load(f)
+    #         self.model_type = ModelType.from_str(request.get("model_type", ""))
+    #         self.weight_type = WeightType[request.get("weight_type", "Original")]
+    #         self.license = request.get("license", "?")
+    #         self.likes = request.get("likes", 0)
+    #         self.num_params = request.get("params", 0)
+    #         self.date = request.get("submitted_time", "")
+    #     except Exception:
+    #         print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -132,51 +134,46 @@ class EvalResult:
         return data_dict
 
 
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
+# def get_request_file_for_model(requests_path, model_name, precision):
+#     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+#     request_files = os.path.join(
+#         requests_path,
+#         f"{model_name}_eval_request_*.json",
+#     )
+#     request_files = glob.glob(request_files)
+
+#     # Select correct request file (precision)
+#     request_file = ""
+#     request_files = sorted(request_files, reverse=True)
+#     for tmp_request_file in request_files:
+#         with open(tmp_request_file, "r") as f:
+#             req_content = json.load(f)
+#             if (
+#                 req_content["status"] in ["FINISHED"]
+#                 and req_content["precision"] == precision.split(".")[-1]
+#             ):
+#                 request_file = tmp_request_file
+#     return request_file
 
 
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
    model_result_filepaths = []
 
-
-
-
-
-
-
-
-
-
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
+    files = glob.glob(os.path.join(results_path, "*.json"), recursive=True)
+    # Sort the files by date
+    try:
+        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+    except dateutil.parser._parser.ParserError:
+        files = [files[-1]]
+    for file in files:
+        model_result_filepaths.append(file)
+
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+        # eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
         eval_name = eval_result.eval_name
@@ -192,5 +189,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             results.append(v)
         except KeyError: # not all eval values present
             continue
-
+
+    shutil.rmtree(results_path)
     return results
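Two behavioural notes on the rewritten get_raw_eval_results: it now globs *.json files directly under results_path instead of walking sub-directories and checking request files, and it deletes the local results folder with shutil.rmtree once parsing is done, so the dataframe can only be rebuilt after another snapshot_download. A small standalone sketch of the new discovery and cleanup flow; the file name and temp directory below are illustrative:

import glob
import json
import os
import shutil
import tempfile

results_path = tempfile.mkdtemp()
sample = os.path.join(results_path, "results_demo-org__demo-model_2024-05-01T00-00-00.000000.json")
with open(sample, "w") as f:
    json.dump({"results": {}}, f)

files = glob.glob(os.path.join(results_path, "*.json"), recursive=True)
# Same sort key as the diff: strip ".json", strip a leading "results_" if present,
# then drop the last 7 characters before comparing.
files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])

for file in files:
    print("would parse", file)

shutil.rmtree(results_path)  # the commit removes the downloaded results after reading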
src/populate.py
CHANGED
@@ -12,11 +12,9 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
-
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
-
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     return df
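get_leaderboard_df itself is unchanged apart from the two deleted blank lines; it still builds a dataframe from the per-model dicts, sorts by the average column, rounds, and drops rows missing any benchmark. A hedged, self-contained sketch of that shaping, with illustrative column names standing in for the AutoEvalColumn and Tasks values:

import pandas as pd

all_data_json = [
    {"Model": "model-a", "Avrage": 71.234, "safty": 70.0, "fairness": 72.468},
    {"Model": "model-b", "Avrage": 65.0, "safty": 60.0, "fairness": None},
]
cols = ["Model", "Avrage", "safty", "fairness"]
benchmark_cols = ["safty", "fairness"]

df = pd.DataFrame.from_records(all_data_json)
df = df.sort_values(by=["Avrage"], ascending=False)
df = df[cols].round(decimals=2)
# equivalent of has_no_nan_values: keep only rows with every benchmark present
df = df[df[benchmark_cols].notna().all(axis=1)]
print(df)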