pourbahman committed
Commit 319bc4a · Parent: 582aadb

Update Space New

README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Demo Leaderboard
+title: Alignment
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
@@ -7,6 +7,7 @@ sdk: gradio
 app_file: app.py
 pinned: true
 license: apache-2.0
+short_description: alignment leader board
 ---
 
 # Start the configuration
app.py CHANGED
@@ -33,13 +33,13 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
 ### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
+# try:
+#     print(EVAL_REQUESTS_PATH)
+#     snapshot_download(
+#         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+#     )
+# except Exception:
+#     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
@@ -51,11 +51,11 @@ except Exception:
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+# (
+#     finished_eval_queue_df,
+#     running_eval_queue_df,
+#     pending_eval_queue_df,
+# ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -106,41 +106,41 @@ with demo:
             with gr.Row():
                 gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
-            with gr.Column():
-                with gr.Accordion(
-                    f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                    open=False,
-                ):
-                    with gr.Row():
-                        finished_eval_table = gr.components.Dataframe(
-                            value=finished_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            row_count=5,
-                        )
-                with gr.Accordion(
-                    f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                    open=False,
-                ):
-                    with gr.Row():
-                        running_eval_table = gr.components.Dataframe(
-                            value=running_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            row_count=5,
-                        )
-
-                with gr.Accordion(
-                    f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                    open=False,
-                ):
-                    with gr.Row():
-                        pending_eval_table = gr.components.Dataframe(
-                            value=pending_eval_queue_df,
-                            headers=EVAL_COLS,
-                            datatype=EVAL_TYPES,
-                            row_count=5,
-                        )
+            # with gr.Column():
+            #     with gr.Accordion(
+            #         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+            #         open=False,
+            #     ):
+            #         with gr.Row():
+            #             finished_eval_table = gr.components.Dataframe(
+            #                 value=finished_eval_queue_df,
+            #                 headers=EVAL_COLS,
+            #                 datatype=EVAL_TYPES,
+            #                 row_count=5,
+            #             )
+            #     with gr.Accordion(
+            #         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+            #         open=False,
+            #     ):
+            #         with gr.Row():
+            #             running_eval_table = gr.components.Dataframe(
+            #                 value=running_eval_queue_df,
+            #                 headers=EVAL_COLS,
+            #                 datatype=EVAL_TYPES,
+            #                 row_count=5,
+            #             )
+
+            #     with gr.Accordion(
+            #         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+            #         open=False,
+            #     ):
+            #         with gr.Row():
+            #             pending_eval_table = gr.components.Dataframe(
+            #                 value=pending_eval_queue_df,
+            #                 headers=EVAL_COLS,
+            #                 datatype=EVAL_TYPES,
+            #                 row_count=5,
+            #             )
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
 
@@ -199,6 +199,6 @@ with demo:
             )
 
 scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.add_job(restart_space, "interval", seconds=24*3600)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=1).launch()
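
Note on the last hunk: the automatic restart interval grows from 1800 seconds (every 30 minutes) to 24*3600 = 86,400 seconds (once a day), and the Gradio queue's default_concurrency_limit drops from 40 to 1. For reference, a minimal self-contained sketch of the APScheduler interval-job pattern this relies on; the heartbeat job below is a hypothetical stand-in for restart_space:

from apscheduler.schedulers.background import BackgroundScheduler
import time

def heartbeat():
    # Hypothetical placeholder for restart_space(): just logs a timestamp.
    print("tick", time.strftime("%H:%M:%S"))

scheduler = BackgroundScheduler()
# 24 * 3600 = 86,400 seconds, so the job fires once per day.
scheduler.add_job(heartbeat, "interval", seconds=24 * 3600)
scheduler.start()
# In app.py, demo.queue(...).launch() keeps the process alive so the scheduled job can fire.
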
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-APScheduler
+apscheduler
 black
 datasets
 gradio
@@ -6,11 +6,9 @@ gradio[oauth]
 gradio_leaderboard==0.0.13
 gradio_client
 huggingface-hub>=0.18.0
-matplotlib
 numpy
 pandas
 python-dateutil
 tqdm
-transformers
-tokenizers>=0.15.0
-sentencepiece
+sentencepiece
+transformers
src/about.py CHANGED
@@ -12,8 +12,10 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("task_name", "safty", "safty")
+    task1 = Task("task_name2", "fairness", "fairness")
+    task2 = Task("task_name3", "socail-norm", "socail-norm")
+
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
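
This hunk swaps the template's ANLI and LogiQA entries for three alignment benchmarks. For orientation, a self-contained sketch of how these entries are consumed, assuming the Task dataclass defined just above this hunk keeps the template's benchmark / metric / col_name fields:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str  # task key expected in the results json
    metric: str     # metric key expected in the results json
    col_name: str   # column header shown on the leaderboard

class Tasks(Enum):
    task0 = Task("task_name", "safty", "safty")
    task1 = Task("task_name2", "fairness", "fairness")
    task2 = Task("task_name3", "socail-norm", "socail-norm")

# read_evals.py matches task.value.benchmark against keys in the uploaded results
# json and averages task.value.metric, so these strings have to line up with the
# files pushed to the results dataset.
for task in Tasks:
    print(task.name, task.value.benchmark, task.value.metric, task.value.col_name)
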
src/display/utils.py CHANGED
@@ -26,7 +26,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avrage", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
src/envs.py CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "fatmerajabi11" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
+REPO_ID = f"{OWNER}/alignment"
 QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+RESULTS_REPO = f"{OWNER}/alignment_results"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
src/leaderboard/read_evals.py CHANGED
@@ -1,15 +1,16 @@
 import glob
 import json
-import math
+# import math
 import os
 from dataclasses import dataclass
+import shutil
 
 import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
+# from src.submission.check_validity import is_model_on_hub
 
 
 @dataclass
@@ -57,14 +58,14 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
+        # still_on_hub, _, model_config = is_model_on_hub(
+        #     full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+        # )
         architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
+        # if model_config is not None:
+        #     architectures = getattr(model_config, "architectures", None)
+        #     if architectures:
+        #         architecture = ";".join(architectures)
 
         # Extract results available in this file (some results are split in several files)
         results = {}
@@ -78,7 +79,7 @@
 
         mean_acc = np.mean(accs) * 100.0
         results[task.benchmark] = mean_acc
-
+        
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -87,25 +88,26 @@ class EvalResult:
             results=results,
             precision=precision,
             revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
+            # it use still_on_hub param where it comment in upper lines
+            still_on_hub=False,
             architecture=architecture
         )
 
-    def update_with_request_file(self, requests_path):
-        """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-        try:
-            with open(request_file, "r") as f:
-                request = json.load(f)
-            self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = WeightType[request.get("weight_type", "Original")]
-            self.license = request.get("license", "?")
-            self.likes = request.get("likes", 0)
-            self.num_params = request.get("params", 0)
-            self.date = request.get("submitted_time", "")
-        except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+    # def update_with_request_file(self, requests_path):
+    #     """Finds the relevant request file for the current model and updates info with it"""
+    #     request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+
+    #     try:
+    #         with open(request_file, "r") as f:
+    #             request = json.load(f)
+    #         self.model_type = ModelType.from_str(request.get("model_type", ""))
+    #         self.weight_type = WeightType[request.get("weight_type", "Original")]
+    #         self.license = request.get("license", "?")
+    #         self.likes = request.get("likes", 0)
+    #         self.num_params = request.get("params", 0)
+    #         self.date = request.get("submitted_time", "")
+    #     except Exception:
+    #         print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -132,51 +134,46 @@ class EvalResult:
         return data_dict
 
 
-def get_request_file_for_model(requests_path, model_name, precision):
-    """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
-    request_files = os.path.join(
-        requests_path,
-        f"{model_name}_eval_request_*.json",
-    )
-    request_files = glob.glob(request_files)
-
-    # Select correct request file (precision)
-    request_file = ""
-    request_files = sorted(request_files, reverse=True)
-    for tmp_request_file in request_files:
-        with open(tmp_request_file, "r") as f:
-            req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
-                request_file = tmp_request_file
-    return request_file
+# def get_request_file_for_model(requests_path, model_name, precision):
+#     """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
+#     request_files = os.path.join(
+#         requests_path,
+#         f"{model_name}_eval_request_*.json",
+#     )
+#     request_files = glob.glob(request_files)
+
+#     # Select correct request file (precision)
+#     request_file = ""
+#     request_files = sorted(request_files, reverse=True)
+#     for tmp_request_file in request_files:
+#         with open(tmp_request_file, "r") as f:
+#             req_content = json.load(f)
+#             if (
+#                 req_content["status"] in ["FINISHED"]
+#                 and req_content["precision"] == precision.split(".")[-1]
+#             ):
+#                 request_file = tmp_request_file
+#     return request_file
 
 
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
-    for root, _, files in os.walk(results_path):
-        # We should only have json files in model results
-        if len(files) == 0 or any([not f.endswith(".json") for f in files]):
-            continue
-
-        # Sort the files by date
-        try:
-            files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
-        except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
-
-        for file in files:
-            model_result_filepaths.append(os.path.join(root, file))
-
+    files = glob.glob(os.path.join(results_path, "*.json"), recursive=True)
+    # Sort the files by date
+    try:
+        files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
+    except dateutil.parser._parser.ParserError:
+        files = [files[-1]]
+    for file in files:
+        model_result_filepaths.append(file)
+
    eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+        # eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
        eval_name = eval_result.eval_name
@@ -192,5 +189,6 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
            results.append(v)
        except KeyError: # not all eval values present
            continue
-
+
+    shutil.rmtree(results_path)
    return results
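
The rewritten get_raw_eval_results replaces the os.walk traversal with a flat glob over results_path, keeps the date-based sort, and removes the downloaded folder once it has been read (shutil.rmtree). A small worked example of the sort key, assuming result files follow the template-style results_<timestamp>.json naming:

# Assumed filename pattern; the real files come from the results dataset.
name = "results_2024-05-01T12-30-45.123456.json"

key = name.removesuffix(".json").removeprefix("results_")[:-7]
print(key)  # "2024-05-01T12-30-45": the trailing ".123456" (7 characters) is sliced off

# glob.glob() returns full paths, so removeprefix("results_") becomes a no-op there;
# lexicographic order still tracks the timestamp while all files sit in one directory.
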
src/populate.py CHANGED
@@ -12,11 +12,9 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
-
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
-
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
     return df
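
get_leaderboard_df only loses two blank lines here, but it is the consumer of the renamed column: the frame is sorted by AutoEvalColumn.average.name, which after this commit presumably resolves to the "Avrage" header from src/display/utils.py, and rows missing any benchmark score are dropped. A toy sketch of that shaping with made-up records, where has_no_nan_values is approximated with a notna() mask and the column names follow this commit's tasks:

import pandas as pd

records = [
    {"Model": "m1", "Avrage": 71.2345, "safty": 70.0, "fairness": 72.469, "socail-norm": None},
    {"Model": "m2", "Avrage": 64.5, "safty": 60.0, "fairness": 69.0, "socail-norm": 64.5},
]
df = pd.DataFrame.from_records(records)
df = df.sort_values(by=["Avrage"], ascending=False)  # stands in for AutoEvalColumn.average.name
df = df.round(decimals=2)
# same effect as has_no_nan_values(df, benchmark_cols): keep rows with every benchmark present
df = df[df[["safty", "fairness", "socail-norm"]].notna().all(axis=1)]
print(df)  # only m2 remains; m1 lacks a socail-norm score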