AIR-Bench (Hugging Face Space)

Commit 57ca843 by nan (1 parent: 6855ddb)

chore: clean up app.py

Files changed (4):
  1. app.py            +22 -122
  2. requirements.txt   +2 -2
  3. src/envs.py        +3 -3
  4. src/populate.py    +3 -2
app.py CHANGED

@@ -1,13 +1,9 @@
-import subprocess
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download

 from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
@@ -17,40 +13,40 @@ from src.display.utils import (
     BENCHMARK_COLS,
     COLS,
     EVAL_COLS,
-    EVAL_TYPES,
     NUMERIC_INTERVALS,
     TYPES,
     AutoEvalColumn,
     ModelType,
     fields,
-    WeightType,
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval


 def restart_space():
     API.restart_space(repo_id=REPO_ID)

+
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+        token=TOKEN
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30,
+        token=TOKEN
     )
 except Exception:
     restart_space()

-
-raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+raw_data, original_df = get_leaderboard_df(
+    EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()

 (
@@ -62,13 +58,13 @@ leaderboard_df = original_df.copy()

 # Searching and filtering
 def update_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    show_deleted: bool,
-    query: str,
+        hidden_df: pd.DataFrame,
+        columns: list,
+        type_query: list,
+        precision_query: str,
+        size_query: list,
+        show_deleted: bool,
+        query: str,
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
@@ -87,8 +83,8 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     ]
     # We use COLS to maintain sorting
     filtered_df = df[
-        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
-    ]
+        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
+    ]
     return filtered_df


@@ -112,7 +108,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:


 def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
+        df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
 ) -> pd.DataFrame:
     # Show all models
     if show_deleted:
@@ -168,7 +164,7 @@ with demo:
                             value=False, label="Show gated/private/deleted models", interactive=True
                         )
                 with gr.Column(min_width=320):
-                    #with gr.Box(elem_id="box-filter"):
+                    # with gr.Box(elem_id="box-filter"):
                     filter_columns_type = gr.CheckboxGroup(
                         label="Model types",
                         choices=[t.to_str() for t in ModelType],
@@ -195,7 +191,7 @@ with demo:
                 value=leaderboard_df[
                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                     + shown_columns.value
-                ],
+                ],
                 headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
                 datatype=TYPES,
                 elem_id="leaderboard-table",
@@ -223,7 +219,8 @@ with demo:
                 ],
                 leaderboard_table,
             )
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
+            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size,
+                             deleted_models_visibility]:
                 selector.change(
                     update_table,
                     [
@@ -242,104 +239,7 @@ with demo:
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
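After the cleanup, app.py keeps the same startup flow: clone the requests and results datasets, build the leaderboard dataframe, serve the Gradio UI, and restart the Space on a schedule so new results are picked up. A minimal sketch of that pattern, using only names that appear in the diff; the download_or_restart helper is hypothetical, added here just to avoid repeating the two guarded snapshot_download calls:

from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN


def restart_space():
    # Restarting the Space re-runs app.py, which re-clones both datasets.
    API.restart_space(repo_id=REPO_ID)


def download_or_restart(repo_id: str, local_dir: str) -> None:
    # Hypothetical helper mirroring the two try/except blocks in app.py.
    try:
        snapshot_download(
            repo_id=repo_id, local_dir=local_dir, repo_type="dataset",
            tqdm_class=None, etag_timeout=30, token=TOKEN,
        )
    except Exception:
        restart_space()


download_or_restart(QUEUE_REPO, EVAL_REQUESTS_PATH)
download_or_restart(RESULTS_REPO, EVAL_RESULTS_PATH)

# Refresh every 30 minutes; demo.queue(...).launch() then serves the app.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()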
requirements.txt CHANGED

@@ -13,6 +13,6 @@ requests==2.28.2
 tqdm==4.65.0
 transformers==4.35.2
 tokenizers>=0.15.0
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
 accelerate==0.24.1
-sentencepiece
+sentencepiece
+socksio==1.0.0
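The functional changes here are dropping the pinned lm-evaluation-harness build and adding socksio. socksio is the optional SOCKS backend used by httpx (the HTTP client underneath Gradio's client stack); httpx raises an ImportError when a socks:// proxy is configured and socksio is missing, which is the likely reason for pinning it. A self-contained sanity check, purely illustrative:

# Confirm the SOCKS backend pinned in requirements.txt is importable in the Space's environment.
import importlib.util

print("socksio available:", importlib.util.find_spec("socksio") is not None)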
src/envs.py CHANGED

@@ -4,9 +4,9 @@ from huggingface_hub import HfApi

 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("TOKEN")  # A read/write token for your org

-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "nan"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------

 REPO_ID = f"{OWNER}/leaderboard"
@@ -14,7 +14,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"

 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")

 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
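For reference, this is how the identifiers in src/envs.py resolve after the owner change. A plain illustration using only values visible in the diff; the Space still needs a TOKEN secret with access to these repositories:

import os

OWNER = "nan"
REPO_ID = f"{OWNER}/leaderboard"    # Space restarted by app.py: "nan/leaderboard"
QUEUE_REPO = f"{OWNER}/requests"    # dataset of evaluation requests: "nan/requests"
RESULTS_REPO = f"{OWNER}/results"   # dataset of evaluation results: "nan/results"

CACHE_PATH = os.getenv("HF_HOME", ".")  # set HF_HOME to relocate the local caches
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
print(REPO_ID, QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH)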
src/populate.py CHANGED

@@ -5,10 +5,11 @@ import pandas as pd

 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
+from src.leaderboard.read_evals import get_raw_eval_results, EvalResult
+from typing import Tuple


-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> Tuple[list[EvalResult], pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
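The new return annotation mixes typing.Tuple with the built-in generic list[EvalResult]. One caveat: subscripting list requires Python 3.9 or newer (or a `from __future__ import annotations` at the top of the module), because annotations are evaluated when the function is defined. A small self-contained sketch of the same pattern, with a stand-in EvalResult class, showing how callers unpack the pair the way app.py now does:

from typing import Tuple

import pandas as pd


class EvalResult:
    # Stand-in for src.leaderboard.read_evals.EvalResult, for illustration only.
    def to_dict(self) -> dict:
        return {"model": "demo", "score": 0.0}


def get_leaderboard_df_stub(results_path: str) -> Tuple[list[EvalResult], pd.DataFrame]:
    # Mirrors the new shape: raw per-run results plus the aggregated dataframe.
    raw_data = [EvalResult()]
    all_data_json = [v.to_dict() for v in raw_data]
    return raw_data, pd.DataFrame.from_records(all_data_json)


raw_data, original_df = get_leaderboard_df_stub("eval-results")  # unpacked as in app.py
print(original_df)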