Wwwduojin committed
Commit 5848f7f
1 Parent(s): a312f2f

Upload 2 files

Files changed (2):
  1. app.py +414 -21
  2. requirements.txt +18 -0
app.py CHANGED
@@ -1,24 +1,417 @@
-import gradio as gr
-
-def submit_model(model_name, model_description):
-    # The logic here handles the model submission.
-    # You need to add the model to the evaluation queue and return the submission status.
-    # This example function just returns a confirmation message.
-    return f"Model '{model_name}' has been submitted for evaluation. Description: {model_description}"
-
-# Create a Gradio form interface
-iface = gr.Interface(
-    fn=submit_model,  # the function that submits the model
-    inputs=[  # input fields
-        gr.Textbox(label="Model name"),
-        gr.TextArea(label="Model description")
-    ],
-    outputs=[  # output fields
-        gr.Text(label="Submission status")
-    ],
-    title="Open MLLM Leaderboard",  # interface title
-    description="The Open LLM Leaderboard aims to track, rank, and evaluate LLMs and chatbots.",  # interface description
-    article="Submit a model for automated evaluation on the Open LLM Leaderboard on the 'Submit' page! The leaderboard's backend runs the great EleutherAI Language Model Evaluation Harness - read more details in the 'About' page!"  # footer article or extra info
-)
-
-iface.launch()
+# -*- coding: utf-8 -*-
+"""
+@Project -> File : leaderboard -> app.py
+@Author : Www多金
+@Date   : 2023/12/12 15:57
+@Desc   : study Machine Learning
+"""
+import gradio as gr
+import pandas as pd
+from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
+from gradio_space_ci import configure_space_ci  # FOR CI
+from src.display.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    FAQ_TEXT,
+    TITLE,
+)
+from src.display.css_html_js import custom_css
+from src.display.utils import (
+    BENCHMARK_COLS,
+    COLS,
+    EVAL_COLS,
+    EVAL_TYPES,
+    NUMERIC_INTERVALS,
+    TYPES,
+    AutoEvalColumn,
+    ModelType,
+    fields,
+    WeightType,
+    Precision
+)
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+from src.populate import get_evaluation_queue_df, get_leaderboard_df
+from src.submission.submit import add_new_eval
+from src.tools.collections import update_collections
+from src.tools.plots import (
+    create_metric_plot_obj,
+    create_plot_df,
+    create_scores_df,
+)
+
+def restart_space():
+    API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
+
+try:
+    print(EVAL_REQUESTS_PATH)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
+except Exception:
+    restart_space()
+try:
+    print(EVAL_RESULTS_PATH)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
+except Exception:
+    restart_space()
+
+raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+update_collections(original_df.copy())
+leaderboard_df = original_df.copy()
+
+plot_df = create_plot_df(create_scores_df(raw_data))
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+
+# Searching and filtering
+def update_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    type_query: list,
+    precision_query: str,
+    size_query: list,
+    show_deleted: bool,
+    query: str,
+):
+    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
+    filtered_df = filter_queries(query, filtered_df)
+    df = select_columns(filtered_df, columns)
+    return df
+
+def load_query(request: gr.Request):  # triggered only once at startup => read the query parameter if it exists
+    query = request.query_params.get("query") or ""
+    return query, query  # one for the "search_bar", one for a hidden component that triggers a reload only if the value has changed
+
+def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
+    return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
+
+def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    always_here_cols = [
+        AutoEvalColumn.model_type_symbol.name,
+        AutoEvalColumn.model.name,
+    ]
+    # We use COLS to maintain sorting
+    filtered_df = df[
+        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
+    ]
+    return filtered_df
+
+
+def filter_queries(query: str, filtered_df: pd.DataFrame):
+    """Added by Abishek"""
+    final_df = []
+    if query != "":
+        queries = [q.strip() for q in query.split(";")]
+        for _q in queries:
+            _q = _q.strip()
+            if _q != "":
+                temp_filtered_df = search_table(filtered_df, _q)
+                if len(temp_filtered_df) > 0:
+                    final_df.append(temp_filtered_df)
+        if len(final_df) > 0:
+            filtered_df = pd.concat(final_df)
+            filtered_df = filtered_df.drop_duplicates(
+                subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
+            )
+
+    return filtered_df
+
+def filter_models(
+    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
+) -> pd.DataFrame:
+    # Show all models
+    if show_deleted:
+        filtered_df = df
+    else:  # Show only models that are still on the hub
+        filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
+
+    type_emoji = [t[0] for t in type_query]
+    filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
+    filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
+
+    numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
+    params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
+    mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
+    filtered_df = filtered_df.loc[mask]
+
+    return filtered_df
+# def restart_space():
+#     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
+demo = gr.Blocks(css=custom_css)
+with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 MLLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if not c.hidden and not c.never_hidden and not c.dummy
+                            ],
+                            value=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        deleted_models_visibility = gr.Checkbox(
+                            value=False, label="Show gated/private/deleted models", interactive=True
+                        )
+                with gr.Column(min_width=320):
+                    # with gr.Box(elem_id="box-filter"):
+                    filter_columns_type = gr.CheckboxGroup(
+                        label="Model types",
+                        choices=[t.to_str() for t in ModelType],
+                        value=[t.to_str() for t in ModelType],
+                        interactive=True,
+                        elem_id="filter-columns-type",
+                    )
+                    filter_columns_precision = gr.CheckboxGroup(
+                        label="Precision",
+                        choices=[i.value.name for i in Precision],
+                        value=[i.value.name for i in Precision],
+                        interactive=True,
+                        elem_id="filter-columns-precision",
+                    )
+                    filter_columns_size = gr.CheckboxGroup(
+                        label="Model sizes (in billions of parameters)",
+                        choices=list(NUMERIC_INTERVALS.keys()),
+                        value=list(NUMERIC_INTERVALS.keys()),
+                        interactive=True,
+                        elem_id="filter-columns-size",
+                    )
+
+            leaderboard_table = gr.components.Dataframe(
+                value=leaderboard_df[
+                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+                    + shown_columns.value
+                    + [AutoEvalColumn.dummy.name]
+                ],
+                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+                column_widths=["2%", "33%"]
+            )
+
+            # Dummy leaderboard for handling the case when the user uses the backspace key
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=original_df[COLS],
+                headers=COLS,
+                datatype=TYPES,
+                visible=False,
+            )
+            search_bar.submit(
+                update_table,
+                [
+                    hidden_leaderboard_table_for_search,
+                    shown_columns,
+                    filter_columns_type,
+                    filter_columns_precision,
+                    filter_columns_size,
+                    deleted_models_visibility,
+                    search_bar,
+                ],
+                leaderboard_table,
+            )
+            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size,
+                             deleted_models_visibility]:
+                selector.change(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        filter_columns_type,
+                        filter_columns_precision,
+                        filter_columns_size,
+                        deleted_models_visibility,
+                        search_bar,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
+
+        # with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=4):
+        #     with gr.Row():
+        #         with gr.Column():
+        #             chart = create_metric_plot_obj(
+        #                 plot_df,
+        #                 [AutoEvalColumn.average.name],
+        #                 title="Average of Top Scores and Human Baseline Over Time (from last update)",
+        #             )
+        #             gr.Plot(value=chart, min_width=500)
+        #         with gr.Column():
+        #             chart = create_metric_plot_obj(
+        #                 plot_df,
+        #                 BENCHMARK_COLS,
+        #                 title="Top Scores and Human Baseline Over Time (from last update)",
+        #             )
+        #             gr.Plot(value=chart, min_width=500)
+        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+        #     gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
+        #
+        # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        #     with gr.Column():
+        #         with gr.Row():
+        #             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+        #
+        #         with gr.Column():
+        #             with gr.Accordion(
+        #                 f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     finished_eval_table = gr.components.Dataframe(
+        #                         value=finished_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #             with gr.Accordion(
+        #                 f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     running_eval_table = gr.components.Dataframe(
+        #                         value=running_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #
+        #             with gr.Accordion(
+        #                 f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+        #                 open=False,
+        #             ):
+        #                 with gr.Row():
+        #                     pending_eval_table = gr.components.Dataframe(
+        #                         value=pending_eval_queue_df,
+        #                         headers=EVAL_COLS,
+        #                         datatype=EVAL_TYPES,
+        #                         row_count=5,
+        #                     )
+        #         with gr.Row():
+        #             gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+        #
+        #         with gr.Row():
+        #             with gr.Column():
+        #                 model_name_textbox = gr.Textbox(label="Model name")
+        #                 revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+        #                 private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
+        #                 model_type = gr.Dropdown(
+        #                     choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+        #                     label="Model type",
+        #                     multiselect=False,
+        #                     value=None,
+        #                     interactive=True,
+        #                 )
+        #
+        #             with gr.Column():
+        #                 precision = gr.Dropdown(
+        #                     choices=[i.value.name for i in Precision if i != Precision.Unknown],
+        #                     label="Precision",
+        #                     multiselect=False,
+        #                     value="float16",
+        #                     interactive=True,
+        #                 )
+        #                 weight_type = gr.Dropdown(
+        #                     choices=[i.value.name for i in WeightType],
+        #                     label="Weights type",
+        #                     multiselect=False,
+        #                     value="Original",
+        #                     interactive=True,
+        #                 )
+        #                 base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+        #
+        #         submit_button = gr.Button("Submit Eval")
+        #         submission_result = gr.Markdown()
+        #         submit_button.click(
+        #             add_new_eval,
+        #             [
+        #                 model_name_textbox,
+        #                 base_model_name_textbox,
+        #                 revision_name_textbox,
+        #                 precision,
+        #                 private,
+        #                 weight_type,
+        #                 model_type,
+        #             ],
+        #             submission_result,
+        #         )
+
+    # with gr.Row():
+    #     with gr.Accordion("📙 Citation", open=False):
+    #         citation_button = gr.Textbox(
+    #             value=CITATION_BUTTON_TEXT,
+    #             label=CITATION_BUTTON_LABEL,
+    #             lines=20,
+    #             elem_id="citation-button",
+    #             show_copy_button=True,
+    #         )
+
+scheduler = BackgroundScheduler()
+scheduler.add_job(restart_space, "interval", seconds=1800)
+scheduler.start()
+# Launches both the space and its CI
+configure_space_ci(
+    demo.queue(default_concurrency_limit=40),
+    # trusted_authors=[],  # add manually trusted authors
+    # private="True",  # ephemeral spaces will have the same visibility as the main space; otherwise, set to `True` or `False` explicitly.
+    # variables={},  # we overwrite HF_HOME as tmp CI spaces will have no cache
+    # secrets=["HF_TOKEN", "H4_TOKEN"],  # which secrets to copy from the main space; can be a `List[str]`.
+    # hardware=None,  # "cpu-basic" by default; set to "auto" to use the same hardware as the main space, or any valid string value.
+    # storage=None,  # no storage by default; set to "auto" to use the same storage as the main space, or any valid string value.
+).launch()
+
+
+# scheduler = BackgroundScheduler()
+# scheduler.add_job(restart_space, "interval", seconds=1800)
+# scheduler.start()
+# def submit_model(model_name, model_description):
+#     # The logic here handles the model submission.
+#     # You need to add the model to the evaluation queue and return the submission status.
+#     # This example function just returns a confirmation message.
+#     return f"Model '{model_name}' has been submitted for evaluation. Description: {model_description}"
+#
+# # Create a Gradio form interface
+# iface = gr.Interface(
+#     fn=submit_model,  # the function that submits the model
+#     inputs=[  # input fields
+#         gr.Textbox(label="Model name"),
+#         gr.TextArea(label="Model description")
+#     ],
+#     outputs=[  # output fields
+#         gr.Text(label="Submission status")
+#     ],
+#     title="Open MLLM Leaderboard",  # interface title
+#     description="The Open LLM Leaderboard aims to track, rank, and evaluate LLMs and chatbots.",  # interface description
+#     article="Submit a model for automated evaluation on the Open LLM Leaderboard on the 'Submit' page! The leaderboard's backend runs the great EleutherAI Language Model Evaluation Harness - read more details in the 'About' page!"  # footer article or extra info
+# )
+#
+# iface.launch()
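
The core of the new app.py is the update_table pipeline: on every search or filter change it chains filter_models, filter_queries, and select_columns over a hidden copy of the full leaderboard DataFrame. A minimal, self-contained sketch of that same chaining pattern, with toy column names and data standing in for the AutoEvalColumn fields the real app uses:

import pandas as pd

# Toy leaderboard standing in for the hidden DataFrame the real app keeps
# in hidden_leaderboard_table_for_search.
df = pd.DataFrame({
    "model": ["org/model-a", "org/model-b", "org/model-c"],
    "precision": ["float16", "bfloat16", "float16"],
    "params": [7.0, 13.0, 70.0],
    "score": [61.2, 64.8, 71.5],
    "still_on_hub": [True, True, False],
})

def filter_models(df, precisions, max_params, show_deleted):
    # Drop models that left the hub unless the checkbox asks for them.
    out = df if show_deleted else df[df["still_on_hub"]]
    out = out[out["precision"].isin(precisions)]
    return out[out["params"] <= max_params]

def filter_queries(query, df):
    # `;` separates multiple search terms, as in the real search bar.
    parts = [q.strip() for q in query.split(";") if q.strip()]
    if not parts:
        return df
    hits = [df[df["model"].str.contains(q, case=False)] for q in parts]
    return pd.concat(hits).drop_duplicates(subset=["model", "precision"])

def select_columns(df, columns):
    always_here = ["model"]  # the real app also pins the model-type symbol column
    return df[always_here + [c for c in columns if c in df.columns and c not in always_here]]

# The same composition update_table performs on every search or filter change.
filtered = filter_models(df, ["float16", "bfloat16"], max_params=15, show_deleted=False)
filtered = filter_queries("model-a; model-b", filtered)
print(select_columns(filtered, ["score"]))

Keeping the unfiltered DataFrame in a hidden component means each event can re-filter from scratch rather than trying to undo previous filters, which is why backspacing in the search bar still works.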
requirements.txt ADDED
@@ -0,0 +1,18 @@
+APScheduler==3.10.1
+black==23.11.0
+click==8.1.3
+datasets==2.14.5
+gradio==4.4.0
+gradio_client==0.7.0
+huggingface-hub>=0.18.0
+matplotlib==3.7.1
+numpy==1.24.2
+pandas==2.0.0
+plotly==5.14.1
+python-dateutil==2.8.2
+requests==2.28.2
+sentencepiece
+tqdm==4.65.0
+transformers==4.36.0
+tokenizers>=0.15.0
+gradio-space-ci@git+https://huggingface.co/spaces/Wauplin/gradio-space-ci # CI !!!
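
Everything here except sentencepiece, the two `>=` ranges, and the git-sourced gradio-space-ci is pinned to an exact version. A rough sketch of a sanity check that a deployed environment matches those pins, using only the standard library (the dictionary below is a hand-picked subset of the pins, not the full list):

from importlib.metadata import PackageNotFoundError, version

# Exact pins copied from requirements.txt; the `>=` ranges and the git
# dependency are skipped since plain string equality cannot check them.
PINS = {
    "APScheduler": "3.10.1",
    "gradio": "4.4.0",
    "pandas": "2.0.0",
    "transformers": "4.36.0",
}

for pkg, expected in PINS.items():
    try:
        installed = version(pkg)
        status = "ok" if installed == expected else f"MISMATCH (installed {installed})"
    except PackageNotFoundError:
        status = "MISSING"
    print(f"{pkg}=={expected}: {status}")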