Spaces:
Running
Running
Commit
·
2e5b82a
1
Parent(s):
8964485
add mascot and starred bench from lighteval metadata
Browse files
app.py
CHANGED
|
@@ -17,23 +17,6 @@ registry = Registry(custom_tasks=None, load_multilingual=True)
|
|
| 17 |
modules_data = registry.get_tasks_dump()
|
| 18 |
|
| 19 |
|
| 20 |
-
star_benchmarks = [
|
| 21 |
-
"aime",
|
| 22 |
-
"mmlu_pro",
|
| 23 |
-
"gpqa",
|
| 24 |
-
"hle",
|
| 25 |
-
"arc_agi_2",
|
| 26 |
-
"ifbench",
|
| 27 |
-
"ifeval",
|
| 28 |
-
"live code bench",
|
| 29 |
-
"math 500",
|
| 30 |
-
"mix_eval",
|
| 31 |
-
"musr",
|
| 32 |
-
"simpleqa",
|
| 33 |
-
"MMLU pro"
|
| 34 |
-
]
|
| 35 |
-
|
| 36 |
-
|
| 37 |
@dataclass
|
| 38 |
class TaskDoc:
|
| 39 |
module: str
|
|
@@ -44,6 +27,7 @@ class TaskDoc:
|
|
| 44 |
dataset: str | None
|
| 45 |
name: str | None = None
|
| 46 |
task_names: list[str] = field(default_factory=list)
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
def _module_to_github_path(module: str) -> str:
|
|
@@ -70,6 +54,7 @@ def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
|
|
| 70 |
tgs = [t.lower() for t in docstring.get("tags", [])]
|
| 71 |
paper = docstring.get("paper", "").strip() or None
|
| 72 |
name = docstring.get("name", "").strip() or None
|
|
|
|
| 73 |
|
| 74 |
# Convert dataset array to comma-separated string
|
| 75 |
dataset_list = docstring.get("dataset", [])
|
|
@@ -93,7 +78,8 @@ def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
|
|
| 93 |
paper=paper,
|
| 94 |
dataset=dataset,
|
| 95 |
name=name,
|
| 96 |
-
task_names=task_names
|
|
|
|
| 97 |
))
|
| 98 |
|
| 99 |
languages_sorted = [
|
|
@@ -112,33 +98,6 @@ def normalize_name_for_matching(name: str) -> str:
|
|
| 112 |
return re.sub(r"[_\s:]+", "", name.lower())
|
| 113 |
|
| 114 |
|
| 115 |
-
def is_starred_benchmark(td: TaskDoc) -> bool:
|
| 116 |
-
"""Check if task is a starred benchmark."""
|
| 117 |
-
module_parts = td.module.split(".")
|
| 118 |
-
base_no_ext = module_parts[-1] if module_parts else ""
|
| 119 |
-
fallback_name = module_parts[-2] if base_no_ext == "main" and len(module_parts) >= 2 else base_no_ext
|
| 120 |
-
|
| 121 |
-
task_name_raw = (td.name or "").lower().strip()
|
| 122 |
-
task_name_display = (td.name or fallback_name).replace("_", " ").lower().strip()
|
| 123 |
-
normalized_task_display = normalize_name_for_matching(task_name_display)
|
| 124 |
-
normalized_module = normalize_name_for_matching(base_no_ext)
|
| 125 |
-
normalized_name = normalize_name_for_matching(task_name_raw)
|
| 126 |
-
normalized_dataset = normalize_name_for_matching(td.dataset or "")
|
| 127 |
-
|
| 128 |
-
for star_name in star_benchmarks:
|
| 129 |
-
normalized_star = normalize_name_for_matching(star_name)
|
| 130 |
-
if (normalized_star == normalized_task_display or
|
| 131 |
-
normalized_star == normalized_module or
|
| 132 |
-
normalized_star == normalized_name or
|
| 133 |
-
normalized_star in normalized_task_display or
|
| 134 |
-
normalized_star in normalized_module or
|
| 135 |
-
(normalized_dataset and normalized_star in normalized_dataset) or
|
| 136 |
-
star_name.lower() in task_name_display or
|
| 137 |
-
star_name.lower() in base_no_ext.lower()):
|
| 138 |
-
return True
|
| 139 |
-
return False
|
| 140 |
-
|
| 141 |
-
|
| 142 |
def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
|
| 143 |
"""Filter tasks by languages, tags, and search query."""
|
| 144 |
selected_langs = [lang.lower() for lang in (languages or [])]
|
|
@@ -157,7 +116,7 @@ def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[Tas
|
|
| 157 |
continue
|
| 158 |
out.append(td)
|
| 159 |
|
| 160 |
-
out.sort(key=lambda td: (not
|
| 161 |
return out
|
| 162 |
|
| 163 |
|
|
@@ -240,7 +199,7 @@ def render_cards(tasks: list[TaskDoc]) -> str:
|
|
| 240 |
if len(datasets) > 6:
|
| 241 |
dataset_links.append(f'<span class="dataset-more">+{len(datasets) - 6} more</span>')
|
| 242 |
dataset_html = " ".join(dataset_links) if dataset_links else ""
|
| 243 |
-
star_icon = "⭐ " if
|
| 244 |
|
| 245 |
# Display evaluation task names (max 3 visible, with dropdown for more)
|
| 246 |
# Group task names by prefix to collapse shared prefixes
|
|
@@ -697,6 +656,13 @@ with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
|
|
| 697 |
show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
|
| 698 |
tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
|
| 699 |
gr.Markdown("Tip: use the filters and search together. Results update live.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
|
| 701 |
with gr.Column(scale=5):
|
| 702 |
cards = gr.HTML()
|
|
|
|
| 17 |
modules_data = registry.get_tasks_dump()
|
| 18 |
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
@dataclass
|
| 21 |
class TaskDoc:
|
| 22 |
module: str
|
|
|
|
| 27 |
dataset: str | None
|
| 28 |
name: str | None = None
|
| 29 |
task_names: list[str] = field(default_factory=list)
|
| 30 |
+
starred: bool = False
|
| 31 |
|
| 32 |
|
| 33 |
def _module_to_github_path(module: str) -> str:
|
|
|
|
| 54 |
tgs = [t.lower() for t in docstring.get("tags", [])]
|
| 55 |
paper = docstring.get("paper", "").strip() or None
|
| 56 |
name = docstring.get("name", "").strip() or None
|
| 57 |
+
starred = docstring.get("starred", False)
|
| 58 |
|
| 59 |
# Convert dataset array to comma-separated string
|
| 60 |
dataset_list = docstring.get("dataset", [])
|
|
|
|
| 78 |
paper=paper,
|
| 79 |
dataset=dataset,
|
| 80 |
name=name,
|
| 81 |
+
task_names=task_names,
|
| 82 |
+
starred=starred
|
| 83 |
))
|
| 84 |
|
| 85 |
languages_sorted = [
|
|
|
|
| 98 |
return re.sub(r"[_\s:]+", "", name.lower())
|
| 99 |
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
|
| 102 |
"""Filter tasks by languages, tags, and search query."""
|
| 103 |
selected_langs = [lang.lower() for lang in (languages or [])]
|
|
|
|
| 116 |
continue
|
| 117 |
out.append(td)
|
| 118 |
|
| 119 |
+
out.sort(key=lambda td: (not td.starred, (td.name or td.module).lower()))
|
| 120 |
return out
|
| 121 |
|
| 122 |
|
|
|
|
| 199 |
if len(datasets) > 6:
|
| 200 |
dataset_links.append(f'<span class="dataset-more">+{len(datasets) - 6} more</span>')
|
| 201 |
dataset_html = " ".join(dataset_links) if dataset_links else ""
|
| 202 |
+
star_icon = "⭐ " if t.starred else ""
|
| 203 |
|
| 204 |
# Display evaluation task names (max 3 visible, with dropdown for more)
|
| 205 |
# Group task names by prefix to collapse shared prefixes
|
|
|
|
| 656 |
show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
|
| 657 |
tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
|
| 658 |
gr.Markdown("Tip: use the filters and search together. Results update live.")
|
| 659 |
+
gr.Image(
|
| 660 |
+
value="measuring_model_size.png",
|
| 661 |
+
label="",
|
| 662 |
+
show_label=False,
|
| 663 |
+
container=False,
|
| 664 |
+
show_download_button=False
|
| 665 |
+
)
|
| 666 |
|
| 667 |
with gr.Column(scale=5):
|
| 668 |
cards = gr.HTML()
|