Spaces:
Running
Running
burtenshaw
committed on
Commit
·
1c7c01e
1
Parent(s):
b5eec3d
update app to use lighteval format
Browse files
app.py
CHANGED
@@ -9,7 +9,7 @@ from datasets import load_dataset
|
|
9 |
abs_path = Path(__file__).parent
|
10 |
submissions = json.load(open(abs_path / "submissions.json"))
|
11 |
|
12 |
-
TASKS = ["
|
13 |
TYPES = [
|
14 |
"markdown",
|
15 |
"markdown",
|
@@ -21,14 +21,45 @@ COLUMNS = ["User", "Model Name", "MMLU", "Average ⬆️", "Results"]
|
|
21 |
WIDTHS = ["25%", "25%", "15%", "15%", "10%"]
|
22 |
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
def load_submissions():
|
25 |
leaderboard = []
|
26 |
-
|
27 |
for submission in submissions["submissions"]:
|
28 |
ds = load_dataset(submission["results-dataset"], "results")
|
29 |
-
ds = ds.filter(lambda x: x["task"] in TASKS)
|
30 |
|
31 |
-
|
|
|
|
|
|
|
32 |
|
33 |
leaderboard_row = {}
|
34 |
|
@@ -40,11 +71,12 @@ def load_submissions():
|
|
40 |
f"[{submission['model_name']}](https://huggingface.co/{submission['model_name']})"
|
41 |
)
|
42 |
|
43 |
-
for result in
|
44 |
-
leaderboard_row[
|
45 |
-
all_accuracy.append(result["accuracy"])
|
46 |
|
47 |
-
leaderboard_row["Average ⬆️"] = sum(
|
|
|
|
|
48 |
|
49 |
leaderboard_row["results-dataset"] = (
|
50 |
f"[🔗](https://huggingface.co/datasets/{submission['results-dataset']})"
|
|
|
9 |
abs_path = Path(__file__).parent
|
10 |
submissions = json.load(open(abs_path / "submissions.json"))
|
11 |
|
12 |
+
TASKS = [("gsm8k", "lighteval|gsm8k|0", "extractive_match")]
|
13 |
TYPES = [
|
14 |
"markdown",
|
15 |
"markdown",
|
|
|
21 |
WIDTHS = ["25%", "25%", "15%", "15%", "10%"]
|
22 |
|
23 |
|
24 |
+
def load_results(dataset, tasks=None):
    """Extract one metric value per leaderboard task from a results dataset.

    Args:
        dataset: Object indexable as ``dataset["latest"]["results"]``, yielding
            a sequence whose last element is a JSON-encoded string.
            NOTE(review): presumably the object returned by ``load_dataset``
            for a lighteval results repo -- confirm against the caller.
        tasks: Optional iterable of ``(name, task, metric)`` triples. ``name``
            labels the result, ``task`` is the key looked up in the parsed
            JSON, and ``metric`` is the key looked up inside that task entry.
            Defaults to the module-level ``TASKS``.

    Returns:
        A list of ``(name, value)`` tuples, one per task, in task order.

    Raises:
        ValueError: If the ``latest``/``results`` lookup fails, the results
            are empty, the last entry is not valid JSON, or a task or metric
            key is missing. The underlying exception is chained as the cause.
    """
    if tasks is None:
        tasks = TASKS

    try:
        raw = dataset["latest"]["results"][-1]
    except KeyError as e:
        raise ValueError("Cannot find 'latest' key in the dataset") from e
    except IndexError as e:
        raise ValueError("The 'latest' results of the dataset are empty") from e

    try:
        parsed = json.loads(raw)
    except (ValueError, TypeError) as e:
        # json.JSONDecodeError is a ValueError; TypeError covers a non-string
        # entry, so callers only ever have to handle ValueError.
        raise ValueError("Cannot parse the output as JSON") from e

    results = []
    for name, task, metric in tasks:
        # Look up each task in the parsed document itself; the parsed data
        # must not be overwritten, or every task after the first would fail.
        try:
            task_scores = parsed[task]
        except KeyError as e:
            raise ValueError(f"Cannot find '{task}' key in the dataset") from e

        try:
            value = task_scores[metric]
        except KeyError as e:
            raise ValueError(f"Cannot find '{metric}' key in the dataset") from e

        results.append((name, value))

    return results
|
52 |
+
|
53 |
+
|
54 |
def load_submissions():
|
55 |
leaderboard = []
|
|
|
56 |
for submission in submissions["submissions"]:
|
57 |
ds = load_dataset(submission["results-dataset"], "results")
|
|
|
58 |
|
59 |
+
try:
|
60 |
+
results = load_results(ds)
|
61 |
+
except ValueError as e:
|
62 |
+
raise ValueError(f"Cannot load results for {ds['results-dataset']}") from e
|
63 |
|
64 |
leaderboard_row = {}
|
65 |
|
|
|
71 |
f"[{submission['model_name']}](https://huggingface.co/{submission['model_name']})"
|
72 |
)
|
73 |
|
74 |
+
for name, result in results:
|
75 |
+
leaderboard_row[name] = result
|
|
|
76 |
|
77 |
+
leaderboard_row["Average ⬆️"] = sum(result for _, result in results) / len(
|
78 |
+
results
|
79 |
+
)
|
80 |
|
81 |
leaderboard_row["results-dataset"] = (
|
82 |
f"[🔗](https://huggingface.co/datasets/{submission['results-dataset']})"
|
docs.md
CHANGED
@@ -65,7 +65,14 @@ Open a pull request on the [leaderboard space](https://huggingface.co/spaces/smo
|
|
65 |
```json
|
66 |
{
|
67 |
"submissions": [
|
68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
... # existing submissions
|
70 |
|
71 |
{
|
|
|
65 |
```json
|
66 |
{
|
67 |
"submissions": [
|
68 |
+
{
|
69 |
+
"username": "HuggingFaceTB",
|
70 |
+
"model_name": "SmolLM3-3B",
|
71 |
+
"chapter": "1",
|
72 |
+
"submission_date": "2025-09-02",
|
73 |
+
"results-dataset": "smol-course/details_HuggingFaceTB__SmolLM3-3B_private"
|
74 |
+
},
|
75 |
+
|
76 |
... # existing submissions
|
77 |
|
78 |
{
|