burtenshaw committed
Commit 1c7c01e · 1 Parent(s): b5eec3d

update app to use lighteval format

Files changed (2):
  1. app.py +40 -8
  2. docs.md +8 -1
app.py CHANGED
@@ -9,7 +9,7 @@ from datasets import load_dataset
 abs_path = Path(__file__).parent
 submissions = json.load(open(abs_path / "submissions.json"))
 
-TASKS = ["mmlu"]
+TASKS = [("gsm8k", "lighteval|gsm8k|0", "extractive_match")]
 TYPES = [
     "markdown",
     "markdown",
@@ -21,14 +21,45 @@ COLUMNS = ["User", "Model Name", "MMLU", "Average ⬆️", "Results"]
 WIDTHS = ["25%", "25%", "15%", "15%", "10%"]
 
 
+def load_results(dataset):
+    results = []
+
+    try:
+        output = dataset["latest"]["results"]
+        output = output[-1]
+    except KeyError as e:
+        raise ValueError("Cannot find 'latest' key in the dataset")
+
+    try:
+        output = json.loads(output)
+    except ValueError as e:
+        raise ValueError("Cannot parse the output as JSON")
+
+    for name, task, metric in TASKS:
+        try:
+            output = output[task]
+        except KeyError as e:
+            raise ValueError(f"Cannot find '{task}' key in the dataset")
+
+        try:
+            output = (name, output[metric])
+        except KeyError as e:
+            raise ValueError("Cannot find 'extractive_match' key in the dataset")
+
+        results.append(output)
+
+    return results
+
+
 def load_submissions():
     leaderboard = []
-
     for submission in submissions["submissions"]:
         ds = load_dataset(submission["results-dataset"], "results")
-        ds = ds.filter(lambda x: x["task"] in TASKS)
 
-        all_accuracy = []
+        try:
+            results = load_results(ds)
+        except ValueError as e:
+            raise ValueError(f"Cannot load results for {ds['results-dataset']}") from e
 
         leaderboard_row = {}
 
@@ -40,11 +71,12 @@ def load_submissions():
             f"[{submission['model_name']}](https://huggingface.co/{submission['model_name']})"
         )
 
-        for result in ds["train"]:
-            leaderboard_row[result["task"]] = result["accuracy"]
-            all_accuracy.append(result["accuracy"])
+        for name, result in results:
+            leaderboard_row[name] = result
 
-        leaderboard_row["Average ⬆️"] = sum(all_accuracy) / len(all_accuracy)
+        leaderboard_row["Average ⬆️"] = sum(result for _, result in results) / len(
+            results
+        )
 
         leaderboard_row["results-dataset"] = (
            f"[🔗](https://huggingface.co/datasets/{submission['results-dataset']})"
docs.md CHANGED
@@ -65,7 +65,14 @@ Open a pull request on the [leaderboard space](https://huggingface.co/spaces/smo
 ```json
 {
     "submissions": [
-
+        {
+            "username": "HuggingFaceTB",
+            "model_name": "SmolLM3-3B",
+            "chapter": "1",
+            "submission_date": "2025-09-02",
+            "results-dataset": "smol-course/details_HuggingFaceTB__SmolLM3-3B_private"
+        },
+
         ... # existing submissions
 
         {