wip added primary category tasks
Browse files
- src/about.py +14 -4
- src/display/utils.py +43 -23
- src/envs.py +2 -2
- src/leaderboard/read_evals.py +4 -0
src/about.py
CHANGED
|
@@ -10,10 +10,20 @@ class Task:
|
|
| 10 |
|
| 11 |
# Select your tasks here
|
| 12 |
# ---------------------------------------------------
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
NUM_FEWSHOT = 0 # Change with your few shot
|
| 19 |
# ---------------------------------------------------
|
|
|
|
| 10 |
|
| 11 |
# Select your tasks here
|
| 12 |
# ---------------------------------------------------
|
| 13 |
+
if False:
|
| 14 |
+
class Tasks(Enum):
|
| 15 |
+
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 16 |
+
task0 = Task("anli_r1", "acc", "ANLI")
|
| 17 |
+
task1 = Task("logiqa", "acc_norm", "LogiQA")
|
| 18 |
+
else:
|
| 19 |
+
class Tasks(Enum):
|
| 20 |
+
# task_key in the json file, metric_key in the json file, name to display in the leaderboard
|
| 21 |
+
# NOTE: the task_key must be unique
|
| 22 |
+
task0 = Task("default_primary_subfield_accuracy", "primary_subfield_accuracy", "Accuracy of predicting the released primary category on the default split")
|
| 23 |
+
task1 = Task("default_primary_top_3_tpr", "primary_top_3_tpr", "Occurrence of the released primary category in the top-3 predictions on the default split")
|
| 24 |
+
task2 = Task("default_primary_top_5_tpr", "primary_top_5_tpr", "Occurrence of the released primary category in the top-5 predictions on the default split")
|
| 25 |
+
task3 = Task("default_primary_top_10_tpr", "primary_top_10_tpr", "Occurrence of the released primary category in the top-10 predictions on the default split")
|
| 26 |
+
# task1 = Task("all2023_v2", "acc", "Acc on all 2023 papers")
|
| 27 |
|
| 28 |
NUM_FEWSHOT = 0 # Change with your few shot
|
| 29 |
# ---------------------------------------------------
|
src/display/utils.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from dataclasses import dataclass, make_dataclass
|
| 2 |
from enum import Enum
|
| 3 |
|
| 4 |
import pandas as pd
|
|
@@ -21,27 +21,46 @@ class ColumnContent:
|
|
| 21 |
never_hidden: bool = False
|
| 22 |
|
| 23 |
## Leaderboard columns
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
auto_eval_column_dict.append(["
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
auto_eval_column_dict.append(["
|
| 35 |
-
auto_eval_column_dict.append(["
|
| 36 |
-
auto_eval_column_dict.append(["
|
| 37 |
-
auto_eval_column_dict.append(["
|
| 38 |
-
auto_eval_column_dict.append(["
|
| 39 |
-
auto_eval_column_dict.append(["
|
| 40 |
-
auto_eval_column_dict.append(["
|
| 41 |
-
auto_eval_column_dict.append(["
|
| 42 |
-
|
| 43 |
-
# We use make dataclass to dynamically fill the scores from Tasks
|
| 44 |
-
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
## For the queue columns in the submission tab
|
| 47 |
@dataclass(frozen=True)
|
|
@@ -107,4 +126,5 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
|
|
| 107 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 108 |
|
| 109 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
| 110 |
-
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass, make_dataclass, field
|
| 2 |
from enum import Enum
|
| 3 |
|
| 4 |
import pandas as pd
|
|
|
|
| 21 |
never_hidden: bool = False
|
| 22 |
|
| 23 |
## Leaderboard columns
|
| 24 |
+
if False:
|
| 25 |
+
auto_eval_column_dict = []
|
| 26 |
+
# Init
|
| 27 |
+
auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
|
| 28 |
+
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
|
| 29 |
+
#Scores
|
| 30 |
+
auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
|
| 31 |
+
for task in Tasks:
|
| 32 |
+
auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
|
| 33 |
+
# Model information
|
| 34 |
+
auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
|
| 35 |
+
auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
|
| 36 |
+
auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
|
| 37 |
+
auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
|
| 38 |
+
auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
|
| 39 |
+
auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
|
| 40 |
+
auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
|
| 41 |
+
auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
|
| 42 |
+
auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
|
| 43 |
+
# We use make dataclass to dynamically fill the scores from Tasks
|
| 44 |
+
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
|
| 45 |
+
|
| 46 |
+
@dataclass(frozen=True)
|
| 47 |
+
class AutoEvalColumn:
|
| 48 |
+
model_type_symbol = ColumnContent("T", "str", True, never_hidden=True)
|
| 49 |
+
model = ColumnContent("Model", "markdown", True, never_hidden=True)
|
| 50 |
+
average = ColumnContent("Average ⬆️", "number", True)
|
| 51 |
+
model_type = ColumnContent("Type", "str", False)
|
| 52 |
+
architecture = ColumnContent("Architecture", "str", False)
|
| 53 |
+
weight_type = ColumnContent("Weight type", "str", False, True)
|
| 54 |
+
precision = ColumnContent("Precision", "str", False)
|
| 55 |
+
license = ColumnContent("Hub License", "str", False)
|
| 56 |
+
params = ColumnContent("#Params (B)", "number", False)
|
| 57 |
+
likes = ColumnContent("Hub ❤️", "number", False)
|
| 58 |
+
still_on_hub = ColumnContent("Available on the hub", "bool", False)
|
| 59 |
+
revision = ColumnContent("Model sha", "str", False, False)
|
| 60 |
+
# Dynamically add task columns
|
| 61 |
+
def __init__(self):
|
| 62 |
+
for task in Tasks:
|
| 63 |
+
setattr(self.__class__, task.name, ColumnContent(task.value.col_name, "number", True))
|
| 64 |
|
| 65 |
## For the queue columns in the submission tab
|
| 66 |
@dataclass(frozen=True)
|
|
|
|
| 126 |
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
|
| 127 |
|
| 128 |
BENCHMARK_COLS = [t.value.col_name for t in Tasks]
|
| 129 |
+
# NOTE: add the benchmark columns to the COLS since they are dynamically added
|
| 130 |
+
COLS += BENCHMARK_COLS
|
src/envs.py
CHANGED
|
@@ -11,8 +11,8 @@ OWNER = "mlcore"
|
|
| 11 |
# ----------------------------------
|
| 12 |
|
| 13 |
REPO_ID = f"{OWNER}/arxiv-classifier-leaderboard"
|
| 14 |
-
QUEUE_REPO = f"{OWNER}/requests"
|
| 15 |
-
RESULTS_REPO = f"{OWNER}/results"
|
| 16 |
|
| 17 |
# If you set up a cache later, just change HF_HOME
|
| 18 |
CACHE_PATH=os.getenv("HF_HOME", ".")
|
|
|
|
| 11 |
# ----------------------------------
|
| 12 |
|
| 13 |
REPO_ID = f"{OWNER}/arxiv-classifier-leaderboard"
|
| 14 |
+
QUEUE_REPO = f"{OWNER}/arxiv-classifier-leaderboard-requests"
|
| 15 |
+
RESULTS_REPO = f"{OWNER}/arxiv-classifier-leaderboard-results"
|
| 16 |
|
| 17 |
# If you set up a cache later, just change HF_HOME
|
| 18 |
CACHE_PATH=os.getenv("HF_HOME", ".")
|
src/leaderboard/read_evals.py
CHANGED
|
@@ -176,6 +176,9 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
| 176 |
for model_result_filepath in model_result_filepaths:
|
| 177 |
# Creation of result
|
| 178 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
|
|
|
|
|
|
|
|
|
| 179 |
eval_result.update_with_request_file(requests_path)
|
| 180 |
|
| 181 |
# Store results of same eval together
|
|
@@ -187,6 +190,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
|
|
| 187 |
|
| 188 |
results = []
|
| 189 |
for v in eval_results.values():
|
|
|
|
| 190 |
try:
|
| 191 |
v.to_dict() # we test if the dict version is complete
|
| 192 |
results.append(v)
|
|
|
|
| 176 |
for model_result_filepath in model_result_filepaths:
|
| 177 |
# Creation of result
|
| 178 |
eval_result = EvalResult.init_from_json_file(model_result_filepath)
|
| 179 |
+
# TODO: populate requests repo with request files corresponding to llama3-8b_primary
|
| 180 |
+
# Current output of `python app.py`:
|
| 181 |
+
# Could not find request file for None/llama3-8b_primary with precision ?
|
| 182 |
eval_result.update_with_request_file(requests_path)
|
| 183 |
|
| 184 |
# Store results of same eval together
|
|
|
|
| 190 |
|
| 191 |
results = []
|
| 192 |
for v in eval_results.values():
|
| 193 |
+
# import pdb; pdb.set_trace()
|
| 194 |
try:
|
| 195 |
v.to_dict() # we test if the dict version is complete
|
| 196 |
results.append(v)
|