xeon27
committed on
Commit
·
5438c77
1
Parent(s):
e1d7bbb
Add relevant model links
Browse files
- refactor_eval_results.py +38 -15
refactor_eval_results.py
CHANGED
|
@@ -16,12 +16,30 @@ METRIC_NAME = {
|
|
| 16 |
"math": "accuracy",
|
| 17 |
"mmlu": "accuracy",
|
| 18 |
"mmlu_pro": "accuracy",
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# agentic
|
| 21 |
"gaia": "mean",
|
| 22 |
"gdm_intercode_ctf": "accuracy",
|
| 23 |
}
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def combine_eval_results(results_path: str, model_name: str) -> dict:
|
| 27 |
results = dict(
|
|
@@ -29,7 +47,7 @@ def combine_eval_results(results_path: str, model_name: str) -> dict:
|
|
| 29 |
"config": {
|
| 30 |
"model_name": model_name,
|
| 31 |
# dummy keys
|
| 32 |
-
"model_sha": model_name,
|
| 33 |
"model_dtype": "torch.float16",
|
| 34 |
},
|
| 35 |
"results": {},
|
|
@@ -38,21 +56,26 @@ def combine_eval_results(results_path: str, model_name: str) -> dict:
|
|
| 38 |
for file in os.listdir(os.path.join(results_path, model_name)):
|
| 39 |
if file.endswith(".json"):
|
| 40 |
with open(os.path.join(results_path, model_name, file), "r") as f:
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
| 53 |
}
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
| 56 |
return results
|
| 57 |
|
| 58 |
|
|
|
|
| 16 |
"math": "accuracy",
|
| 17 |
"mmlu": "accuracy",
|
| 18 |
"mmlu_pro": "accuracy",
|
| 19 |
+
"mmmu_multiple_choice": "accuracy",
|
| 20 |
+
"mmmu_open": "accuracy",
|
| 21 |
|
| 22 |
# agentic
|
| 23 |
"gaia": "mean",
|
| 24 |
"gdm_intercode_ctf": "accuracy",
|
| 25 |
}
|
| 26 |
|
| 27 |
+
MODEL_SHA_MAP = {
|
| 28 |
+
# open source models
|
| 29 |
+
"c4ai-command-r-plus": "https://huggingface.co/CohereForAI/c4ai-command-r-plus", # TODO: verify for the 08-2024 version
|
| 30 |
+
"Meta-Llama-3.1-70B-Instruct": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
|
| 31 |
+
"Mistral-Large-Instruct-2407": "https://huggingface.co/mistralai/Mistral-Large-Instruct-2407",
|
| 32 |
+
"Qwen2.5-72B-Instruct": "https://huggingface.co/Qwen/Qwen2.5-72B-Instruct",
|
| 33 |
+
|
| 34 |
+
# closed source models
|
| 35 |
+
"claude-3-5-sonnet-20241022": "https://www.anthropic.com/claude/sonnet",
|
| 36 |
+
"gemini-1.5-flash": "https://deepmind.google/technologies/gemini/flash", # TODO: points to 2.0, can't find page for 1.5
|
| 37 |
+
"gemini-1.5-pro": "https://deepmind.google/technologies/gemini/pro",
|
| 38 |
+
"gpt-4o": "https://openai.com/index/hello-gpt-4o",
|
| 39 |
+
"gpt-4o-mini": "https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence",
|
| 40 |
+
"o1": "https://openai.com/o1",
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
|
| 44 |
def combine_eval_results(results_path: str, model_name: str) -> dict:
|
| 45 |
results = dict(
|
|
|
|
| 47 |
"config": {
|
| 48 |
"model_name": model_name,
|
| 49 |
# dummy keys
|
| 50 |
+
"model_sha": MODEL_SHA_MAP[model_name],
|
| 51 |
"model_dtype": "torch.float16",
|
| 52 |
},
|
| 53 |
"results": {},
|
|
|
|
| 56 |
for file in os.listdir(os.path.join(results_path, model_name)):
|
| 57 |
if file.endswith(".json"):
|
| 58 |
with open(os.path.join(results_path, model_name, file), "r") as f:
|
| 59 |
+
try:
|
| 60 |
+
result = json.load(f)
|
| 61 |
+
task_name = result["eval"]["task"].split("/")[-1]
|
| 62 |
+
if task_name == "math":
|
| 63 |
+
metrics = [elm for elm in result["results"]["scores"] if elm["name"] == "expression_equivalance"][0]["metrics"] # TODO: change scorer if required
|
| 64 |
+
else:
|
| 65 |
+
metrics = result["results"]["scores"][0]["metrics"]
|
| 66 |
+
metric_name = metrics[METRIC_NAME[task_name]]["name"]
|
| 67 |
+
metric_value = metrics[METRIC_NAME[task_name]]["value"]
|
| 68 |
+
results["results"].update(
|
| 69 |
+
{
|
| 70 |
+
task_name: {
|
| 71 |
+
metric_name: metric_value
|
| 72 |
+
}
|
| 73 |
}
|
| 74 |
+
)
|
| 75 |
+
except KeyError as e:
|
| 76 |
+
print(f"KeyError: {e}")
|
| 77 |
+
print(model_name)
|
| 78 |
+
print(file)
|
| 79 |
return results
|
| 80 |
|
| 81 |
|