Delete compile_log_files.py
compile_log_files.py  +0 -308  DELETED
@@ -1,308 +0,0 @@
# Author: Martin Fajcik

import argparse
import copy
import glob
import hashlib
import os
import json
import re

import jsonlines
from tqdm import tqdm

SUPPORTED_METRICS = [
    "avg_mcauroc",  # for classification tasks
    "exact_match",  # for QA tasks
    "acc",  # for multichoice tasks
    "rouge_raw_r2_mid_f_without_bootstrap",  # for summarization tasks
    "rouge_raw_r2_mid_f",  # for summarization tasks, older metric version kept for backward compatibility
    "word_perplexity",  # for language modeling tasks
]
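# NOTE: the order of SUPPORTED_METRICS matters; it serves as a priority list
# when selecting the target metric for a task below (the first metric found
# in the results wins).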
EXTRA_INFO_RELEASE_KEYS = [
    'filtered_resps',
    'doc_id',
]

with open("leaderboard/metadata.json", "r") as f:
    METADATA = json.load(f)

# TASK MAP
# from promptname to taskname
MAP = {
    'benchmark_agree': 'benczechmark_agree',
    'benchmark_belebele': 'benczechmark_belebele',
    'benchmark_czechnews': 'benczechmark_czechnews',
    'benchmark_subjectivity': 'benczechmark_subjectivity',
    'benczechmark_snli': 'benczechmark_snli',
    'propaganda_argumentace': 'benczechmark_propaganda_argumentace',
    'propaganda_fabulace': 'benczechmark_propaganda_fabulace',
    'propaganda_nazor': 'benczechmark_propaganda_nazor',
    'propaganda_strach': 'benczechmark_propaganda_strach',
    'propaganda_zamereni': 'benczechmark_propaganda_zamereni',
    'propaganda_demonizace': 'benczechmark_propaganda_demonizace',
    'propaganda_lokace': 'benczechmark_propaganda_lokace',
    'propaganda_relativizace': 'benczechmark_propaganda_relativizace',
    'propaganda_vina': 'benczechmark_propaganda_vina',
    'propaganda_zanr': 'benczechmark_propaganda_zanr',
    'propaganda_emoce': 'benczechmark_propaganda_emoce',
    'propaganda_nalepkovani': 'benczechmark_propaganda_nalepkovani',
    'propaganda_rusko': 'benczechmark_propaganda_rusko',
    'benczechmark_sentiment_mall': 'benczechmark_sentiment_mall',
    'benczechmark_sentiment_fb': 'benczechmark_sentiment_fb',
    'benczechmark_sentiment_csfd': 'benczechmark_sentiment_csfd',
    'benczechmark_summarization': 'benczechmark_summarization',
    'gec': 'benczechmark_grammarerrorcorrection',
    'cs_nq_open': 'benczechmark_cs_naturalquestions',
    'cs_sqad_open': 'benczechmark_cs_sqad32',
    'cs_triviaqa': 'benczechmark_cs_triviaQA',
    'csfever': 'benczechmark_csfever_nli',
    'ctkfacts': 'benczechmark_ctkfacts_nli',
    'cnec_ner': 'benczechmark_cs_ner',
    'cdec_ner': 'benczechmark_cs_court_decisions_ner',
    'klokan_qa': 'benczechmark_klokan_qa',
    'umimeto_biology': 'benczechmark_umimeto_biology',
    'umimeto_chemistry': 'benczechmark_umimeto_chemistry',
    'umimeto_czech': 'benczechmark_umimeto_czech',
    'umimeto_history': 'benczechmark_umimeto_history',
    'umimeto_informatics': 'benczechmark_umimeto_informatics',
    'umimeto_math': 'benczechmark_umimeto_math',
    'umimeto_physics': 'benczechmark_umimeto_physics',
    'cermat_czech_open': 'benczechmark_cermat_czech_open',
    'cermat_czech_mc': 'benczechmark_cermat_czech_mc',
    'cermat_czech_tf': 'benczechmark_cermat_czech_tf',
    'cermat_czmath_open': 'benczechmark_cermat_czmath_open',
    'cermat_czmath_mc': 'benczechmark_cermat_czmath_mc',
    'history_ir': 'benczechmark_history_ir',
    'benczechmark_histcorpus': 'benczechmark_histcorpus',
    'benczechmark_hellaswag': 'benczechmark_hellaswag',
    'benczechmark_essay': 'benczechmark_essay',
    'benczechmark_fiction': 'benczechmark_fiction',
    'benczechmark_capek': 'benczechmark_capek',
    'benczechmark_correspondence': 'benczechmark_correspondence',
    'benczechmark_havlicek': 'benczechmark_havlicek',
    'benczechmark_speeches': 'benczechmark_speeches',
    'benczechmark_spoken': 'benczechmark_spoken',
    'benczechmark_dialect': 'benczechmark_dialect',
}

NO_PROMPT_TASKS = [
    "benczechmark_histcorpus",
    "benczechmark_hellaswag",
    "benczechmark_essay",
    "benczechmark_fiction",
    "benczechmark_capek",
    "benczechmark_correspondence",
    "benczechmark_havlicek",
    "benczechmark_speeches",
    "benczechmark_spoken",
    "benczechmark_dialect",
]


def resolve_taskname(taskname):
    if taskname not in MAP:
        raise ValueError(f"Taskname {taskname} not found.")
    return MAP[taskname]
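
# Illustrative call: resolve_taskname('gec') returns
# 'benczechmark_grammarerrorcorrection'; an unknown prompt name raises ValueError.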


def rename_keys(d, resolve_taskname):
    orig_len = len(d)
    for k, v in list(d.items()):
        new_key = resolve_taskname(k)
        d[new_key] = d.pop(k)

    # make sure the dict length didn't change (i.e., no two keys collapsed into one)
    assert len(d) == orig_len
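
# Sketch of rename_keys on hypothetical data:
#   d = {'gec': {...}, 'ctkfacts': {...}}
#   rename_keys(d, resolve_taskname)
#   # d == {'benczechmark_grammarerrorcorrection': {...},
#   #       'benczechmark_ctkfacts_nli': {...}}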


def process_harness_logs(input_folders, output_file):
    """
    - Selects the best prompt for each task
    - Extracts the data for that prompt which is necessary for the target metrics
    """

    def expand_input_folders(input_folders):
        # Check if input_folders is a wildcard pattern
        if '*' in input_folders or '?' in input_folders:
            # Expand the wildcard into a list of matching directories
            matching_directories = [f for f in glob.glob(input_folders) if os.path.isdir(f)]
            return matching_directories
        else:
            # If it's not a wildcard, return the input as a single-item list if it's a valid directory
            if os.path.isdir(input_folders):
                return [input_folders]
            else:
                return []
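
    # Illustrative behaviour (hypothetical paths): expand_input_folders("runs/model_*")
    # returns e.g. ["runs/model_a", "runs/model_b"]; a plain directory path is
    # wrapped in a single-item list, and anything else yields [].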

    input_folders = expand_input_folders(input_folders)

    per_task_results = {}
    metric_per_task = {}
    predictions = {}

    all_harness_results = dict()
    for input_folder in tqdm(input_folders, desc="Loading files"):
        # read all files in input_folder;
        # consider the first folder within this folder
        input_folder = os.path.join(input_folder, os.listdir(input_folder)[0])
        # find the file whose name starts with the "results" prefix in the input_folder
        result_file = [f for f in os.listdir(input_folder) if f.startswith("results")][0]
        with open(os.path.join(input_folder, result_file), "r") as f:
            harness_results = json.load(f)
        all_harness_results[list(harness_results['results'].values())[0]['alias']] = harness_results
        current_multipleprompt_tasknames = []
        for name, result in harness_results['results'].items():
            if name in NO_PROMPT_TASKS:
                # task without prompts
                taskname = name
                # process metric names ("metric,filter" -> "metric")
                for k, v in copy.deepcopy(result).items():
                    if "," in k:
                        metric_name, _ = k.split(",", 1)
                        del result[k]
                        result[metric_name] = v
                per_task_results[taskname] = result

            if result['alias'].strip().startswith('- prompt-'):
                # process taskname: strip the trailing prompt index and separator
                taskname = name[:-1]
                if taskname.endswith("_"):
                    taskname = taskname[:-1]

                # process metric names ("metric,filter" -> "metric")
                for k, v in copy.deepcopy(result).items():
                    if "," in k:
                        metric_name, _ = k.split(",", 1)
                        del result[k]
                        result[metric_name] = v

                if taskname not in per_task_results:
                    per_task_results[taskname] = [result]
                    current_multipleprompt_tasknames.append(taskname)
                else:
                    per_task_results[taskname].append(result)

        # get the best result according to the metric priority given in the SUPPORTED_METRICS list
        for taskname, results in per_task_results.items():
            if taskname not in current_multipleprompt_tasknames:
                continue
            best_result = None
            target_metric = None
            for m in SUPPORTED_METRICS:
                if m in results[0]:
                    target_metric = m
                    break
            if target_metric is None:
                raise ValueError(f"No supported metric found in {taskname}")
            metric_per_task[taskname] = target_metric

            all_measured_results = []
            for result in results:
                all_measured_results.append(result[target_metric])
                if best_result is None:
                    best_result = result
                else:
                    if result[target_metric] > best_result[target_metric]:
                        best_result = result
            # Compute max-centered variance (on a 0-100 scale)
            max_value = best_result[target_metric]
            squared_diffs = [(x * 100.0 - max_value * 100.0) ** 2 for x in all_measured_results]
            max_centered_variance = sum(squared_diffs) / (len(squared_diffs) - 1)
            best_result['max_centered_variance'] = max_centered_variance
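            # Worked example (hypothetical scores): per-prompt scores
            # [0.70, 0.72, 0.75] have max 0.75; the squared diffs on the
            # percentage scale are [25.0, 9.0, 0.0], so the max-centered
            # variance is (25.0 + 9.0 + 0.0) / 2 = 17.0. Note this requires
            # at least two prompts, otherwise the division is by zero.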

            per_task_results[taskname] = best_result

        for file in os.listdir(input_folder):
            if file == result_file or not file.startswith("samples") or not file.endswith(".jsonl"):
                continue
            for taskname in per_task_results.keys():
                if taskname in file:
                    print(f"Processing {os.path.join(input_folder, file)} for {taskname}")
                    # check that this file corresponds to the same (winning) prompt
                    winning_prompt = per_task_results[taskname]['alias'][-1]
                    if taskname in NO_PROMPT_TASKS:
                        current_prompt = "-1"
                    else:
                        try:
                            current_prompt = re.search(rf"{taskname}_(\d+)_", file).group(1)
                        except AttributeError:
                            raise ValueError(f"Prompt not found in {file}")
                    if winning_prompt == current_prompt or taskname in NO_PROMPT_TASKS:
                        # load the file contents
                        predictions[taskname] = list(jsonlines.open(os.path.join(input_folder, file)))
                        # only keep the data necessary for the metrics
                        for prediction in predictions[taskname]:
                            for key in list(prediction.keys()):
                                if key not in SUPPORTED_METRICS + EXTRA_INFO_RELEASE_KEYS:
                                    del prediction[key]

    # rename keys (tasknames) using resolve_taskname:
    rename_keys(predictions, resolve_taskname)
    rename_keys(per_task_results, resolve_taskname)

    # check that the keys in predictions and results are the same
    if not set(predictions.keys()) == set(per_task_results.keys()):
        print("Keys only in predictions:")
        print(set(predictions.keys()) - set(per_task_results.keys()))
        print("Keys only in results:")
        print(set(per_task_results.keys()) - set(predictions.keys()))
        raise ValueError("Keys in predictions and results are not the same")

    aggregated_predictions = dict()
    aggregated_predictions["predictions"] = predictions
    aggregated_predictions["results"] = per_task_results
    aggregated_predictions["metadata"] = {
        'git_hash': harness_results['git_hash'],
        'transformers_version': harness_results['transformers_version'],
        'tokenizer_pad_token': harness_results['tokenizer_pad_token'],
        'tokenizer_eos_token': harness_results['tokenizer_eos_token'],
        'tokenizer_bos_token': harness_results['tokenizer_bos_token'],
        'eot_token_id': harness_results['eot_token_id'],
        'max_length': harness_results['max_length'],
        'task_hashes': harness_results['task_hashes'],
        'model_source': harness_results['model_source'],
        'model_name': harness_results['model_name'],
        'model_name_sanitized': harness_results['model_name_sanitized'],
        'system_instruction': harness_results['system_instruction'],
        'system_instruction_sha': harness_results['system_instruction_sha'],
        'fewshot_as_multiturn': harness_results['fewshot_as_multiturn'],
        'chat_template': harness_results['chat_template'],
        'chat_template_sha': harness_results['chat_template_sha'],
        'total_evaluation_time_seconds': {k: v['total_evaluation_time_seconds'] for k, v in all_harness_results.items()},
        'n-shot': all_harness_results['CTKFacts NLI']['n-shot']['ctkfacts_0']
    }
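    # Note: the scalar metadata fields above come from whichever results file
    # was loaded last in the loop, and n-shot is read from a single reference
    # task ('CTKFacts NLI' / 'ctkfacts_0'); both assume these settings are
    # identical across all runs.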

    # make sure all tasks are present
    all_tasks = set(METADATA["tasks"].keys())
    all_expected_tasks = set(per_task_results.keys())
    all_missing_tasks = all_tasks - all_expected_tasks
    all_extra_tasks = all_expected_tasks - all_tasks
    if len(all_missing_tasks) > 0:
        EOLN = "\n"
        raise Exception(f"Missing tasks: {EOLN.join(all_missing_tasks)}")
    if len(all_extra_tasks) > 0:
        EOLN = "\n"
        raise Exception(f"Extra tasks: {EOLN.join(all_extra_tasks)}")
    with open(output_file, "w") as f:
        json.dump(aggregated_predictions, f)
    print("Success!")
    print("Output saved to", output_file)


def main():
    parser = argparse.ArgumentParser(
        description="Process outputs of lm harness into the minimum compatible format necessary for leaderboard submission.")
    parser.add_argument("-i", "-f", "--input_folder", "--folder",
                        help="Folder with unprocessed results from lm harness.", required=True)
    parser.add_argument("-o", "--output_file", help="File to save processed results.", required=True)
    args = parser.parse_args()

    process_harness_logs(args.input_folder, args.output_file)


if __name__ == "__main__":
    main()
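
# Example invocation (hypothetical paths):
#   python compile_log_files.py -i "logs/mymodel_*" -o compiled_results.json
# The output JSON bundles, per task, the best-prompt results, the pruned
# per-sample predictions, and the run metadata expected by the leaderboard.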