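"""Evaluate generated answers against the reference dataset and aggregate the
per-task results into leaderboard metrics (pass@1, grade-weighted pass@1, and
per-category pass@1 for arithmetic, geometry, and logic tasks)."""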
import json
import pathlib
from copy import deepcopy

import click
import pandas as pd
import pandera.pandas as pa
from tqdm.auto import tqdm

from src.common.data import load_dataset
from src.common.schema import DatasetSchema, LeaderBoardSchema
from src.eval.matchers import build_check_function
from src.eval.metrics import grade_to_weight
from src.eval.schema import DatasetEvalSchema
from src.generate.generators import GenerationAnswer
from src.generate.schema import GeneratedDatasetSchema


def _evaluate_single_answer(
    row: dict,
) -> bool:
    """Check a single prediction against the ground-truth answer of its task."""
    if pd.isna(row[GeneratedDatasetSchema.generated_answer]):
        return False
    if not isinstance(row[GeneratedDatasetSchema.generated_answer], GenerationAnswer):
        raise ValueError(
            f"Expected GenerationAnswer, got {type(row[GeneratedDatasetSchema.generated_answer])} for id {row[DatasetSchema.id_]}",
        )
    y_pred = row[GeneratedDatasetSchema.generated_answer].answer
    if not y_pred:
        return False
    y_true = row[DatasetSchema.correct_answer]
    # Each task declares how it should be graded; build the matching check function.
    check_function = build_check_function(
        row[DatasetSchema.check_type],
        row[DatasetSchema.check_function],
    )
    try:
        # Deep-copy both sides so a check function cannot mutate the dataframe contents.
        result = check_function(
            y_true=deepcopy(y_true),
            y_pred=deepcopy(y_pred),
        )
    except Exception as e:
        print(e)
        print(
            f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}"
        )
        raise SystemExit(1) from e
    return result
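# For reference, a matcher returned by build_check_function is called with keyword
# arguments y_true/y_pred and is expected to return a bool. An illustrative stand-in
# (not the real implementation in src.eval.matchers):
#   def exact_match(*, y_true, y_pred) -> bool:
#       return str(y_true).strip() == str(y_pred).strip()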


def _evaluate(
    generated_df: pd.DataFrame,
) -> pd.DataFrame:
    """Join generated answers onto the reference dataset and grade every row."""
    tqdm.pandas()
    # Raw JSONL rows come back as dicts; re-validate them into GenerationAnswer models.
    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: GenerationAnswer.model_validate(deepcopy(x)) if x else None,
    )
    dataset_df = load_dataset()
    predictions_df = dataset_df.join(
        generated_df.set_index(GeneratedDatasetSchema.id_),
        on=DatasetSchema.id_,
    )
    predictions_df[DatasetEvalSchema.is_correct] = predictions_df.progress_apply(
        _evaluate_single_answer,
        axis=1,
    )
    predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: x.answer if not pd.isna(x) else None,
    )
    predictions_df[DatasetEvalSchema.context] = predictions_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: x.context if not pd.isna(x) else None,
    )
    # Keep only the columns declared by the evaluation schema, in schema order.
    predictions_df = predictions_df[list(DatasetEvalSchema._collect_fields().keys())]
    return predictions_df


def evaluate(
    file: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
):
    """Grade a generations file and write the per-task results next to it."""
    file = pathlib.Path(file)
    df = pd.read_json(file, lines=True)
    evaluated_df = _evaluate(df)
    evaluated_df.to_json(
        file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False
    )
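# Illustrative shape of one line in the input JSONL consumed by evaluate(); the real
# field names come from GeneratedDatasetSchema, which is not shown here, so treat the
# keys below as assumptions:
#   {"id": "task-001", "generated_answer": {"answer": "42", "context": "..."}}
# Only the `answer` and `context` attributes of the validated GenerationAnswer are used.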


def _metrics(
    df: pd.DataFrame,
    model_name: str,
    model_size: float,
    model_url: str,
    model_config: str,
) -> pd.DataFrame:
    """Aggregate per-task evaluation results into a single leaderboard row."""
    pass1 = df[DatasetEvalSchema.is_correct].mean()
    # Grade-weighted pass@1: each task counts with grade_to_weight(grade),
    # i.e. sum(is_correct * weight) / sum(weight).
    w = df[DatasetEvalSchema.grade].apply(grade_to_weight)
    weighted_accuracy = (
        df[DatasetEvalSchema.is_correct].astype(int) * w
    ).sum() / w.sum()
    # Per-category pass@1 scores.
    arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][
        DatasetEvalSchema.is_correct
    ].mean()
    geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][
        DatasetEvalSchema.is_correct
    ].mean()
    logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][
        DatasetEvalSchema.is_correct
    ].mean()
    result = {
        LeaderBoardSchema.model_name: model_name,
        LeaderBoardSchema.model_size: model_size,
        LeaderBoardSchema.model_url: model_url,
        LeaderBoardSchema.config: str(model_config),
        LeaderBoardSchema.pass1: pass1,
        LeaderBoardSchema.weighted_pass1: weighted_accuracy,
        LeaderBoardSchema.arith_pass1: arith_pass1,
        LeaderBoardSchema.geometry_pass1: geometry_pass1,
        LeaderBoardSchema.logic_pass1: logic_pass1,
    }
    result_df = pd.DataFrame([result])
    # Order columns as declared in the leaderboard schema.
    result_df = result_df[list(LeaderBoardSchema._collect_fields().keys())]
    return result_df


def metrics(
    model_name: str,
    file: pathlib.Path = pathlib.Path("./gemma3:4b.eval.jsonl"),
    model_size: float | None = None,
    model_url: str | None = None,
    model_config: str | None = None,
):
    """Compute leaderboard metrics for an evaluated file and dump them as JSON."""
    file = pathlib.Path(file)
    df = pd.read_json(file, lines=True)
    metrics_df = _metrics(
        df,
        model_name=model_name,
        model_size=model_size,
        model_url=model_url,
        model_config=model_config or "",
    )
    metrics_record = metrics_df.to_dict(orient="records")[0]
    print(f"Metrics for {model_name}:")
    for key, value in metrics_record.items():
        print(f"{key}: {value}")
    with open(file.with_suffix(".metrics.json"), "w", encoding="utf-8") as f:
        json.dump(
            metrics_df.to_dict(orient="records"),
            f,
            ensure_ascii=False,
        )


if __name__ == "__main__":
    evaluate()
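# Sketch of the intended flow, based on the defaults above (click is imported, so these
# functions are presumably also wired up as CLI commands elsewhere; treat the exact
# invocation as an assumption):
#   evaluate(pathlib.Path("./gemma3:4b.jsonl"))                        # -> ./gemma3:4b.eval.jsonl
#   metrics("gemma3:4b", file=pathlib.Path("./gemma3:4b.eval.jsonl"))  # -> ./gemma3:4b.eval.metrics.json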