"""Evaluate generated answers against the reference dataset and compute leaderboard metrics."""

import json
import pathlib
from copy import deepcopy
from typing import Optional

import click
import pandas as pd
import pandera.pandas as pa
from tqdm.auto import tqdm

from src.common.data import load_dataset
from src.common.schema import DatasetSchema, LeaderBoardSchema
from src.eval.matchers import build_check_function
from src.eval.metrics import grade_to_weight
from src.eval.schema import DatasetEvalSchema
from src.generate.generators import GenerationAnswer
from src.generate.schema import GeneratedDatasetSchema


def _evaluate_single_answer(row: dict) -> bool:
    """Check a single prediction against its reference answer using the row's matcher."""
    # Rows without a generation count as incorrect.
    if pd.isna(row[GeneratedDatasetSchema.generated_answer]):
        return False
    if not isinstance(row[GeneratedDatasetSchema.generated_answer], GenerationAnswer):
        raise ValueError(
            f"Expected GenerationAnswer, got {type(row[GeneratedDatasetSchema.generated_answer])} "
            f"for id {row[DatasetSchema.id_]}",
        )
    y_pred = row[GeneratedDatasetSchema.generated_answer].answer
    if not y_pred:
        return False
    y_true = row[DatasetSchema.correct_answer]
    check_function = build_check_function(
        row[DatasetSchema.check_type],
        row[DatasetSchema.check_function],
    )
    try:
        # Deep-copy both sides so a matcher cannot mutate the dataframe contents.
        result = check_function(
            y_true=deepcopy(y_true),
            y_pred=deepcopy(y_pred),
        )
    except Exception as e:
        print(e)
        print(
            f"Error evaluating row with {row[DatasetSchema.check_type]} "
            f"{row[DatasetSchema.id_]}: {y_true} vs {y_pred}"
        )
        raise SystemExit(1) from e
    return result


@pa.check_input(GeneratedDatasetSchema)
@pa.check_output(DatasetEvalSchema)
def _evaluate(generated_df: pd.DataFrame) -> pd.DataFrame:
    """Join generations onto the reference dataset and score every row."""
    tqdm.pandas()
    # Parse the raw JSON payloads into GenerationAnswer objects; empty entries become None.
    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: GenerationAnswer.model_validate(deepcopy(x)) if x else None,
    )
    dataset_df = load_dataset()
    predictions_df = dataset_df.join(
        generated_df.set_index(GeneratedDatasetSchema.id_),
        on=DatasetSchema.id_,
    )
    predictions_df[DatasetEvalSchema.is_correct] = predictions_df.progress_apply(
        _evaluate_single_answer,
        axis=1,
    )
    predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: x.answer if not pd.isna(x) else None,
    )
    predictions_df[DatasetEvalSchema.context] = predictions_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: x.context if not pd.isna(x) else None,
    )
    # Keep only the columns declared by the evaluation schema, in schema order.
    predictions_df = predictions_df[list(DatasetEvalSchema._collect_fields().keys())]
    return predictions_df


@click.command()
@click.option(
    "--file",
    type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
    default=pathlib.Path("./gemma3:4b.jsonl"),
    help="JSONL file with generated answers.",
)
def evaluate(
    file: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
):
    """Score a generations file and write per-row results next to it."""
    file = pathlib.Path(file)
    df = pd.read_json(file, lines=True)
    evaluated_df = _evaluate(df)
    evaluated_df.to_json(
        file.with_suffix(".eval.jsonl"),
        orient="records",
        lines=True,
        force_ascii=False,
    )


@pa.check_input(DatasetEvalSchema)
@pa.check_output(LeaderBoardSchema)
def _metrics(
    df: pd.DataFrame,
    model_name: str,
    model_size: Optional[float],
    model_url: Optional[str],
    model_config: str,
) -> pd.DataFrame:
    """Aggregate per-row evaluation results into a single leaderboard record."""
    pass1 = df[DatasetEvalSchema.is_correct].mean()
    # Weighted accuracy: each task contributes proportionally to its grade weight.
    w = df[DatasetEvalSchema.grade].apply(grade_to_weight)
    weighted_accuracy = (
        df[DatasetEvalSchema.is_correct].astype(int) * w
    ).sum() / w.sum()
    arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][
        DatasetEvalSchema.is_correct
    ].mean()
    geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][
        DatasetEvalSchema.is_correct
    ].mean()
    logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][
        DatasetEvalSchema.is_correct
    ].mean()
    result = {
        LeaderBoardSchema.model_name: model_name,
        LeaderBoardSchema.model_size: model_size,
        LeaderBoardSchema.model_url: model_url,
        LeaderBoardSchema.config: str(model_config),
        LeaderBoardSchema.pass1: pass1,
        LeaderBoardSchema.weighted_pass1: weighted_accuracy,
        LeaderBoardSchema.arith_pass1: arith_pass1,
        LeaderBoardSchema.geometry_pass1: geometry_pass1,
        LeaderBoardSchema.logic_pass1: logic_pass1,
    }
    result_df = pd.DataFrame([result])
    # Keep only the columns declared by the leaderboard schema, in schema order.
    result_df = result_df[list(LeaderBoardSchema._collect_fields().keys())]
    return result_df


@click.command()
@click.option(
    "--model-name",
    type=str,
    required=True,
    help="Name of the model being evaluated.",
)
@click.option(
    "--file",
    type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
    default=pathlib.Path("./gemma3:4b.eval.jsonl"),
    help="JSONL file produced by the evaluate command.",
)
@click.option(
    "--model-size",
    type=float,
    default=None,
    help="Size of the model in billions of parameters.",
)
@click.option(
    "--model-url",
    type=str,
    default=None,
    help="URL where the model can be accessed.",
)
@click.option(
    "--model-config",
    type=str,
    default=None,
    help="Model configuration in dict format.",
)
def metrics(
    model_name: str,
    file: pathlib.Path = pathlib.Path("./gemma3:4b.eval.jsonl"),
    model_size: Optional[float] = None,
    model_url: Optional[str] = None,
    model_config: Optional[str] = None,
):
    """Compute leaderboard metrics from an evaluated file and write them as JSON."""
    file = pathlib.Path(file)
    df = pd.read_json(file, lines=True)
    metrics_df = _metrics(
        df,
        model_name=model_name,
        model_size=model_size,
        model_url=model_url,
        model_config=model_config or "",
    )
    metrics_dict = metrics_df.to_dict(orient="records")[0]
    print(f"Metrics for {model_name}:")
    for key, value in metrics_dict.items():
        print(f"{key}: {value}")
    with open(file.with_suffix(".metrics.json"), "w", encoding="utf-8") as f:
        json.dump(
            metrics_df.to_dict(orient="records"),
            f,
            ensure_ascii=False,
        )


if __name__ == "__main__":
    evaluate()
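

# Minimal usage sketch. The file names are just the defaults declared above; the
# metrics command is not wired to __main__ here, so it is shown driven programmatically,
# and the model metadata values are placeholders, not taken from this file:
#
#   python evaluate.py --file ./gemma3:4b.jsonl
#
#   evaluated = pd.read_json("./gemma3:4b.eval.jsonl", lines=True)
#   board_row = _metrics(
#       evaluated,
#       model_name="gemma3:4b",   # placeholder metadata
#       model_size=4.0,
#       model_url=None,
#       model_config="",
#   )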