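"""Evaluation CLI: scores generated answers against the reference dataset and
aggregates the results into leaderboard metrics."""
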
import json
import pathlib
from copy import deepcopy

import click
import pandas as pd
import pandera.pandas as pa
from tqdm.auto import tqdm

from src.common.data import load_dataset
from src.common.schema import DatasetSchema, LeaderBoardSchema
from src.eval.matchers import build_check_function
from src.eval.metrics import grade_to_weight
from src.eval.schema import DatasetEvalSchema
from src.generate.generators import GenerationAnswer
from src.generate.schema import GeneratedDatasetSchema


def _evaluate_single_answer(
    row: dict,
) -> bool:
    """Score one joined row: compare the generated answer with the reference answer."""
    if pd.isna(row[GeneratedDatasetSchema.generated_answer]):
        return False
    if not isinstance(row[GeneratedDatasetSchema.generated_answer], GenerationAnswer):
        raise ValueError(
            f"Expected GenerationAnswer, got {type(row[GeneratedDatasetSchema.generated_answer])} for id {row[DatasetSchema.id_]}",
        )

    y_pred = row[GeneratedDatasetSchema.generated_answer].answer
    if not y_pred:
        return False
    y_true = row[DatasetSchema.correct_answer]

    # Build the task-specific matcher and run it on copies so it cannot mutate
    # the original answers.
    check_function = build_check_function(
        row[DatasetSchema.check_type],
        row[DatasetSchema.check_function],
    )
    try:
        result = check_function(
            y_true=deepcopy(y_true),
            y_pred=deepcopy(y_pred),
        )
    except Exception as e:
        print(e)
        print(
            f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}"
        )
        raise SystemExit(1) from e
    return result
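

# The check function returned by build_check_function is used above as a plain
# callable: check(y_true=..., y_pred=...) -> bool. A purely illustrative example
# (not an actual matcher from src.eval.matchers) could look like:
#
#     def exact_match(y_true, y_pred) -> bool:
#         return str(y_true).strip() == str(y_pred).strip()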


@pa.check_input(GeneratedDatasetSchema)
@pa.check_output(DatasetEvalSchema)
def _evaluate(
    generated_df: pd.DataFrame,
) -> pd.DataFrame:
    """Join generated answers onto the reference dataset and score every row."""
    tqdm.pandas()

    # Parse raw generated answers (dicts read from JSONL) into GenerationAnswer models.
    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: GenerationAnswer.model_validate(deepcopy(x)) if x else None,
    )

    dataset_df = load_dataset()
    predictions_df = dataset_df.join(
        generated_df.set_index(GeneratedDatasetSchema.id_),
        on=DatasetSchema.id_,
    )

    predictions_df[DatasetEvalSchema.is_correct] = predictions_df.progress_apply(
        _evaluate_single_answer,
        axis=1,
    )
    predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: x.answer if not pd.isna(x) else None,
    )
    predictions_df[DatasetEvalSchema.context] = predictions_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: x.context if not pd.isna(x) else None,
    )

    # Keep only the columns declared by the evaluation schema, in schema order.
    predictions_df = predictions_df[list(DatasetEvalSchema._collect_fields().keys())]
    return predictions_df
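

# Usage sketch of _evaluate (this mirrors the `evaluate` command below; the file
# name is only an example):
#
#     generated_df = pd.read_json("gemma3:4b.jsonl", lines=True)
#     evaluated_df = _evaluate(generated_df)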


@click.command()
@click.option(
    "--file",
    type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
    default=pathlib.Path("./gemma3:4b.jsonl"),
    help="JSONL file with generation results.",
)
def evaluate(
    file: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
):
    """Score a generation run and write the per-row results next to the input file."""
    file = pathlib.Path(file)
    df = pd.read_json(file, lines=True)
    evaluated_df = _evaluate(df)
    evaluated_df.to_json(
        file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False
    )
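

# Example run (invoke however this script is exposed; the file name is only the default):
#
#     python <this_script>.py --file ./gemma3:4b.jsonl
#
# The scored rows are written next to the input as ./gemma3:4b.eval.jsonl.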


@pa.check_input(DatasetEvalSchema)
@pa.check_output(LeaderBoardSchema)
def _metrics(
    df: pd.DataFrame,
    model_name: str,
    model_size: float,
    model_url: str,
    model_config: str,
) -> pd.DataFrame:
    """Aggregate per-row evaluation results into a single leaderboard row."""
    pass1 = df[DatasetEvalSchema.is_correct].mean()

    # Grade-weighted accuracy: each row contributes proportionally to its grade weight.
    w = df[DatasetEvalSchema.grade].apply(grade_to_weight)
    weighted_accuracy = (
        df[DatasetEvalSchema.is_correct].astype(int) * w
    ).sum() / w.sum()

    # Per-task-type accuracies.
    arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][
        DatasetEvalSchema.is_correct
    ].mean()
    geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][
        DatasetEvalSchema.is_correct
    ].mean()
    logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][
        DatasetEvalSchema.is_correct
    ].mean()

    result = {
        LeaderBoardSchema.model_name: model_name,
        LeaderBoardSchema.model_size: model_size,
        LeaderBoardSchema.model_url: model_url,
        LeaderBoardSchema.config: str(model_config),
        LeaderBoardSchema.pass1: pass1,
        LeaderBoardSchema.weighted_pass1: weighted_accuracy,
        LeaderBoardSchema.arith_pass1: arith_pass1,
        LeaderBoardSchema.geometry_pass1: geometry_pass1,
        LeaderBoardSchema.logic_pass1: logic_pass1,
    }
    result_df = pd.DataFrame([result])
    result_df = result_df[list(LeaderBoardSchema._collect_fields().keys())]
    return result_df
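

# weighted_pass1 above is a weight-normalised accuracy:
#
#     weighted_pass1 = sum(is_correct_i * w_i) / sum(w_i), with w_i = grade_to_weight(grade_i),
#
# so each row contributes in proportion to its grade weight.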


@click.command()
@click.option(
    "--model-name",
    type=str,
    required=True,
    help="Name of the model being evaluated.",
)
@click.option(
    "--file",
    type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
    default=pathlib.Path("./gemma3:4b.eval.jsonl"),
    help="JSONL file produced by the `evaluate` command.",
)
@click.option(
    "--model-size",
    type=float,
    default=None,
    help="Size of the model in billions of parameters.",
)
@click.option(
    "--model-url",
    type=str,
    default=None,
    help="URL where the model can be accessed.",
)
@click.option(
    "--model-config",
    type=str,
    default=None,
    help="Model configuration in dict format.",
)
def metrics(
    model_name: str,
    file: pathlib.Path = pathlib.Path("./gemma3:4b.eval.jsonl"),
    model_size: float = None,
    model_url: str = None,
    model_config: str = None,
):
    """Compute leaderboard metrics for an evaluated run and write them as JSON."""
    file = pathlib.Path(file)
    df = pd.read_json(file, lines=True)
    metrics_df = _metrics(
        df,
        model_name=model_name,
        model_size=model_size,
        model_url=model_url,
        model_config=model_config or "",
    )

    metrics_record = metrics_df.to_dict(orient="records")[0]
    print(f"Metrics for {model_name}:")
    for key, value in metrics_record.items():
        print(f"{key}: {value}")

    # Write the metrics next to the evaluation file, closing the handle properly.
    with open(file.with_suffix(".metrics.json"), "w", encoding="utf-8") as metrics_file:
        json.dump(
            metrics_df.to_dict(orient="records"),
            metrics_file,
            ensure_ascii=False,
        )
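

# Example invocation of the `metrics` command, assuming it is registered in the
# project's CLI (the __main__ guard below only runs `evaluate`); argument values
# are placeholders:
#
#     metrics --model-name gemma3:4b --file ./gemma3:4b.eval.jsonl --model-size 4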


if __name__ == "__main__":
    evaluate()