d0rj's picture
feat: Initial commit
1719436
raw
history blame
5.93 kB
import json
import pathlib
from copy import deepcopy
import click
import pandas as pd
import pandera.pandas as pa
from tqdm.auto import tqdm
from src.common.data import load_dataset
from src.eval.metrics import grade_to_weight
from src.eval.schema import DatasetEvalSchema
from src.eval.matchers import build_check_function
from src.generate.generators import GenerationAnswer
from src.generate.schema import GeneratedDatasetSchema
from src.common.schema import DatasetSchema, LeaderBoardSchema
def _evaluate_single_answer(
    row: dict,
) -> bool:
    """Grade one joined dataset/prediction row against its ground truth.

    Args:
        row: A row containing the generated answer plus the dataset's
            correct answer and check-function metadata.

    Returns:
        True when the predicted answer passes the row's check function;
        False when the prediction is missing or empty.

    Raises:
        ValueError: If a generated answer is present but is not a
            ``GenerationAnswer`` instance.
        SystemExit: If the check function itself raises (details are
            printed first so the offending row can be inspected).
    """
    generated = row[GeneratedDatasetSchema.generated_answer]
    # A missing prediction (NaN/None after the join) counts as incorrect.
    if pd.isna(generated):
        return False
    # isinstance is the idiomatic type check (and accepts subclasses,
    # unlike the exact `type(...) is ...` comparison).
    if not isinstance(generated, GenerationAnswer):
        raise ValueError(
            f"Expected GenerationAnswer, got {type(generated)} for id {row[DatasetSchema.id_]}",
        )
    y_pred = generated.answer
    # An empty/blank answer is treated as incorrect, not as an error.
    if not y_pred:
        return False
    y_true = row[DatasetSchema.correct_answer]
    check_function = build_check_function(
        row[DatasetSchema.check_type],
        row[DatasetSchema.check_function],
    )
    try:
        # deepcopy guards against check functions mutating their inputs.
        result = check_function(
            y_true=deepcopy(y_true),
            y_pred=deepcopy(y_pred),
        )
    except Exception as e:
        print(e)
        print(f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}")
        # Abort with status 1 as before, but chain the original exception so
        # the traceback is preserved (bare exit(1) depends on site.py and
        # discards the cause).
        raise SystemExit(1) from e
    return result
@pa.check_input(GeneratedDatasetSchema)
@pa.check_output(DatasetEvalSchema)
def _evaluate(
    generated_df: pd.DataFrame,
) -> pd.DataFrame:
    """Join generated answers onto the dataset and grade every row.

    Returns a DataFrame restricted to the ``DatasetEvalSchema`` columns.
    Note: the input frame's generated-answer column is re-parsed in place.
    """
    tqdm.pandas()

    answer_col = GeneratedDatasetSchema.generated_answer

    def _parse(raw):
        # Re-validate the raw JSONL payload into a GenerationAnswer model;
        # falsy payloads (missing answers) stay as None.
        return GenerationAnswer.model_validate(deepcopy(raw)) if raw else None

    generated_df[answer_col] = generated_df[answer_col].apply(_parse)

    predictions_df = load_dataset().join(
        generated_df.set_index(GeneratedDatasetSchema.id_),
        on=DatasetSchema.id_,
    )

    predictions_df[DatasetEvalSchema.is_correct] = predictions_df.progress_apply(
        _evaluate_single_answer,
        axis=1,
    )

    def _field(attr):
        # Extract one attribute from a GenerationAnswer, or None when the
        # join produced a missing value for this row.
        return lambda ans: getattr(ans, attr) if not pd.isna(ans) else None

    predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[answer_col].apply(_field("answer"))
    predictions_df[DatasetEvalSchema.context] = predictions_df[answer_col].apply(_field("context"))

    return predictions_df[list(DatasetEvalSchema._collect_fields().keys())]
@click.command()
@click.option(
    "--file",
    type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
    default=pathlib.Path("./gemma3:4b.jsonl"),
)
def evaluate(
    file: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
):
    """Grade a JSONL file of generated answers.

    Writes the graded rows next to the input as ``<name>.eval.jsonl``.
    """
    input_path = pathlib.Path(file)
    evaluated_df = _evaluate(pd.read_json(input_path, lines=True))
    output_path = input_path.with_suffix(".eval.jsonl")
    evaluated_df.to_json(output_path, orient="records", lines=True, force_ascii=False)
@pa.check_input(DatasetEvalSchema)
@pa.check_output(LeaderBoardSchema)
def _metrics(
    df: pd.DataFrame,
    model_name: str,
    model_size: float,
    model_url: str,
    model_config: str
) -> pd.DataFrame:
    """Aggregate per-row correctness into a one-row leaderboard DataFrame."""
    correct = df[DatasetEvalSchema.is_correct]

    def _task_pass1(task: str) -> float:
        # Mean accuracy restricted to rows of a single task type.
        mask = df[DatasetEvalSchema.task_type] == task
        return df.loc[mask, DatasetEvalSchema.is_correct].mean()

    # Grade-weighted accuracy: each row contributes its grade's weight.
    weights = df[DatasetEvalSchema.grade].apply(grade_to_weight)
    weighted_pass1 = (correct.astype(int) * weights).sum() / weights.sum()

    row = {
        LeaderBoardSchema.model_name: model_name,
        LeaderBoardSchema.model_size: model_size,
        LeaderBoardSchema.model_url: model_url,
        LeaderBoardSchema.config: str(model_config),
        LeaderBoardSchema.pass1: correct.mean(),
        LeaderBoardSchema.weighted_pass1: weighted_pass1,
        LeaderBoardSchema.arith_pass1: _task_pass1("arith"),
        LeaderBoardSchema.geometry_pass1: _task_pass1("geometry"),
        LeaderBoardSchema.logic_pass1: _task_pass1("logic"),
    }
    return pd.DataFrame([row])[list(LeaderBoardSchema._collect_fields().keys())]
@click.command()
@click.option(
    "--model-name",
    type=str,
    required=True,
    help="Name of the model being evaluated.",
)
@click.option(
    "--file",
    type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
    default=pathlib.Path("./gemma3:4b_eval.jsonl"),
)
@click.option(
    "--model-size",
    type=float,
    default=None,
    help="Size of the model in billions of parameters.",
)
@click.option(
    "--model-url",
    type=str,
    default=None,
    help="URL where the model can be accessed.",
)
@click.option(
    "--model-config",
    type=str,
    default=None,
    help="Model configuration in dict format.",
)
def metrics(
    model_name: str,
    file: pathlib.Path = pathlib.Path("./gemma3:4b_eval.jsonl"),
    model_size: float = None,
    model_url: str = None,
    model_config: str = None,
):
    """Compute leaderboard metrics from an evaluated JSONL file.

    Prints the metrics and writes them next to the input as
    ``<name>.metrics.json``.

    NOTE(review): the default here expects ``gemma3:4b_eval.jsonl`` while
    the ``evaluate`` command writes ``gemma3:4b.eval.jsonl`` — confirm
    which naming convention is intended.
    """
    file = pathlib.Path(file)
    df = pd.read_json(file, lines=True)
    metrics_df = _metrics(
        df,
        model_name=model_name,
        model_size=model_size,
        model_url=model_url,
        model_config=model_config or '',
    )
    # Renamed from `metrics` so the local dict does not shadow this command.
    metrics_row = metrics_df.to_dict(orient="records")[0]
    print(f"Metrics for {model_name}:")
    for key, value in metrics_row.items():
        print(f"{key}: {value}")
    # Use a context manager so the output file is always closed/flushed
    # (the previous `open(...)` inside json.dump leaked the handle).
    with open(file.with_suffix(".metrics.json"), "w") as fp:
        json.dump(
            metrics_df.to_dict(orient="records"),
            fp,
            ensure_ascii=False,
        )
if __name__ == "__main__":
    # NOTE(review): only the `evaluate` command is wired up here; the
    # `metrics` command above is defined but never registered — presumably
    # it is exposed via a separate entry point or a click group elsewhere.
    evaluate()