Spaces:
Sleeping
Sleeping
File size: 5,927 Bytes
1719436 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
import json
import pathlib
from copy import deepcopy
import click
import pandas as pd
import pandera.pandas as pa
from tqdm.auto import tqdm
from src.common.data import load_dataset
from src.eval.metrics import grade_to_weight
from src.eval.schema import DatasetEvalSchema
from src.eval.matchers import build_check_function
from src.generate.generators import GenerationAnswer
from src.generate.schema import GeneratedDatasetSchema
from src.common.schema import DatasetSchema, LeaderBoardSchema
def _evaluate_single_answer(
    row: dict,
) -> bool:
    """Grade a single joined dataset/prediction row.

    Args:
        row: Mapping-like row (a pandas row when called through ``apply``
            with ``axis=1``) holding the dataset columns (id, correct
            answer, check type, check function) and the generated answer.

    Returns:
        ``False`` when the generated answer is missing or its ``answer``
        attribute is falsy; otherwise the value returned by the check
        function built for this row.

    Raises:
        ValueError: If a generated answer is present but is not a
            ``GenerationAnswer`` instance.
        SystemExit: With exit code 1 when the check function raises. The
            original exception is attached as ``__cause__``.
    """
    import sys  # local import: only used on the failure path below

    generated = row[GeneratedDatasetSchema.generated_answer]
    if pd.isna(generated):
        return False
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of GenerationAnswer.
    if not isinstance(generated, GenerationAnswer):
        raise ValueError(
            f"Expected GenerationAnswer, got {type(generated)} for id {row[DatasetSchema.id_]}",
        )

    y_pred = generated.answer
    # NOTE(review): any falsy answer counts as "no answer"; if ``answer`` can
    # legitimately be 0 or False this would misgrade it - confirm the type.
    if not y_pred:
        return False

    y_true = row[DatasetSchema.correct_answer]
    check_function = build_check_function(
        row[DatasetSchema.check_type],
        row[DatasetSchema.check_function],
    )
    try:
        # Copies keep the check function from mutating the row's values.
        return check_function(
            y_true=deepcopy(y_true),
            y_pred=deepcopy(y_pred),
        )
    except Exception as exc:
        # Diagnostics go to stderr so they do not mix with regular output.
        print(exc, file=sys.stderr)
        print(
            f"Error evaluating row with {row[DatasetSchema.check_type]} {row[DatasetSchema.id_]}: {y_true} vs {y_pred}",
            file=sys.stderr,
        )
        # The bare ``exit`` helper is injected by ``site`` and is not always
        # available; raising SystemExit keeps the same exit code.
        raise SystemExit(1) from exc
@pa.check_input(GeneratedDatasetSchema)
@pa.check_output(DatasetEvalSchema)
def _evaluate(
    generated_df: pd.DataFrame,
) -> pd.DataFrame:
    """Join generated answers onto the dataset and grade every row.

    Args:
        generated_df: Frame of generated answers; the decorator validates
            it against ``GeneratedDatasetSchema``. Its generated-answer
            column is overwritten with parsed ``GenerationAnswer`` objects
            (or ``None`` for falsy payloads).

    Returns:
        The dataset returned by ``load_dataset()`` joined with the
        predictions on the id column, extended with the correctness flag,
        predicted answer and context, and restricted to the fields of
        ``DatasetEvalSchema``.
    """
    answer_col = GeneratedDatasetSchema.generated_answer

    def _parse(raw):
        # Falsy payloads are treated as "no answer produced".
        if not raw:
            return None
        return GenerationAnswer.model_validate(deepcopy(raw))

    def _attr_or_none(attr_name):
        # Build an accessor that maps missing answers to None.
        def _accessor(parsed):
            if pd.isna(parsed):
                return None
            return getattr(parsed, attr_name)

        return _accessor

    tqdm.pandas()  # makes ``progress_apply`` available on DataFrames
    generated_df[answer_col] = generated_df[answer_col].apply(_parse)

    reference_df = load_dataset()
    indexed_predictions = generated_df.set_index(GeneratedDatasetSchema.id_)
    merged_df = reference_df.join(indexed_predictions, on=DatasetSchema.id_)

    merged_df[DatasetEvalSchema.is_correct] = merged_df.progress_apply(
        _evaluate_single_answer,
        axis=1,
    )
    merged_df[DatasetEvalSchema.predicted_answer] = merged_df[answer_col].apply(
        _attr_or_none("answer"),
    )
    merged_df[DatasetEvalSchema.context] = merged_df[answer_col].apply(
        _attr_or_none("context"),
    )

    # Keep only the schema's columns, in the schema's order.
    output_columns = list(DatasetEvalSchema._collect_fields())
    return merged_df[output_columns]
@click.command()
@click.option(
    "--file",
    type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
    default=pathlib.Path("./gemma3:4b.jsonl"),
)
def evaluate(
    file: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
):
    """Grade a JSON Lines file of generated answers and save the result.

    The graded rows are written next to the input file, with the input's
    final suffix replaced by ``.eval.jsonl``.
    """
    # click may deliver the option as a plain string; normalise to a Path.
    source_path = pathlib.Path(file)
    target_path = source_path.with_suffix(".eval.jsonl")

    graded_df = _evaluate(pd.read_json(source_path, lines=True))
    graded_df.to_json(target_path, orient="records", lines=True, force_ascii=False)
@pa.check_input(DatasetEvalSchema)
@pa.check_output(LeaderBoardSchema)
def _metrics(
    df: pd.DataFrame,
    model_name: str,
    model_size: float,
    model_url: str,
    model_config: str
) -> pd.DataFrame:
    """Aggregate graded rows into a single leaderboard record.

    Args:
        df: Graded rows; the decorator validates them against
            ``DatasetEvalSchema``.
        model_name: Stored in the record as-is.
        model_size: Stored in the record as-is.
        model_url: Stored in the record as-is.
        model_config: Stored in the record after ``str()`` conversion.

    Returns:
        A one-row frame with the model metadata, the overall mean of the
        correctness column, the grade-weighted mean, and the mean for each
        of the ``arith``, ``geometry`` and ``logic`` task types, with
        columns ordered as the fields of ``LeaderBoardSchema``.
    """
    correctness = df[DatasetEvalSchema.is_correct]

    def _task_pass1(task_name):
        # Mean correctness over the rows of one task type.
        task_mask = df[DatasetEvalSchema.task_type] == task_name
        return df.loc[task_mask, DatasetEvalSchema.is_correct].mean()

    overall = correctness.mean()

    # Each row contributes its grade-derived weight when answered correctly.
    weights = df[DatasetEvalSchema.grade].apply(grade_to_weight)
    weighted = (correctness.astype(int) * weights).sum() / weights.sum()

    arith = _task_pass1("arith")
    geometry = _task_pass1("geometry")
    logic = _task_pass1("logic")

    record = {
        LeaderBoardSchema.model_name: model_name,
        LeaderBoardSchema.model_size: model_size,
        LeaderBoardSchema.model_url: model_url,
        LeaderBoardSchema.config: str(model_config),
        LeaderBoardSchema.pass1: overall,
        LeaderBoardSchema.weighted_pass1: weighted,
        LeaderBoardSchema.arith_pass1: arith,
        LeaderBoardSchema.geometry_pass1: geometry,
        LeaderBoardSchema.logic_pass1: logic,
    }

    column_order = list(LeaderBoardSchema._collect_fields())
    return pd.DataFrame([record])[column_order]
@click.command()
@click.option(
    "--model-name",
    type=str,
    required=True,
    help="Name of the model being evaluated.",
)
@click.option(
    "--file",
    type=click.Path(exists=True, dir_okay=False, readable=True, resolve_path=True),
    default=pathlib.Path("./gemma3:4b_eval.jsonl"),
)
@click.option(
    "--model-size",
    type=float,
    default=None,
    help="Size of the model in billions of parameters.",
)
@click.option(
    "--model-url",
    type=str,
    default=None,
    help="URL where the model can be accessed.",
)
@click.option(
    "--model-config",
    type=str,
    default=None,
    help="Model configuration in dict format.",
)
def metrics(
    model_name: str,
    file: pathlib.Path = pathlib.Path("./gemma3:4b_eval.jsonl"),
    model_size: float = None,
    model_url: str = None,
    model_config: str = None,
):
    """Compute leaderboard metrics from an evaluated JSON Lines file.

    Prints every metric and writes the leaderboard record (as a JSON list
    with one object) next to the input file, with the input's final suffix
    replaced by ``.metrics.json``.
    """
    # NOTE(review): the `evaluate` command writes "<name>.eval.jsonl", while
    # the default here is "..._eval.jsonl" - confirm the default is intended.
    file = pathlib.Path(file)
    eval_df = pd.read_json(file, lines=True)
    metrics_df = _metrics(
        eval_df,
        model_name=model_name,
        model_size=model_size,
        model_url=model_url,
        # A missing config is passed on as an empty string.
        model_config=model_config or '',
    )

    # Convert once and reuse for both the console report and the JSON file.
    records = metrics_df.to_dict(orient="records")

    print(f"Metrics for {model_name}:")
    for key, value in records[0].items():
        print(f"{key}: {value}")

    # The context manager closes the file; UTF-8 is explicit because
    # ensure_ascii=False may emit non-ASCII characters that the platform's
    # default encoding cannot represent.
    with open(file.with_suffix(".metrics.json"), "w", encoding="utf-8") as out_file:
        json.dump(records, out_file, ensure_ascii=False)
# Script entry point: running this file directly invokes the `evaluate`
# command only; the `metrics` command is not dispatched from here.
if __name__ == "__main__":
    evaluate()
|