|  | import pandas as pd | 
					
						
						|  | import numpy as np | 
					
						
						|  | import plotly.express as px | 
					
						
						|  | from plotly.graph_objs import Figure | 
					
						
						|  |  | 
					
						
						|  | from src.leaderboard.filter_models import FLAGGED_MODELS | 
					
						
						|  | from src.display.utils import human_baseline_row as HUMAN_BASELINE, AutoEvalColumn, Tasks, Task, BENCHMARK_COLS | 
					
						
						|  | from src.leaderboard.read_evals import EvalResult | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def create_scores_df(raw_data: list[EvalResult]) -> pd.DataFrame: | 
					
						
						|  | """ | 
					
						
						|  | Generates a DataFrame containing the maximum scores until each date. | 
					
						
						|  |  | 
					
						
						|  | :param results_df: A DataFrame containing result information including metric scores and dates. | 
					
						
						|  | :return: A new DataFrame containing the maximum scores until each date for every metric. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | results_df = pd.DataFrame(raw_data) | 
					
						
						|  |  | 
					
						
						|  | results_df.sort_values(by="date", inplace=True) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | scores = {k: [] for k in BENCHMARK_COLS + [AutoEvalColumn.average.name]} | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | for task in [t.value for t in Tasks] + [Task("Average", "avg", AutoEvalColumn.average.name)]: | 
					
						
						|  | current_max = 0 | 
					
						
						|  | last_date = "" | 
					
						
						|  | column = task.col_name | 
					
						
						|  | for _, row in results_df.iterrows(): | 
					
						
						|  | current_model = row["full_model"] | 
					
						
						|  | if current_model in FLAGGED_MODELS: | 
					
						
						|  | continue | 
					
						
						|  |  | 
					
						
						|  | current_date = row["date"] | 
					
						
						|  | if task.benchmark == "Average": | 
					
						
						|  | current_score = np.mean(list(row["results"].values())) | 
					
						
						|  | else: | 
					
						
						|  | current_score = row["results"][task.benchmark] | 
					
						
						|  |  | 
					
						
						|  | if current_score > current_max: | 
					
						
						|  | if current_date == last_date and len(scores[column]) > 0: | 
					
						
						|  | scores[column][-1] = {"model": current_model, "date": current_date, "score": current_score} | 
					
						
						|  | else: | 
					
						
						|  | scores[column].append({"model": current_model, "date": current_date, "score": current_score}) | 
					
						
						|  | current_max = current_score | 
					
						
						|  | last_date = current_date | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | return {k: pd.DataFrame(v) for k, v in scores.items()} | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def create_plot_df(scores_df: dict[str: pd.DataFrame]) -> pd.DataFrame: | 
					
						
						|  | """ | 
					
						
						|  | Transforms the scores DataFrame into a new format suitable for plotting. | 
					
						
						|  |  | 
					
						
						|  | :param scores_df: A DataFrame containing metric scores and dates. | 
					
						
						|  | :return: A new DataFrame reshaped for plotting purposes. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  | dfs = [] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | for col in BENCHMARK_COLS + [AutoEvalColumn.average.name]: | 
					
						
						|  | d = scores_df[col].reset_index(drop=True) | 
					
						
						|  | d["task"] = col | 
					
						
						|  | dfs.append(d) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | concat_df = pd.concat(dfs, ignore_index=True) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | concat_df.sort_values(by="date", inplace=True) | 
					
						
						|  | concat_df.reset_index(drop=True, inplace=True) | 
					
						
						|  | return concat_df | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | def create_metric_plot_obj( | 
					
						
						|  | df: pd.DataFrame, metrics: list[str], title: str | 
					
						
						|  | ) -> Figure: | 
					
						
						|  | """ | 
					
						
						|  | Create a Plotly figure object with lines representing different metrics | 
					
						
						|  | and horizontal dotted lines representing human baselines. | 
					
						
						|  |  | 
					
						
						|  | :param df: The DataFrame containing the metric values, names, and dates. | 
					
						
						|  | :param metrics: A list of strings representing the names of the metrics | 
					
						
						|  | to be included in the plot. | 
					
						
						|  | :param title: A string representing the title of the plot. | 
					
						
						|  | :return: A Plotly figure object with lines representing metrics and | 
					
						
						|  | horizontal dotted lines representing human baselines. | 
					
						
						|  | """ | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | df = df[df["task"].isin(metrics)] | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics} | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | fig = px.line( | 
					
						
						|  | df, | 
					
						
						|  | x="date", | 
					
						
						|  | y="score", | 
					
						
						|  | color="task", | 
					
						
						|  | markers=True, | 
					
						
						|  | custom_data=["task", "score", "model"], | 
					
						
						|  | title=title, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | fig.update_traces( | 
					
						
						|  | hovertemplate="<br>".join( | 
					
						
						|  | [ | 
					
						
						|  | "Model Name: %{customdata[2]}", | 
					
						
						|  | "Metric Name: %{customdata[0]}", | 
					
						
						|  | "Date: %{x}", | 
					
						
						|  | "Metric Value: %{y}", | 
					
						
						|  | ] | 
					
						
						|  | ) | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | fig.update_layout(yaxis_range=[0, 100]) | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | metric_color_mapping = {} | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | for trace in fig.data: | 
					
						
						|  | metric_color_mapping[trace.name] = trace.line.color | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  | for metric, value in filtered_human_baselines.items(): | 
					
						
						|  | color = metric_color_mapping.get(metric, "blue") | 
					
						
						|  | location = "top left" if metric == "HellaSwag" else "bottom left" | 
					
						
						|  |  | 
					
						
						|  | fig.add_hline( | 
					
						
						|  | y=value, | 
					
						
						|  | line_dash="dot", | 
					
						
						|  | annotation_text=f"{metric} human baseline", | 
					
						
						|  | annotation_position=location, | 
					
						
						|  | annotation_font_size=10, | 
					
						
						|  | annotation_font_color=color, | 
					
						
						|  | line_color=color, | 
					
						
						|  | ) | 
					
						
						|  |  | 
					
						
						|  | return fig | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  | 
					
						
						|  |  |