Spaces:
Running
Running
update metric area filter
Browse files
- app.py +23 -0
- src/populate.py +0 -4
app.py
CHANGED
|
@@ -24,6 +24,7 @@ def update_table(
|
|
| 24 |
use_case_area_query: list,
|
| 25 |
use_case_query: list,
|
| 26 |
use_case_type_query: list,
|
|
|
|
| 27 |
):
|
| 28 |
filtered_df = filter_llm_func(hidden_df, llm_query)
|
| 29 |
filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
|
|
@@ -34,7 +35,24 @@ def update_table(
|
|
| 34 |
filtered_df = filter_use_case_area_func(filtered_df, use_case_area_query)
|
| 35 |
filtered_df = filter_use_case_func(filtered_df, use_case_query)
|
| 36 |
filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
df = select_columns(filtered_df, columns)
|
|
|
|
| 38 |
return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
|
| 39 |
|
| 40 |
|
|
@@ -60,6 +78,7 @@ def init_leaderboard_df(
|
|
| 60 |
use_case_area_query: list,
|
| 61 |
use_case_query: list,
|
| 62 |
use_case_type_query: list,
|
|
|
|
| 63 |
):
|
| 64 |
|
| 65 |
# Applying the style function
|
|
@@ -74,6 +93,7 @@ def init_leaderboard_df(
|
|
| 74 |
use_case_area_query,
|
| 75 |
use_case_query,
|
| 76 |
use_case_type_query,
|
|
|
|
| 77 |
)
|
| 78 |
|
| 79 |
|
|
@@ -232,6 +252,7 @@ with demo:
|
|
| 232 |
filter_use_case_area.value,
|
| 233 |
filter_use_case.value,
|
| 234 |
filter_use_case_type.value,
|
|
|
|
| 235 |
),
|
| 236 |
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
| 237 |
datatype=TYPES,
|
|
@@ -256,6 +277,7 @@ with demo:
|
|
| 256 |
filter_use_case_area,
|
| 257 |
filter_use_case,
|
| 258 |
filter_use_case_type,
|
|
|
|
| 259 |
]:
|
| 260 |
selector.change(
|
| 261 |
update_table,
|
|
@@ -269,6 +291,7 @@ with demo:
|
|
| 269 |
filter_use_case_area,
|
| 270 |
filter_use_case,
|
| 271 |
filter_use_case_type,
|
|
|
|
| 272 |
],
|
| 273 |
leaderboard_table,
|
| 274 |
queue=True,
|
|
|
|
| 24 |
use_case_area_query: list,
|
| 25 |
use_case_query: list,
|
| 26 |
use_case_type_query: list,
|
| 27 |
+
metric_area_query: list,
|
| 28 |
):
|
| 29 |
filtered_df = filter_llm_func(hidden_df, llm_query)
|
| 30 |
filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
|
|
|
|
| 35 |
filtered_df = filter_use_case_area_func(filtered_df, use_case_area_query)
|
| 36 |
filtered_df = filter_use_case_func(filtered_df, use_case_query)
|
| 37 |
filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
|
| 38 |
+
# Filtering by metric area
|
| 39 |
+
metric_area_maps = {
|
| 40 |
+
"Cost": ["Cost Band"],
|
| 41 |
+
"Accuracy": ["Accuracy", "Instruction Following", "Conciseness", "Completeness", "Factuality"],
|
| 42 |
+
"Speed (Latency)": ["Response Time (Sec)", "Mean Output Tokens"],
|
| 43 |
+
"Trust & Safety": ["Trust & Safety", "Safety", "Privacy", "Truthfulness", "CRM Bias"],
|
| 44 |
+
}
|
| 45 |
+
all_metric_cols = []
|
| 46 |
+
for area in metric_area_maps:
|
| 47 |
+
all_metric_cols = all_metric_cols + metric_area_maps[area]
|
| 48 |
+
|
| 49 |
+
columns_to_keep = list(set(columns).difference(set(all_metric_cols)))
|
| 50 |
+
for area in metric_area_query:
|
| 51 |
+
columns_to_keep = columns_to_keep + metric_area_maps[area]
|
| 52 |
+
columns = list(set(columns).intersection(set(columns_to_keep)))
|
| 53 |
+
|
| 54 |
df = select_columns(filtered_df, columns)
|
| 55 |
+
|
| 56 |
return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
|
| 57 |
|
| 58 |
|
|
|
|
| 78 |
use_case_area_query: list,
|
| 79 |
use_case_query: list,
|
| 80 |
use_case_type_query: list,
|
| 81 |
+
metric_area_query: list,
|
| 82 |
):
|
| 83 |
|
| 84 |
# Applying the style function
|
|
|
|
| 93 |
use_case_area_query,
|
| 94 |
use_case_query,
|
| 95 |
use_case_type_query,
|
| 96 |
+
metric_area_query,
|
| 97 |
)
|
| 98 |
|
| 99 |
|
|
|
|
| 252 |
filter_use_case_area.value,
|
| 253 |
filter_use_case.value,
|
| 254 |
filter_use_case_type.value,
|
| 255 |
+
filter_metric_area.value,
|
| 256 |
),
|
| 257 |
headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
|
| 258 |
datatype=TYPES,
|
|
|
|
| 277 |
filter_use_case_area,
|
| 278 |
filter_use_case,
|
| 279 |
filter_use_case_type,
|
| 280 |
+
filter_metric_area,
|
| 281 |
]:
|
| 282 |
selector.change(
|
| 283 |
update_table,
|
|
|
|
| 291 |
filter_use_case_area,
|
| 292 |
filter_use_case,
|
| 293 |
filter_use_case_type,
|
| 294 |
+
filter_metric_area,
|
| 295 |
],
|
| 296 |
leaderboard_table,
|
| 297 |
queue=True,
|
src/populate.py
CHANGED
|
@@ -21,8 +21,6 @@ def get_leaderboard_df_crm(
|
|
| 21 |
on="Use Case Name",
|
| 22 |
)
|
| 23 |
|
| 24 |
-
ref_df = leaderboard_accuracy_df[["Model Name", "LLM Provider"]].drop_duplicates()
|
| 25 |
-
|
| 26 |
leaderboard_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
|
| 27 |
leaderboard_cost_df = leaderboard_cost_df[~leaderboard_cost_df["Model Name"].isin(sf_finetuned_models)]
|
| 28 |
leaderboard_accuracy_df = leaderboard_accuracy_df.join(
|
|
@@ -61,8 +59,6 @@ def get_leaderboard_df_crm(
|
|
| 61 |
on=["Model Name"],
|
| 62 |
)
|
| 63 |
|
| 64 |
-
leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
|
| 65 |
-
|
| 66 |
leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
|
| 67 |
by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
|
| 68 |
)
|
|
|
|
| 21 |
on="Use Case Name",
|
| 22 |
)
|
| 23 |
|
|
|
|
|
|
|
| 24 |
leaderboard_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
|
| 25 |
leaderboard_cost_df = leaderboard_cost_df[~leaderboard_cost_df["Model Name"].isin(sf_finetuned_models)]
|
| 26 |
leaderboard_accuracy_df = leaderboard_accuracy_df.join(
|
|
|
|
| 59 |
on=["Model Name"],
|
| 60 |
)
|
| 61 |
|
|
|
|
|
|
|
| 62 |
leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
|
| 63 |
by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
|
| 64 |
)
|