Spaces:
Running
Running
Try analyze winscore with bokeh
Browse files- analyze_winscore.py +181 -0
- app.py +4 -8
- server.py +37 -0
analyze_winscore.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
|
| 3 |
+
import csv
|
| 4 |
+
import random
|
| 5 |
+
import numpy as np
|
| 6 |
+
from bokeh.plotting import figure
|
| 7 |
+
from bokeh.models import LabelSet, LogScale
|
| 8 |
+
from bokeh.palettes import Turbo256 # A color palette with enough colors
|
| 9 |
+
from bokeh.models import ColumnDataSource
|
| 10 |
+
|
| 11 |
+
def fit_curve(x, y, degree=1):
    """Fit a polynomial of the given degree to (x, y) and sample it.

    Args:
        x: x coordinates of the data points.
        y: y coordinates of the data points, aligned with *x*.
        degree: degree of the polynomial to fit (default: straight line).

    Returns:
        (x_fit, y_fit): 100 evenly spaced x values spanning [min(x), max(x)]
        and the fitted polynomial evaluated at those points.
    """
    model = np.poly1d(np.polyfit(x, y, degree))
    sample_xs = np.linspace(min(x), max(x), 100)
    return sample_xs, model(sample_xs)
|
| 19 |
+
|
| 20 |
+
def remove_outliers(x, y):
    """Split (x, y) points into inliers and outliers using the IQR rule.

    A point is kept only if BOTH of its coordinates lie within
    [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of the respective axis.

    Args:
        x: x coordinates (array-like).
        y: y coordinates (array-like), aligned with *x*.

    Returns:
        (x_in, y_in, x_out, y_out): numpy arrays of inlier and outlier
        coordinates, in the original order.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    def _within_fences(values):
        # Tukey's fences: 1.5 * IQR beyond the quartiles.
        q1, q3 = np.percentile(values, [25, 75])
        margin = 1.5 * (q3 - q1)
        return (values >= q1 - margin) & (values <= q3 + margin)

    keep = _within_fences(x) & _within_fences(y)
    return x[keep], y[keep], x[~keep], y[~keep]
|
| 44 |
+
|
| 45 |
+
def get_ldb_records(name_map, csv_file_path):
    """Load leaderboard rows from a CSV file, keyed by model title.

    Rows for "Qwen/Qwen2.5*" models are skipped. A row whose Model is not
    one of the titles in *name_map* raises KeyError, preserving the
    original strict-sanitisation contract (previously enforced by an
    identity mapping dict).

    Args:
        name_map: mapping of submission ids to display model titles; only
            its values are used, as the set of accepted model titles.
        csv_file_path: path to the leaderboard CSV (must have a 'Model' column).

    Returns:
        dict mapping model title -> CSV row (as a dict from csv.DictReader).

    Raises:
        KeyError: if a non-skipped row's Model is not in name_map's values.
    """
    known_titles = set(name_map.values())

    ldb_records = {}
    # newline='' is the csv-module-documented way to open CSV files; the
    # explicit encoding matters because columns include non-ASCII names
    # ('# θ (B)', 'Average ⬆️').
    with open(csv_file_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            model = row['Model']
            # This model family is deliberately excluded from the plot.
            if model.startswith("Qwen/Qwen2.5"):
                continue
            if model not in known_titles:
                # Keep the failure explicit rather than silently dropping
                # unexpected rows (matches the original KeyError behavior).
                raise KeyError(model)
            ldb_records[model] = row

    return ldb_records
|
| 58 |
+
|
| 59 |
+
def create_scatter_plot_with_curve_with_variances_named(category, variance_across_categories, x, y, sizes, model_names, ldb_records):
    """Build a bokeh scatter plot of model performance versus model size.

    Inliers and outliers (IQR rule via remove_outliers) are drawn as two
    separate glyph sets. Marker size encodes the variance of each model's
    per-category scores, marker shape encodes the model Type ('chat' ->
    circle, anything else -> triangle), per-point colors come from a
    shuffled Turbo256 sample, and a degree-1 trend line is fitted through
    the inliers only. The x axis uses a log scale with fixed ticks.

    Args:
        category: text used as the y-axis label.
        variance_across_categories: dict model name -> score variance.
        x: x coordinates (model sizes), aligned with model_names.
        y: y coordinates (performance scores), aligned with x.
        sizes: model sizes in billions of parameters (tooltip only).
        model_names: model names aligned with x/y.
        ldb_records: dict model name -> leaderboard row; must have 'Type'.

    Returns:
        A bokeh figure ready to embed.

    NOTE(review): the membership tests below use np.in1d(x, x_filtered),
    which assumes x values are unique per model — duplicate sizes would
    select the wrong model names/variances. Confirm against the caller.
    """
    FONTSIZE = 10

    # Split the data into inliers and outliers (IQR rule on both axes).
    x_filtered, y_filtered, x_outliers, y_outliers = remove_outliers(x, y)

    # Marker-size range that variances are linearly mapped onto.
    min_marker_size = 5
    max_marker_size = 30

    def scale_variance_to_size(variance):
        # Linear map: [min variance, max variance] -> [min, max marker size].
        # NOTE(review): divides by zero if all variances are equal — confirm
        # upstream guarantees at least two distinct variance values.
        return min_marker_size + (variance - min(variance_across_categories.values())) * (max_marker_size - min_marker_size) / (max(variance_across_categories.values()) - min(variance_across_categories.values()))

    # Variance lookup for a model name; unknown models fall back to 0.
    def get_variance_for_model(model_name):
        print(model_name)  # debug trace, left in place
        return variance_across_categories.get(model_name, 0)  # Default to 0 if model not found

    # Variances and derived marker sizes for the inlier points.
    filtered_variances = [get_variance_for_model(mname) for mname in np.array(model_names)[np.in1d(x, x_filtered)]]
    marker_sizes_filtered = [scale_variance_to_size(var) for var in filtered_variances]

    # Variances and derived marker sizes for the outlier points.
    outlier_variances = [get_variance_for_model(mname) for mname in np.array(model_names)[np.in1d(x, x_outliers)]]
    marker_sizes_outliers = [scale_variance_to_size(var) for var in outlier_variances]

    # Symbol per inlier point: circle for 'chat' models, triangle otherwise
    # (deterministic by Type, despite the original "random" naming).
    filtered_symbols = ['circle' if ldb_records[mname]['Type'] == 'chat' else 'triangle' for mname in np.array(model_names)[np.in1d(x, x_filtered)]]

    # Symbol per outlier point, same rule.
    outlier_symbols = ['circle' if ldb_records[mname]['Type'] == 'chat' else 'triangle' for mname in np.array(model_names)[np.in1d(x, x_outliers)]]

    # Sample Turbo256 evenly so there is roughly one distinct color per
    # model, then shuffle so neighbouring points get unrelated hues.
    stride = len(Turbo256) // len(model_names)
    color_palette = list(Turbo256[::stride])
    random.shuffle(color_palette)

    # One color per inlier point.
    filtered_colors = [color_palette[i % len(color_palette)] for i in range(len(x_filtered))]

    # Outlier colors continue where the inlier colors stopped.
    outlier_colors = [color_palette[(i + len(x_filtered)) % len(color_palette)] for i in range(len(x_outliers))]

    # Bokeh data source for the inlier points.
    source_filtered = ColumnDataSource(data={
        'x': x_filtered,
        'y': y_filtered,
        'sizes': np.array(sizes)[np.in1d(x, x_filtered)],  # original model sizes (tooltip)
        'marker_sizes': marker_sizes_filtered,  # marker sizes derived from variance
        'model_names': np.array(model_names)[np.in1d(x, x_filtered)],
        'variance': filtered_variances,  # shown in the tooltip
        'color': filtered_colors,
        'symbol': filtered_symbols
    })

    # Bokeh data source for the outlier points.
    source_outliers = ColumnDataSource(data={
        'x': x_outliers,
        'y': y_outliers,
        'sizes': np.array(sizes)[np.in1d(x, x_outliers)],  # original model sizes (tooltip)
        'marker_sizes': marker_sizes_outliers,  # marker sizes derived from variance
        'model_names': np.array(model_names)[np.in1d(x, x_outliers)],
        'variance': outlier_variances,  # shown in the tooltip
        'color': outlier_colors,
        'symbol': outlier_symbols
    })

    # Figure with hover tooltips; width/height/title/tools left at defaults.
    p = figure(#width=900, height=800, #title=f"{category} vs Model Size vs Variance Across Categories",
               #tools="pan,wheel_zoom,box_zoom,reset,save",
               tooltips=[("Model", "@model_names"),
                         ("Model Size (B parameters)", "@sizes"),
                         ("Variance", "@variance"),  # variance shown on hover
                         ("Performance", "@y")])

    # Inlier points: per-point color, symbol, and variance-scaled size.
    p.scatter('x', 'y', size='marker_sizes', source=source_filtered, fill_alpha=0.6, color='color', marker='symbol')

    # Outlier points, drawn the same way.
    p.scatter('x', 'y', size='marker_sizes', source=source_outliers, fill_alpha=0.6, color='color', marker='symbol')

    # Dashed trend line fitted through the inliers only.
    x_fit, y_fit = fit_curve(x_filtered, y_filtered, degree=1)

    p.line(x_fit, y_fit, line_color='gray', line_width=2, line_dash='dashed')

    # Model-name labels, slightly offset to reduce overlap with the markers.
    p.add_layout(LabelSet(x='x', y='y', text='model_names', source=source_filtered,
                          x_offset=5, y_offset=8, text_font_size=f"{FONTSIZE-4}pt", text_color='black'))

    p.add_layout(LabelSet(x='x', y='y', text='model_names', source=source_outliers,
                          x_offset=5, y_offset=8, text_font_size=f"{FONTSIZE-4}pt", text_color='black'))

    # Axis labels.
    p.xaxis.axis_label = 'Model Size (B parameters)'
    p.yaxis.axis_label = f'{category}'

    # Axis label font sizes.
    p.xaxis.axis_label_text_font_size = f"{FONTSIZE}pt"
    p.yaxis.axis_label_text_font_size = f"{FONTSIZE}pt"

    # Tick label font sizes.
    p.xaxis.major_label_text_font_size = f"{FONTSIZE}pt"
    p.yaxis.major_label_text_font_size = f"{FONTSIZE}pt"

    #p.x_range.start = 1
    #p.x_range.end = 18

    #p.y_range.end = 60

    # Log scale on the x axis.
    p.x_scale = LogScale()

    # Explicit x-axis tick positions.
    p.xaxis.ticker = [1,2,4,7,12,15]
    p.xaxis.axis_label_text_font_style = "normal"
    p.yaxis.axis_label_text_font_style = "normal"

    return p
|
| 180 |
+
|
| 181 |
+
# EOF
|
app.py
CHANGED
|
@@ -6,8 +6,6 @@ import gradio as gr
|
|
| 6 |
from gradio.themes.utils.sizes import text_md
|
| 7 |
from gradio_modal import Modal
|
| 8 |
|
| 9 |
-
from bokeh.plotting import figure
|
| 10 |
-
|
| 11 |
from content import (
|
| 12 |
HEADER_MARKDOWN,
|
| 13 |
LEADERBOARD_TAB_TITLE_MARKDOWN,
|
|
@@ -628,12 +626,10 @@ def gradio_app():
|
|
| 628 |
gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
|
| 629 |
|
| 630 |
with gr.Row():
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
fig.circle(x, y0, size=10, color="navy", alpha=0.5)
|
| 636 |
-
p1 = gr.Plot(value=fig, label='Plot 1')
|
| 637 |
|
| 638 |
with gr.Row():
|
| 639 |
leaderboard_category_of_tasks = gr.Dropdown(
|
|
|
|
| 6 |
from gradio.themes.utils.sizes import text_md
|
| 7 |
from gradio_modal import Modal
|
| 8 |
|
|
|
|
|
|
|
| 9 |
from content import (
|
| 10 |
HEADER_MARKDOWN,
|
| 11 |
LEADERBOARD_TAB_TITLE_MARKDOWN,
|
|
|
|
| 626 |
gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
|
| 627 |
|
| 628 |
with gr.Row():
|
| 629 |
+
gr.Plot(
|
| 630 |
+
value=leaderboard_server.get_bokeh_figure(),
|
| 631 |
+
label='Foo',
|
| 632 |
+
)
|
|
|
|
|
|
|
| 633 |
|
| 634 |
with gr.Row():
|
| 635 |
leaderboard_category_of_tasks = gr.Dropdown(
|
server.py
CHANGED
|
@@ -622,6 +622,43 @@ class LeaderboardServer:
|
|
| 622 |
dataframe.to_csv(filepath, index=False)
|
| 623 |
return filepath
|
| 624 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 625 |
def get_leaderboard_csv(self, pre_submit=None, category=None):
|
| 626 |
if pre_submit == None:
|
| 627 |
category = category if category else self.TASKS_CATEGORY_OVERALL
|
|
|
|
| 622 |
dataframe.to_csv(filepath, index=False)
|
| 623 |
return filepath
|
| 624 |
|
| 625 |
+
def get_bokeh_figure(self):
|
| 626 |
+
import numpy as np
|
| 627 |
+
from analyze_winscore import get_ldb_records, create_scatter_plot_with_curve_with_variances_named
|
| 628 |
+
|
| 629 |
+
#m = self.TASKS_METADATA
|
| 630 |
+
#tournament = self.tournament_results
|
| 631 |
+
name_map = self.submission_id_to_model_title
|
| 632 |
+
|
| 633 |
+
category = self.TASKS_CATEGORY_OVERALL
|
| 634 |
+
csv_file_path = self.leaderboard_dataframes_csv[category]
|
| 635 |
+
ldb_records = get_ldb_records(name_map, csv_file_path)
|
| 636 |
+
categories = self.TASKS_CATEGORIES
|
| 637 |
+
model_names = list(ldb_records.keys())
|
| 638 |
+
sizes = [float(ldb_records[model]['# θ (B)']) for model in model_names]
|
| 639 |
+
average_performance = [float(ldb_records[model]['Average ⬆️']) for model in model_names]
|
| 640 |
+
|
| 641 |
+
variances={}
|
| 642 |
+
for model, record in ldb_records.items():
|
| 643 |
+
r = [float(record[cat]) for cat in categories]
|
| 644 |
+
variances[model] = np.var(r)
|
| 645 |
+
|
| 646 |
+
print(variances)
|
| 647 |
+
print(min(variances.values()))
|
| 648 |
+
variance_across_categories = variances
|
| 649 |
+
|
| 650 |
+
fig = create_scatter_plot_with_curve_with_variances_named(
|
| 651 |
+
'Overall Duel Win Score',
|
| 652 |
+
variance_across_categories,
|
| 653 |
+
sizes,
|
| 654 |
+
average_performance,
|
| 655 |
+
sizes,
|
| 656 |
+
model_names,
|
| 657 |
+
ldb_records,
|
| 658 |
+
)
|
| 659 |
+
|
| 660 |
+
return fig
|
| 661 |
+
|
| 662 |
def get_leaderboard_csv(self, pre_submit=None, category=None):
|
| 663 |
if pre_submit == None:
|
| 664 |
category = category if category else self.TASKS_CATEGORY_OVERALL
|