ggcristian committed
Commit aaf0c71 · 1 Parent(s): e70391f

New Leaderboard Update


- The leaderboard can now be filtered first by task and then by the benchmarks belonging to that task (see the sketch below).
- We now display Aggregated Scores instead of the previous Average Scores.
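Roughly, the new selection flow is: pick a task, restrict the benchmark choices to that task, then pull the matching aggregate column from aggregated_scores.csv. A minimal, standalone sketch of that mapping (illustrative only; the list names mirror those introduced in app.py below, and agg_column_for is a hypothetical helper, not part of the commit):

s2r_benchs = ["VerilogEval S2R", "RTLLM"]              # Spec-to-RTL benchmarks
cc_benchs = ["VerilogEval MC", "VeriGen", "RTL-Repo"]  # Code Completion benchmarks

def agg_column_for(task, benchmark):
    # "All" benchmarks -> per-task aggregate; otherwise the per-benchmark aggregate
    if benchmark == "All":
        return "Agg S2R" if task == "Spec-to-RTL" else "Agg MC"
    per_bench = {
        "VerilogEval S2R": "Agg VerilogEval S2R",
        "VerilogEval MC": "Agg VerilogEval MC",
        "RTLLM": "Agg RTLLM",
        "VeriGen": "Agg VeriGen",
    }
    return per_bench.get(benchmark)  # RTL-Repo keeps Exact Matching and has no aggregate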

Files changed (5)
  1. aggregated_scores.csv +22 -0
  2. app.py +91 -34
  3. css_html_js.py +4 -1
  4. parse.py +11 -2
  5. utils.py +32 -12
aggregated_scores.csv ADDED
@@ -0,0 +1,22 @@
+ Model,Agg S2R,Agg MC,Agg VerilogEval S2R,Agg VerilogEval MC,Agg RTLLM,Agg VeriGen
+ DeepSeek R1,74.84,75.51,77.01,77.81,68.06,54.4
+ Llama 3.1 405B,49.72,42.8,53.98,42.92,36.43,41.67
+ Llama 3.(1-3) 70B,39.0,38.49,38.64,37.45,40.12,48.05
+ Qwen2.5 72B,49.23,48.82,49.17,51.22,49.45,26.75
+ Qwen2.5 32B,50.58,40.73,50.53,41.85,50.71,30.46
+ StarChat2 15B v0.1,39.04,38.9,37.45,37.69,44.0,49.99
+ DeepSeek R1 Distill Qwen 14B,22.98,23.61,23.21,23.47,22.27,24.91
+ CodeLlama 70B,31.46,31.29,34.17,29.8,22.99,44.96
+ QwenCoder 2.5 32B,42.53,43.71,42.27,43.96,43.33,41.4
+ DeepSeek Coder 33B,25.71,36.47,19.49,37.25,45.11,29.29
+ QwenCoder 2.5 14B,36.75,38.49,35.61,39.03,40.33,33.55
+ OpenCoder 8B,31.13,34.76,27.12,34.55,43.63,36.67
+ QwenCoder 2.5 7B,13.86,32.31,6.31,31.75,37.41,37.47
+ "DeepSeek Coder 6,7B",31.6,30.03,28.69,30.41,40.67,26.61
+ RTLCoder Mistral,21.86,27.2,22.73,26.21,19.15,36.3
+ RTLCoder DeepSeek,32.21,37.6,31.75,37.47,33.64,38.81
+ OriGen,37.22,41.29,46.0,41.97,9.82,35.07
+ HaVen-CodeQwen,41.66,46.09,42.97,46.57,37.55,41.74
+ CodeV-CL-7B,28.19,35.7,25.75,35.39,35.79,38.53
+ CodeV-QW-7B,20.79,47.26,18.73,50.28,27.23,19.55
+ CodeV-DS-6.7B,18.19,44.1,14.28,47.05,30.39,17.03
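Each leaderboard view now merges the matching "Agg …" column from this file onto the per-benchmark score pivot instead of computing a row-wise mean. A minimal sketch of that join (pivot_df here is a hypothetical stand-in for the pivot built in utils.py below; only the merge and sort are the point):

import pandas as pd

df_agg = pd.read_csv("aggregated_scores.csv")

# Hypothetical pivot of per-benchmark scores, keyed by model name.
pivot_df = pd.DataFrame({"Model": df_agg["Model"]})

agg_column = "Agg S2R"  # chosen from the current task/benchmark selection
agg_data = df_agg[["Model", agg_column]].rename(columns={agg_column: "Aggregated ⬆️"})
pivot_df = pivot_df.merge(agg_data, on="Model", how="left")
pivot_df = pivot_df.sort_values(by="Aggregated ⬆️", ascending=False).reset_index(drop=True)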
app.py CHANGED
@@ -3,8 +3,8 @@ import pandas as pd
  import gradio as gr
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  from css_html_js import custom_css, trigger_plot
- from parse import read_json, read_data
- from utils import model_hyperlink, filter_RTLRepo, filter_bench, filter_bench_all, handle_special_cases
+ from parse import read_json, read_data, parse_agg
+ from utils import model_hyperlink, filter_RTLRepo, filter_bench, filter_bench_all, handle_special_cases, type_emoji
  from typing import Union
  from about import CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
  import numpy as np
@@ -12,12 +12,26 @@ import plotly.graph_objects as go
  import plotly.express as px
  from gradio.themes.utils import colors
 
- def filter_leaderboard(benchmark, model_type, search_query, max_params):
+ def filter_leaderboard(task, benchmark, model_type, search_query, max_params):
      subset = df.copy()
+
+     # Filter by task-specific benchmarks when 'All' benchmarks is selected
+     if task == "Spec-to-RTL":
+         valid_benchmarks = s2r_benchs
+         if benchmark == 'All':
+             subset = subset[subset['Benchmark'].isin(valid_benchmarks)]
+     elif task == "Code Completion":
+         valid_benchmarks = cc_benchs
+         if benchmark == 'All':
+             subset = subset[subset['Benchmark'].isin(valid_benchmarks)]
+
+     # Apply benchmark filter if not "All"
      if benchmark != 'All':
          subset = df[df['Benchmark'] == benchmark]
+
+     # Apply other filters
      if model_type != 'All':
-         model_type = model_type.split()[0]
+         # For dropdown without emojis
          subset = subset[subset['Model Type'] == model_type]
      if search_query:
          subset = subset[subset['Model'].str.contains(search_query, case=False, na=False)]
@@ -25,11 +39,33 @@ def filter_leaderboard(benchmark, model_type, search_query, max_params):
          subset = subset[subset['Params'] <= max_params]
 
      if benchmark == 'All':
-         return filter_bench_all(subset)
+         if task == 'Spec-to-RTL':
+             return filter_bench_all(subset, df_agg, agg_column='Agg S2R')
+         elif task == 'Code Completion':
+             return filter_bench_all(subset, df_agg, agg_column='Agg MC')
      elif benchmark == 'RTL-Repo':
          return filter_RTLRepo(subset)
      else:
-         return filter_bench(subset)
+         # Pass the specific benchmark aggregate column
+         agg_column = None
+         if benchmark == 'VerilogEval S2R':
+             agg_column = 'Agg VerilogEval S2R'
+         elif benchmark == 'VerilogEval MC':
+             agg_column = 'Agg VerilogEval MC'
+         elif benchmark == 'RTLLM':
+             agg_column = 'Agg RTLLM'
+         elif benchmark == 'VeriGen':
+             agg_column = 'Agg VeriGen'
+
+         return filter_bench(subset, df_agg, agg_column)
+
+ def update_benchmarks_by_task(task):
+     if task == "Spec-to-RTL":
+         return gr.update(choices=["All"] + s2r_benchs, value="All")
+     elif task == "Code Completion":
+         return gr.update(choices=["All"] + cc_benchs, value="All")
+     else:
+         return gr.update(choices=["All"] + benchmarks, value="All")
 
  def generate_scatter_plot(benchmark, metric):
      benchmark, metric = handle_special_cases(benchmark, metric)
@@ -50,7 +86,7 @@ def generate_scatter_plot(benchmark, metric):
      scatter_data['x'] = scatter_data['Params']
      scatter_data['y'] = scatter_data[metric]
      scatter_data['size'] = (scatter_data['x'] ** 0.3) * 40
-
+
      type_colors = {"General": "green", "Coding": "yellow", "RTL-Specific": "blue"}
      scatter_data['color'] = scatter_data['Model Type'].map(type_colors).fillna('gray')
 
@@ -65,7 +101,10 @@ def generate_scatter_plot(benchmark, metric):
          scatter_data, x='x', y='y', log_x=True, size='size', color='Model Type', text='Model',
          hover_data={metric: ':.2f'}, title=f'Params vs. {metric} for {benchmark}',
          labels={'x': '# Params (Log Scale)', 'y': metric}, template="plotly_white",
-         # color_discrete_map={"General": "#A8D5BA", "Coding": "#F7DC6F", "RTL-Specific": "#87CEFA"},
+         # color_discrete_map={"General": "
+         #A8D5BA", "Coding": "
+         #F7DC6F", "RTL-Specific": "
+         #87CEFA"},
          height=600, width=1200
      )
 
@@ -98,9 +137,14 @@ function refresh() {
 
  with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=colors.emerald)) as app:
      df, benchmarks, metrics, default_metric = read_data()
+     df_agg = parse_agg("./aggregated_scores.csv")
+     tasks = ["Spec-to-RTL", "Code Completion"]
+     s2r_benchs = ["VerilogEval S2R", "RTLLM"]
+     cc_benchs = ["VerilogEval MC", "VeriGen", "RTL-Repo"]
      rtl_metrics = ["Exact Matching (EM)"]
-     non_rtl_metrics = ["Syntax (STX)", "Functionality (FNC)", "Synthesis (SYN)", "Power", "Performance", "Area"]
-     # gr.Markdown("""# TuRTLe 🐢 Model Leaderboard""")
+     non_rtl_metrics = ["Syntax (STX)", "Functionality (FNC)", "Synthesis (SYN)", "Power", "Performance", "Area", "Aggregated ⬆️"]
+     model_types = ['All', 'General', 'Coding', 'RTL-Specific']
+
      gr.HTML("""
          <p align="center" style="margin-bottom: -10px;">
              <img src='/gradio_api/file=logo.png' alt='TuRTLe Logo' width='220'/> <br/>
@@ -135,22 +179,32 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
      """)
      with gr.Tabs():
          with gr.Tab("Leaderboard"):
-             with gr.Row():
-                 benchmark_radio = gr.Radio(choices=["All"] + benchmarks, label="Select Benchmark", value='VerilogEval S2R', scale=6)
-                 model_type_radio = gr.Radio(choices=['All', 'General 🟢', 'Coding 🔵', 'RTL-Specific 🔴'], label="Select Model Type", value='All', scale=4)
+             with gr.Row(equal_height=True):
+                 with gr.Column(scale=1):
+                     task_radio = gr.Radio(choices=tasks, label="Select Task", value='Spec-to-RTL')
+                 with gr.Column(scale=1.75):
+                     benchmark_radio = gr.Radio(choices=["All"] + s2r_benchs, label="Select Benchmark", value='All')
 
-             with gr.Row():
-                 search_box = gr.Textbox(label="Search Model", placeholder="Type model name...")
-                 params_slider = gr.Slider(
-                     minimum=df['Params'].min(),
-                     maximum=700,
-                     value=700,
-                     label="Max Params",
-                     step=1
-                 )
+             with gr.Row(equal_height=True):
+                 with gr.Column(scale=1.9):
+                     search_box = gr.Textbox(label="Search Model", placeholder="Type model name...")
+                 with gr.Column(scale=1):
+                     model_type_dropdown = gr.Dropdown(
+                         choices=model_types,
+                         label="Select Model Type",
+                         value='All'
+                     )
+                 with gr.Column(scale=2):
+                     params_slider = gr.Slider(
+                         minimum=df['Params'].min(),
+                         maximum=700,
+                         value=700,
+                         label="Max Params",
+                         step=1
+                     )
 
              leaderboard = gr.DataFrame(
-                 value=filter_leaderboard('VerilogEval S2R', 'All', "", 700),
+                 value=filter_leaderboard('Spec-to-RTL', 'All', 'All', "", 700),
                  headers="first row",
                  show_row_numbers=True,
                  wrap=True,
@@ -159,9 +213,9 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
                  column_widths=["7%", "25%", "10%", "17%", "6%", "6%", "6%", "6%", "6%", "7%"]),
 
          with gr.Tab("Interactive Bubble Plot"):
-             with gr.Row():
+             with gr.Row(equal_height=True):
                  bubble_benchmark = gr.Radio(choices=benchmarks, label="Select Benchmark", value='VerilogEval S2R')
-                 bubble_metric = gr.Radio(choices=non_rtl_metrics, label="Select Metric", value="Syntax (STX)")
+                 bubble_metric = gr.Radio(choices=non_rtl_metrics[:-1], label="Select Metric", value="Syntax (STX)")
              scatter_plot = gr.Plot(value=generate_scatter_plot('VerilogEval S2R', default_metric), label="Bubble Chart", elem_id="full-width-plot")
 
          with gr.Tab("About Us"):
@@ -202,22 +256,25 @@ with gr.Blocks(css=custom_css, js=js_func, theme=gr.themes.Default(primary_hue=c
      )
 
      # event handlers, ugly way but it works
-     benchmark_radio.change(fn=filter_leaderboard, inputs=[benchmark_radio, model_type_radio, search_box, params_slider], outputs=leaderboard)
-     model_type_radio.change(fn=filter_leaderboard, inputs=[benchmark_radio, model_type_radio, search_box, params_slider], outputs=leaderboard)
-     search_box.change(fn=filter_leaderboard, inputs=[benchmark_radio, model_type_radio, search_box, params_slider], outputs=leaderboard)
-     params_slider.change(fn=filter_leaderboard, inputs=[benchmark_radio, model_type_radio, search_box, params_slider], outputs=leaderboard)
+     task_radio.change(
+         fn=update_benchmarks_by_task,
+         inputs=[task_radio],
+         outputs=[benchmark_radio]
+     )
+     task_radio.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
+     benchmark_radio.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
+     model_type_dropdown.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
+     search_box.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
+     params_slider.change(fn=filter_leaderboard, inputs=[task_radio, benchmark_radio, model_type_dropdown, search_box, params_slider], outputs=leaderboard)
 
-     # RTL-Repo Bubble plot handlres
+     # RTL-Repo Bubble plot
      def on_benchmark_change(benchmark, _):
          if benchmark == "RTL-Repo":
              metric = "Exact Matching (EM)"
              return gr.update(choices=rtl_metrics, value=metric), generate_scatter_plot(benchmark, metric)
          else:
              metric = non_rtl_metrics[0] # default to Syntax
-             return gr.update(choices=non_rtl_metrics, value=metric), generate_scatter_plot(benchmark, metric)
-             # benchmark, metric = handle_special_cases(benchmark, metric)
-             # fig = generate_scatter_plot(benchmark, metric)
-             # return gr.update(value=metric), fig
+             return gr.update(choices=non_rtl_metrics[:-1], value=metric), generate_scatter_plot(benchmark, metric)
 
      def on_metric_change(benchmark, metric):
          benchmark, metric = handle_special_cases(benchmark, metric)
css_html_js.py CHANGED
@@ -106,7 +106,10 @@ custom_css = """
      padding:0.5;
  }
  #box-filter > .form{
-     border: 0
+     border: 0;
+ }
+ .slider_input_container {
+     padding-top: 8px;
  }
  """
parse.py CHANGED
@@ -30,14 +30,16 @@ model_details = {
      "OriGen": ("https://huggingface.co/henryen/OriGen_Fix", 6.74, "RTL-Specific")
  }
 
- def get_headers(reader) -> Union[list, list]:
+ def get_headers(reader, agg=False) -> Union[list, list]:
      metrics, benchs = [], []
      for i, row in enumerate(reader):
          if i == 0:
              metrics = row[1:]
-         elif i == 1:
+         elif i == 1 and not agg:
              benchs = row[1:]
              break
+         else:
+             return metrics
      return metrics, benchs
 
  def get_model_params_and_url(model) -> Union[str, str, float]:
@@ -80,6 +82,13 @@ def parse_results(csv_path: str) -> list[dict]:
      print(models)
      return dataset
 
+ def parse_agg(csv_path: str) -> list[dict]:
+     """
+     Each row has the following format:
+     MODEL | BENCHMARK | TASK | METRIC | RESULT
+     """
+     return pd.read_csv("aggregated_scores.csv")
+
  def writeJson(data: list):
      with open('results.json', 'w') as f:
          json.dump(data, f, indent=4, ensure_ascii=False)
utils.py CHANGED
@@ -28,30 +28,51 @@ def filter_RTLRepo(subset: pd.DataFrame) -> pd.DataFrame:
      filtered_df['Type'] = filtered_df['Model Type'].map(lambda x: type_emoji.get(x, ""))
      filtered_df = filtered_df[['Type', 'Model', 'Params', 'Exact Matching (EM)']]
      filtered_df = filtered_df.sort_values(by='Exact Matching (EM)', ascending=False).reset_index(drop=True)
-     # filtered_df.insert(0, '', range(1, len(filtered_df) + 1))
      return filtered_df
 
- def filter_bench(subset: pd.DataFrame) -> pd.DataFrame:
+ def filter_bench(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
      details = subset[['Model', 'Model URL', 'Model Type', 'Params']].drop_duplicates('Model')
      pivot_df = subset.pivot_table(index='Model', columns='Metric', values='Score', aggfunc='mean').reset_index()
-     pivot_df['Average ⬆️'] = pivot_df.mean(axis=1, numeric_only=True).round(2)
+
+     if df_agg is not None and agg_column is not None and agg_column in df_agg.columns:
+         agg_data = df_agg[['Model', agg_column]].rename(columns={agg_column: 'Aggregated ⬆️'})
+         pivot_df = pd.merge(pivot_df, agg_data, on='Model', how='left')
+     else:# fallback
+         pivot_df['Aggregated ⬆️'] = pivot_df.mean(axis=1, numeric_only=True).round(2)
+
      pivot_df = pd.merge(pivot_df, details, on='Model', how='left')
      pivot_df['Model'] = pivot_df.apply(lambda row: model_hyperlink(row["Model URL"], row["Model"]), axis=1)
      pivot_df['Type'] = pivot_df['Model Type'].map(lambda x: type_emoji.get(x, ""))
      pivot_df.rename(columns={'Syntax (STX)': 'STX', 'Functionality (FNC)': 'FNC', 'Synthesis (SYN)': 'SYN', 'Performance': 'Perf'}, inplace=True)
-     columns_order = ['Type', 'Model', 'Params', 'Average ⬆️', 'STX', 'FNC', 'SYN', 'Power', 'Perf', 'Area']
+
+     columns_order = ['Type', 'Model', 'Params', 'Aggregated ⬆️', 'STX', 'FNC', 'SYN', 'Power', 'Perf', 'Area']
      pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
-     pivot_df = pivot_df.sort_values(by='Average ⬆️', ascending=False).reset_index(drop=True)
-     # pivot_df.insert(0, '', range(1, len(pivot_df) + 1))
+     pivot_df = pivot_df.sort_values(by='Aggregated ⬆️', ascending=False).reset_index(drop=True)
      return pivot_df
 
- def filter_bench_all(subset: pd.DataFrame) -> pd.DataFrame:
+ def filter_bench_all(subset: pd.DataFrame, df_agg=None, agg_column=None) -> pd.DataFrame:
      details = subset[['Model', 'Model URL', 'Model Type', 'Params']].drop_duplicates('Model')
      pivot_df = subset.pivot_table(index='Model', columns='Metric', values='Score', aggfunc='mean').reset_index().round(2)
-     pivot_df['Average ⬆️'] = pivot_df.mean(axis=1, numeric_only=True).round(2)
+
+     if df_agg is not None:
+         if agg_column is not None and agg_column in df_agg.columns:
+             agg_data = df_agg[['Model', agg_column]].rename(columns={agg_column: 'Aggregated ⬆️'})
+             pivot_df = pd.merge(pivot_df, agg_data, on='Model', how='left')
+         else:
+             agg_columns = [col for col in df_agg.columns if col.startswith('Agg ')]
+             if agg_columns:
+                 df_agg['Average_Agg'] = df_agg[agg_columns].mean(axis=1)
+                 agg_data = df_agg[['Model', 'Average_Agg']].rename(columns={'Average_Agg': 'Aggregated ⬆️'})
+                 pivot_df = pd.merge(pivot_df, agg_data, on='Model', how='left')
+             else: # fallback
+                 pivot_df['Aggregated ⬆️'] = pivot_df.mean(axis=1, numeric_only=True).round(2)
+     else: # fallback
+         pivot_df['Aggregated ⬆️'] = pivot_df.mean(axis=1, numeric_only=True).round(2)
+
      pivot_df = pd.merge(pivot_df, details, on='Model', how='left')
      pivot_df['Model'] = pivot_df.apply(lambda row: model_hyperlink(row["Model URL"], row["Model"]), axis=1)
      pivot_df['Type'] = pivot_df['Model Type'].map(lambda x: type_emoji.get(x, ""))
+
      pivot_df.rename(columns={
          'Exact Matching (EM)': 'EM',
          'Syntax (STX)': 'Avg STX',
@@ -61,9 +82,8 @@ def filter_bench_all(subset: pd.DataFrame) -> pd.DataFrame:
          'Performance': 'Avg Perf',
          'Area': 'Avg Area',
      }, inplace=True)
-     # columns_order = ['Type', 'Model', 'Params', 'Average ⬆️', 'Avg STX', 'Avg FNC', 'Avg SYN', 'Avg Power', 'Avg Perf', 'Avg Area']
-     columns_order = ['Type', 'Model', 'Params', 'Average ⬆️', 'Avg STX', 'Avg FNC', 'Avg SYN', 'Avg Power', 'Avg Perf', 'Avg Area']
+
+     columns_order = ['Type', 'Model', 'Params', 'Aggregated ⬆️', 'Avg STX', 'Avg FNC', 'Avg SYN', 'Avg Power', 'Avg Perf', 'Avg Area']
      pivot_df = pivot_df[[col for col in columns_order if col in pivot_df.columns]]
-     pivot_df = pivot_df.sort_values(by='Average ⬆️', ascending=False).reset_index(drop=True)
-     # pivot_df.insert(0, '', range(1, len(pivot_df) + 1))
+     pivot_df = pivot_df.sort_values(by='Aggregated ⬆️', ascending=False).reset_index(drop=True)
      return pivot_df