juancauma committed
Commit 91c704c · 1 Parent(s): 9375c76

reverse the update, it broke everything

Files changed (2)
  1. app.py +38 -193
  2. requirements.txt +1 -2
app.py CHANGED
@@ -3,8 +3,6 @@ import pandas as pd
 import json
 import os
 from glob import glob
-import plotly.express as px
-import plotly.graph_objects as go
 
 def load_model_results():
     """Load all model results from JSON files in the submissions directory."""
@@ -34,63 +32,15 @@ def load_model_results():
 
     return results
 
-def create_metrics_plot(df):
-    """Create a radar chart for the top 5 models."""
-    top_5_df = df.head(5)
-    metrics = ['Conversation Consistency', 'Backend Consistency', 'Policy Completeness']
-
-    fig = go.Figure()
-    for _, row in top_5_df.iterrows():
-        fig.add_trace(go.Scatterpolar(
-            r=[row[metric] for metric in metrics],
-            theta=metrics,
-            name=row['Model'],
-            fill='toself'
-        ))
-
-    fig.update_layout(
-        polar=dict(
-            radialaxis=dict(
-                visible=True,
-                range=[0, 1]
-            )),
-        showlegend=True,
-        title="Top 5 Models Performance Comparison"
-    )
-    return fig
-
-def create_bar_chart(df):
-    """Create a bar chart comparing average scores."""
-    fig = px.bar(
-        df,
-        x='Model',
-        y='Average Score',
-        color='Average Score',
-        title="Model Performance Comparison",
-        color_continuous_scale='viridis'
-    )
-    fig.update_layout(
-        xaxis_tickangle=-45,
-        height=400
-    )
-    return fig
-
-def create_leaderboard(sort_by="Average Score", ascending=False):
-    """
-    Create and format the leaderboard DataFrame.
-
-    Args:
-        sort_by (str): Column to sort by
-        ascending (bool): Sort in ascending order if True, descending if False
-    """
+def create_leaderboard():
+    """Create and format the leaderboard DataFrame."""
     results = load_model_results()
     if not results:
-        empty_df = pd.DataFrame(columns=[
-            "Rank", "Model", "Average Score", "Conversation Consistency",
-            "Backend Consistency", "Policy Completeness",
-            "Total Dialogues", "Total Turns", "Judge Model", "Judge Client"
+        return pd.DataFrame(columns=[
+            "Model", "Conversation Consistency", "Backend Consistency",
+            "Policy Completeness", "Total Dialogues", "Total Turns",
+            "Judge Model", "Judge Client"
         ])
-        return empty_df, None, None
 
     df = pd.DataFrame(results)
     df = df.rename(columns={
@@ -104,158 +54,53 @@ def create_leaderboard(sort_by="Average Score", ascending=False):
         'judge_client': 'Judge Client'
     })
 
-    # Calculate average score
-    score_columns = ['Conversation Consistency', 'Backend Consistency', 'Policy Completeness']
-    df['Average Score'] = df[score_columns].mean(axis=1)
+    # Sort by average of the three main metrics
+    df['Average Score'] = df[['Conversation Consistency', 'Backend Consistency', 'Policy Completeness']].mean(axis=1)
+    df = df.sort_values('Average Score', ascending=False)
 
     # Round floating point numbers to 4 decimal places
-    float_columns = score_columns + ['Average Score']
+    float_columns = ['Conversation Consistency', 'Backend Consistency', 'Policy Completeness', 'Average Score']
     df[float_columns] = df[float_columns].round(4)
 
-    # Sort by specified column
-    df = df.sort_values(sort_by, ascending=ascending)
-
-    # Add rank column
-    df.insert(0, 'Rank', range(1, len(df) + 1))
-
-    # Reorder columns
-    columns = ['Rank', 'Model', 'Average Score'] + score_columns + [
-        'Total Dialogues', 'Total Turns', 'Judge Model', 'Judge Client'
-    ]
-    df = df[columns]
-
-    # Create visualizations
-    radar_chart = create_metrics_plot(df)
-    bar_chart = create_bar_chart(df)
-
-    return df, radar_chart, bar_chart
-
-def sort_leaderboard(sort_column):
-    """Sort the leaderboard by the specified column."""
-    current_sort = getattr(sort_leaderboard, 'current_sort', None)
-    current_ascending = getattr(sort_leaderboard, 'current_ascending', False)
-
-    if current_sort == sort_column:
-        sort_leaderboard.current_ascending = not current_ascending
-    else:
-        sort_leaderboard.current_ascending = False
-
-    sort_leaderboard.current_sort = sort_column
-    return create_leaderboard(sort_column, sort_leaderboard.current_ascending)
+    # Reorder columns to put Average Score after the main metrics
+    columns = ['Model', 'Conversation Consistency', 'Backend Consistency', 'Policy Completeness',
+               'Average Score', 'Total Dialogues', 'Total Turns', 'Judge Model', 'Judge Client']
+    return df[columns]
 
-with gr.Blocks(
-    title="Rome Leaderboard",
-    theme=gr.themes.Soft(
-        primary_hue="blue",
-        secondary_hue="indigo",
-    ),
-    css="""
-    .container {
-        max-width: 1200px;
-        margin: auto;
-    }
-    .title {
-        text-align: center;
-        margin-bottom: 1rem;
-    }
-    .metrics-description {
-        background-color: #f8f9fa;
-        padding: 1rem;
-        border-radius: 8px;
-        margin: 1rem 0;
-    }
-    .footer {
-        text-align: center;
-        margin-top: 2rem;
-        padding: 1rem;
-        background-color: #f8f9fa;
-        border-radius: 8px;
-    }
-    .visualization {
-        margin: 1rem 0;
-        padding: 1rem;
-        background-color: white;
-        border-radius: 8px;
-        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
-    }
-    """
-) as demo:
-    with gr.Column(elem_classes="container"):
-        gr.Markdown("# 🏆 Rome Model Evaluation Leaderboard", elem_classes="title")
-
-        with gr.Box(elem_classes="metrics-description"):
-            gr.Markdown("""
-            ### Evaluation Metrics
-            This leaderboard displays model performance across multiple evaluation metrics:
-            - **Conversation Consistency**: Measures the consistency in model conversations
-            - **Backend Consistency**: Evaluates backend interaction consistency
-            - **Policy Completeness**: Assesses completion of policy requirements
-
-            Models are ranked by their average score across these metrics. Click on any column header to sort by that metric.
-            The **Average Score** column shows the mean of all performance metrics.
-            """)
-
-        with gr.Row():
-            with gr.Column():
-                radar_plot = gr.Plot(label="Top 5 Models Comparison", elem_classes="visualization")
-            with gr.Column():
-                bar_plot = gr.Plot(label="Overall Performance", elem_classes="visualization")
-
-        with gr.Row():
-            leaderboard_display = gr.DataFrame(
-                headers=[
-                    "Rank", "Model", "Average Score", "Conversation Consistency",
-                    "Backend Consistency", "Policy Completeness", "Total Dialogues",
-                    "Total Turns", "Judge Model", "Judge Client"
-                ],
-                datatype=["number", "str", "number", "number", "number", "number",
-                          "number", "number", "str", "str"],
-                label="Model Rankings",
-                interactive=False,
-                wrap=True,
-                elem_classes="leaderboard"
-            )
-
-        with gr.Row(equal_height=True):
-            refresh_btn = gr.Button("🔄 Refresh Leaderboard", size="lg", variant="primary")
-            sort_options = gr.Dropdown(
-                choices=[
-                    "Average Score", "Conversation Consistency",
-                    "Backend Consistency", "Policy Completeness",
-                    "Total Dialogues", "Total Turns"
-                ],
-                value="Average Score",
-                label="Sort by",
-                interactive=True,
-                container=True
-            )
-
-        with gr.Box(elem_classes="footer"):
-            gr.Markdown("""
-            ### About the Leaderboard
-            This leaderboard is updated automatically when new model evaluations are submitted.
-            Refresh the page to see the latest results.
-            """)
-
-    # Handle sorting
-    sort_options.change(
-        fn=sort_leaderboard,
-        inputs=[sort_options],
-        outputs=[leaderboard_display, radar_plot, bar_plot]
+with gr.Blocks(title="Rome Leaderboard") as demo:
+    gr.Markdown("# 🏆 Rome Model Evaluation Leaderboard")
+    gr.Markdown("""
+    This leaderboard displays model performance across multiple evaluation metrics:
+    - **Conversation Consistency**: Measures the consistency in model conversations
+    - **Backend Consistency**: Evaluates backend interaction consistency
+    - **Policy Completeness**: Assesses completion of policy requirements
+
+    Models are ranked by their average score across these metrics.
+    """)
+
+    leaderboard_display = gr.DataFrame(
+        headers=[
+            "Model", "Conversation Consistency", "Backend Consistency",
+            "Policy Completeness", "Average Score", "Total Dialogues",
+            "Total Turns", "Judge Model", "Judge Client"
+        ],
+        datatype=["str", "number", "number", "number", "number",
+                  "number", "number", "str", "str"],
+        label="Model Rankings"
     )
 
-    # Handle refresh
+    refresh_btn = gr.Button("🔄 Refresh Leaderboard")
    refresh_btn.click(
-        fn=lambda: create_leaderboard(sort_options.value),
+        fn=create_leaderboard,
         inputs=None,
-        outputs=[leaderboard_display, radar_plot, bar_plot]
+        outputs=leaderboard_display
    )
 
    # Display initial leaderboard
    demo.load(
        fn=create_leaderboard,
        inputs=None,
-        outputs=[leaderboard_display, radar_plot, bar_plot]
+        outputs=leaderboard_display
    )
 
 if __name__ == "__main__":
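For reference, the sketch below illustrates the ranking pipeline this revert restores in create_leaderboard(): compute Average Score as the mean of the three metric columns, sort descending, round to four decimals, and reorder the columns. The model names, judge names, and scores are invented purely for illustration; only the column names and the average/sort/round/reorder steps come from the diff above, and the real rows are built from the JSON files that load_model_results() reads from the submissions directory.

import pandas as pd

# Hypothetical rows for illustration; real rows come from submission JSON files.
df = pd.DataFrame([
    {"Model": "model-a", "Conversation Consistency": 0.91, "Backend Consistency": 0.87,
     "Policy Completeness": 0.80, "Total Dialogues": 50, "Total Turns": 400,
     "Judge Model": "judge-x", "Judge Client": "client-y"},
    {"Model": "model-b", "Conversation Consistency": 0.78, "Backend Consistency": 0.92,
     "Policy Completeness": 0.85, "Total Dialogues": 50, "Total Turns": 380,
     "Judge Model": "judge-x", "Judge Client": "client-y"},
])

# Same steps as the restored create_leaderboard(): average, sort, round, reorder.
metrics = ["Conversation Consistency", "Backend Consistency", "Policy Completeness"]
df["Average Score"] = df[metrics].mean(axis=1)
df = df.sort_values("Average Score", ascending=False)
df[metrics + ["Average Score"]] = df[metrics + ["Average Score"]].round(4)
df = df[["Model"] + metrics + ["Average Score", "Total Dialogues", "Total Turns",
                               "Judge Model", "Judge Client"]]
print(df.to_string(index=False))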
requirements.txt CHANGED
@@ -1,3 +1,2 @@
 gradio>=4.0.0
-pandas>=2.0.0
-plotly>=5.18.0
+pandas>=2.0.0