rzanoli committed
Commit 5888550 · 1 Parent(s): 338193d

Small changes

Files changed (3)
  1. app.py +48 -6
  2. src/display/utils.py +5 -4
  3. src/leaderboard/read_evals.py +18 -14
app.py CHANGED
@@ -10,6 +10,7 @@ from src.display.utils import BENCHMARK_COLS, COLS, EVAL_COLS, EVAL_TYPES, AutoE
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+import random
 
 # Define task metadata (icons, names, descriptions)
 TASK_METADATA_MULTIPLECHOICE = {
@@ -35,7 +36,10 @@ def restart_space():
 
 
 def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
-    """Initialize and return a leaderboard."""
+    """
+    Initialize and return the leaderboard when it is first loaded or when 'benchmark' is selected.
+    The table is sorted based on the "Avg. Combined Performance" field.
+    """
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
 
@@ -52,13 +56,50 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)")
+            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+            #ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)",
+            #             default=[["0️⃣", "0️⃣"]]),
             # ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
         ],
-        bool_checkboxgroup_label="Hide models",
+        #filter_columns=[
+        #    ColumnFilter("IS_FS", type="checkbox", default=False, label="5-Few-Shot")
+        #    #ColumnFilter("FS", type="dropdown", label="5-Few-Shot")
+        #],
+        bool_checkboxgroup_label="Evaluation Mode",
         interactive=False,
     )
 
+def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
+    """
+    Update and return the leaderboard when a specific task is selected.
+    The table is sorted based on the "Combined Performance" field.
+    """
+    if dataframe is None or dataframe.empty:
+        raise ValueError("Leaderboard DataFrame is empty or None.")
+
+    sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False)
+
+    #print(sorted_dataframe['Combined Performance'])
+
+    field_list = fields(AutoEvalColumn)
+
+    return Leaderboard(
+        value=sorted_dataframe,
+        datatype=[c.type for c in field_list],
+        #select_columns=SelectColumns(
+        #    default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
+        #    cant_deselect=[c.name for c in field_list if c.never_hidden],
+        #    label="Select Columns to Display:",
+        #),
+        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
+        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
+        filter_columns=[
+            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
+        ],
+        bool_checkboxgroup_label="Evaluation Mode",
+        interactive=False
+    )
+
 '''
 # Helper function for leaderboard initialization
 def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
@@ -137,7 +178,7 @@ with demo:
             task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
             gr.Markdown(task_description, elem_classes="markdown-text")
 
-            leaderboard = init_leaderboard(
+            leaderboard = update_task_leaderboard(
                 LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
                 default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
                 hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
@@ -153,7 +194,7 @@ with demo:
             task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
             gr.Markdown(task_description, elem_classes="markdown-text")
 
-            leaderboard = init_leaderboard(
+            leaderboard = update_task_leaderboard(
                 LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
                                                f"{task} Best Prompt": "Best Prompt",
                                                f"{task} Best Prompt Id": "Best Prompt Id",
@@ -175,4 +216,5 @@ scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
 
 # Launch the app with concurrent queueing
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch(debug=True,  # Enable Gradio debug mode
+                                                show_error=True)
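Note on the new helper: update_task_leaderboard duplicates init_leaderboard except that it pre-sorts the DataFrame with pandas before handing it to the Leaderboard widget, so each task tab opens ranked by that task's "Combined Performance" column. A minimal sketch of just the sorting step, using a hypothetical three-row stand-in for the renamed LEADERBOARD_DF slice:

import pandas as pd

# Hypothetical miniature of LEADERBOARD_DF after the per-task column rename;
# the real frame carries many more columns (FS, license, params, ...).
df = pd.DataFrame({
    "Model": ["model-a", "model-b", "model-c"],
    "Combined Performance": [71.2, 84.5, 63.0],
})

# Same call as in update_task_leaderboard: highest score first.
sorted_df = df.sort_values(by="Combined Performance", ascending=False)
print(sorted_df["Model"].tolist())  # ['model-b', 'model-a', 'model-c']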
src/display/utils.py CHANGED
@@ -25,7 +25,8 @@ auto_eval_column_dict = []
 # Init
 #auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 
-auto_eval_column_dict.append(["fewshot_type", ColumnContent, ColumnContent("FS", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["fewshot_symbol", ColumnContent, ColumnContent("FS", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["is_5fewshot", ColumnContent, ColumnContent("IS_FS", "bool", True)])
 
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #auto_eval_column_dict.append(["fewshot", ColumnContent, ColumnContent("Few-Shot", "str", True)])
@@ -103,11 +104,11 @@ class FewShotType(Enum):
         return f"{self.value.symbol}{separator}{self.value.name}"
 
     @staticmethod
-    def from_num_fewshot(num_fewshot):
+    def from_num_fewshot(is_5fewshot):
         """Determines FewShotType based on num_fewshot."""
-        if num_fewshot == 0:
+        if is_5fewshot is False:
             return FewShotType.ZS
-        if num_fewshot == 5:
+        elif is_5fewshot is True:
             return FewShotType.FS
         return FewShotType.Unknown
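For context, these auto_eval_column_dict entries follow the standard HF leaderboard-template pattern, where the list is later turned into a frozen AutoEvalColumn dataclass. The sketch below assumes the template's usual ColumnContent definition and its custom fields() helper (neither appears in this diff, so both are inferred from how app.py consumes them):

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)  # frozen instances are hashable, so they can serve as field defaults
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

# The two columns added by this commit: a display symbol and a boolean flag.
auto_eval_column_dict = [
    ["fewshot_symbol", ColumnContent, ColumnContent("FS", "str", True, never_hidden=True)],
    ["is_5fewshot", ColumnContent, ColumnContent("IS_FS", "bool", True)],
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

def fields(raw_class):
    # Unlike dataclasses.fields, this returns the ColumnContent defaults,
    # which is what app.py iterates over (c.name, c.type, c.hidden, ...).
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

print(AutoEvalColumn.fewshot_symbol.name)        # FS
print([c.name for c in fields(AutoEvalColumn)])  # ['FS', 'IS_FS']

Registering is_5fewshot with type "bool" is what would let the commented-out ColumnFilter("IS_FS", type="checkbox", ...) in app.py render as a single checkbox rather than a checkbox group.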
src/leaderboard/read_evals.py CHANGED
@@ -25,8 +25,8 @@ class EvalResult:
     revision: str  # commit hash, "" if main
     results: Dict[str, Union[float, int]]  # float or int
     average_CPS: float
-    fewshot: int
-    fewshot_type: FewShotType = FewShotType.Unknown
+    is_5fewshot: bool
+    fewshot_symbol: FewShotType = FewShotType.Unknown
     weight_type: WeightType = WeightType.Original  # Original or Adapter
     architecture: str = "Unknown"
     license: str = "?"
@@ -47,13 +47,17 @@ class EvalResult:
         # Get average_CPS as a float
         average_CPS = float(data.get('average_CPS', 0.0))  # 0.0 as the default value
 
-        num_fewshot = config.get("num_fewshot", 0)  # Default to 0
+        fewshot = config.get("num_fewshot", False)  # Default to False
+
         try:
-            num_fewshot = int(num_fewshot)  # Convert to int if possible
+            if fewshot == "5":
+                is_5fewshot = True
+            else:
+                is_5fewshot = False  # anything other than the string "5" counts as zero-shot
         except ValueError:
-            num_fewshot = 0  # If the conversion fails, assign 0
+            is_5fewshot = False  # If the check fails, fall back to False
         # Determine the few-shot type (ZS or FS) based on num_fewshot
-        fewshot_type = FewShotType.from_num_fewshot(num_fewshot)  # Use the new
+        fewshot_symbol = FewShotType.from_num_fewshot(is_5fewshot)  # Use the new
 
         num_params = int(0)
         num_params_billion = config.get("num_params_billion")
@@ -68,12 +72,12 @@ class EvalResult:
            org = None
            model = org_and_model[0]
            #result_key = f"{model}_{precision.value.name}"
-           result_key = f"{model}_{num_fewshot}"
+           result_key = f"{model}_{is_5fewshot}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
            #result_key = f"{org}_{model}_{precision.value.name}"
-           result_key = f"{org}_{model}_{num_fewshot}"
+           result_key = f"{org}_{model}_{is_5fewshot}"
        full_model = "/".join(org_and_model)
 
        still_on_hub, _, model_config = is_model_on_hub(
@@ -107,8 +111,8 @@ class EvalResult:
            model=model,
            results=results,
            average_CPS=average_CPS,
-           fewshot_type=fewshot_type,
-           fewshot=num_fewshot,
+           fewshot_symbol=fewshot_symbol,
+           is_5fewshot=is_5fewshot,
            revision= config.get("model_sha", ""),
            still_on_hub=still_on_hub,
            architecture=architecture,
@@ -137,8 +141,8 @@ class EvalResult:
        """Converts the Eval Result to a dict compatible with our dataframe display"""
        average = self.average_CPS
 
-       fewshot_type_symbol = (
-           self.fewshot_type.value.symbol if isinstance(self.fewshot_type, FewShotType) else "❓"
+       fewshot_symbol = (
+           self.fewshot_symbol.value.symbol if isinstance(self.fewshot_symbol, FewShotType) else "❓"
        )
 
        data_dict = {
@@ -148,13 +152,13 @@
            #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
            #AutoEvalColumn.model_type.name: self.model_type.value.name if self.model_type else "Unknown",
            #AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol if self.model_type else "Unknown",
-           AutoEvalColumn.fewshot_type.name: fewshot_type_symbol,  # Correct symbol for the fewshot type
+           AutoEvalColumn.fewshot_symbol.name: fewshot_symbol,
            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
            AutoEvalColumn.architecture.name: self.architecture,
            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
            AutoEvalColumn.revision.name: self.revision,
            AutoEvalColumn.average.name: average,
-           #AutoEvalColumn.fewshot.name: fewshot,
+           AutoEvalColumn.is_5fewshot.name: self.is_5fewshot,
            AutoEvalColumn.license.name: self.license,
            AutoEvalColumn.likes.name: self.likes,
            AutoEvalColumn.params.name: self.num_params,
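One caveat on the parsing above: the equality test fewshot == "5" never raises ValueError (so the except branch is now dead code), and it only recognizes the string "5"; a result config that stores the integer 5 would be classified as zero-shot. If that matters, a small normalization helper along these lines could replace the block (the helper name and its default are hypothetical, not part of this commit):

def parse_is_5fewshot(config: dict) -> bool:
    """Return True only when num_fewshot encodes 5, whether as int or str."""
    raw = config.get("num_fewshot", 0)
    try:
        return int(raw) == 5
    except (TypeError, ValueError):
        return False  # missing or garbled values fall back to zero-shot

# parse_is_5fewshot({"num_fewshot": "5"})  -> True
# parse_is_5fewshot({"num_fewshot": 5})    -> True
# parse_is_5fewshot({})                    -> False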