Commit · 89c30b1 · 1 Parent(s): 6d848c3
regions

Files changed:
- app.py (+27 -1)
- src/leaderboard/read_evals.py (+28 -5)
- src/populate.py (+8 -4)
app.py CHANGED
@@ -89,6 +89,21 @@ def init_leaderboard(dataframe):
         interactive=False,
     )
 
+region_dropdown = gr.Dropdown(
+    choices=["All", "region_1", "region_2"],  # Add all available regions
+    label="Select Region",
+    value="All",
+    interactive=True,
+)
+
+# Initialize the leaderboard with the default region ("All")
+leaderboard_table = gr.Dataframe(
+    value=LEADERBOARD_DF,
+    headers=COLS,
+    datatype=[c.type for c in fields(AutoEvalColumn)],
+    row_count=5,
+)
+
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -97,7 +112,18 @@ with demo:
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 mSTEB Text Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
+            # leaderboard = init_leaderboard(LEADERBOARD_DF)
+            with gr.Row():
+                region_dropdown.render()  # Render the dropdown for region selection
+                leaderboard_table.render()  # Render the leaderboard table
+            # Update leaderboard dynamically based on region selection
+            region_dropdown.change(
+                lambda region: LEADERBOARD_DF if region == "All" else get_leaderboard_df(EVAL_RESULTS_PATH,
+                                                                                         EVAL_REQUESTS_PATH, COLS,
+                                                                                         BENCHMARK_COLS, region),
+                inputs=[region_dropdown],
+                outputs=[leaderboard_table],
+            )
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

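Note: the dropdown choices are hardcoded as "region_1" and "region_2", and the commit's own comment ("Add all available regions") marks them as placeholders. A minimal sketch of deriving the choices from the parsed results instead, assuming the regions dict this commit adds to EvalResult; the helper name and its call site are hypothetical, not part of the commit:

# Hypothetical helper (not in this commit): collect every region name seen
# across the parsed EvalResult objects and prepend the "All" default choice.
def get_region_choices(raw_data):
    region_names = set()
    for eval_result in raw_data:                        # EvalResult instances from get_raw_eval_results
        region_names.update(eval_result.regions or {})  # updating a set with a dict adds its keys
    return ["All"] + sorted(region_names)

# e.g.: region_dropdown = gr.Dropdown(choices=get_region_choices(raw_data), label="Select Region", value="All")
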
src/leaderboard/read_evals.py CHANGED
@@ -31,6 +31,7 @@ class EvalResult:
     num_params: int = 0
     date: str = ""  # submission date of request file
     still_on_hub: bool = False
+    regions: dict = None
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -39,6 +40,7 @@ class EvalResult:
         data = json.load(fp)
 
         config = data.get("config")
+        regions = data.get("regions", {})  # Parse regions from JSON
 
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -78,6 +80,21 @@ class EvalResult:
 
             mean_acc = np.mean(accs) * 100.0
             results[task.benchmark] = mean_acc
+
+        regions_processed_results = {}
+        for region, region_results in regions.items():
+            processed = {}
+            for task in Tasks:
+                task = task.value
+
+                # We average all scores of a given metric (not all metrics are present in all files)
+                accs = np.array([v.get(task.metric, None) for k, v in region_results.items() if task.benchmark == k])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+
+                mean_acc = np.mean(accs) * 100.0
+                processed[task.benchmark] = mean_acc
+            regions_processed_results[region] = processed
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -87,7 +104,8 @@ class EvalResult:
             precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
+            regions=regions_processed_results
         )
 
     def update_with_request_file(self, requests_path):
@@ -106,13 +124,14 @@ class EvalResult:
         except Exception:
             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
-    def to_dict(self):
+    def to_dict(self, region=None):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         # print(self.results)
+        results = self.results if region is None else self.regions.get(region, {})
         acc_values = [
-            self.results[task.value.benchmark]
+            results[task.value.benchmark]
             for task in Tasks
-            if task.value.metric == "acc" and task.value.benchmark in self.results
+            if task.value.metric == "acc" and task.value.benchmark in results
         ]
         # print(acc_values)
 
@@ -136,7 +155,7 @@ class EvalResult:
         }
 
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            data_dict[task.value.col_name] = results[task.value.benchmark]
 
         return data_dict
 
@@ -185,6 +204,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        print('testing this one')
+        print(eval_result)
         eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
@@ -201,5 +222,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             results.append(v)
         except KeyError:  # not all eval values present
             continue
+    print('results')
+    print(results)
 
     return results

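The parsing above assumes each result JSON carries a top-level "regions" key whose per-region entries mirror the {benchmark: {metric: value}} layout of data["results"]. A sketch of that expected shape; the benchmark name, region names, and scores are illustrative, only the "config", "results", and "regions" keys come from the code:

# Illustrative result file the new parsing can consume (values are made up).
example_result = {
    "config": {"model_dtype": "torch.float16", "model_sha": "abc123"},
    "results": {
        "task_a": {"acc": 0.61},                # averaged to 61.0 for the global table
    },
    "regions": {
        "region_1": {"task_a": {"acc": 0.58}},  # lands in regions_processed_results["region_1"]
        "region_2": {"task_a": {"acc": 0.64}},
    },
}

One caveat: to_dict(region) indexes results[task.value.benchmark] directly, so a region that is missing any benchmark raises KeyError, and the except KeyError guard in get_raw_eval_results appears to cover only the region-less completeness check, not the region-filtered calls made from get_leaderboard_df.
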
src/populate.py CHANGED
@@ -8,17 +8,21 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list, region=None) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
+    # this here if region is none gets main results. I have to pass region value here to get region based results
+    # and they should come.
+    all_data_json = [v.to_dict(region) for v in raw_data]
+    print('all_data_json', all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
+    print('df', df)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
-
+    print('df after sorting', df)
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
+    print('df after filtering', df)
     return df
 
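Usage sketch for the extended signature, using the same constants the app.py change handler already passes (paths and column lists as imported in the template's app.py):

# Global leaderboard: omitting region keeps the original behaviour.
leaderboard_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

# Region-filtered leaderboard, as the dropdown's change callback requests it.
region_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, "region_1")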