Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	keep old Average
Browse files
- app.py +1 -1
- src/display/utils.py +1 -0
- src/leaderboard/read_evals.py +7 -1
    	
        app.py
    CHANGED
    
    | @@ -76,7 +76,7 @@ def style_df(df: pd.DataFrame) -> Styler: | |
| 76 | 
             
                rounding = {'#Params (B)': "{:.1f}"}
         | 
| 77 | 
             
                for task in Tasks:
         | 
| 78 | 
             
                    rounding[task.value.col_name] = "{:.2f}"
         | 
| 79 | 
            -
                for column_name in ["Average ⬆️", "Avg g", "Avg mc"]:
         | 
| 80 | 
             
                    rounding[column_name] = "{:.2f}"
         | 
| 81 | 
             
                leaderboard_df_styled = leaderboard_df_styled.format(rounding)
         | 
| 82 | 
             
                return leaderboard_df_styled
         | 
|  | |
| 76 | 
             
                rounding = {'#Params (B)': "{:.1f}"}
         | 
| 77 | 
             
                for task in Tasks:
         | 
| 78 | 
             
                    rounding[task.value.col_name] = "{:.2f}"
         | 
| 79 | 
            +
                for column_name in ["Average ⬆️", "Avg g", "Avg mc", "Average old"]:
         | 
| 80 | 
             
                    rounding[column_name] = "{:.2f}"
         | 
| 81 | 
             
                leaderboard_df_styled = leaderboard_df_styled.format(rounding)
         | 
| 82 | 
             
                return leaderboard_df_styled
         | 
    	
        src/display/utils.py
    CHANGED
    
    | @@ -30,6 +30,7 @@ auto_eval_column_dict.append(["lang", ColumnContent, ColumnContent("Lang", "str" | |
| 30 | 
             
            auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
         | 
| 31 | 
             
            #Scores
         | 
| 32 | 
             
            auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
         | 
|  | |
| 33 | 
             
            auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
         | 
| 34 | 
             
            auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
         | 
| 35 | 
             
            for task in Tasks:
         | 
|  | |
| 30 | 
             
            auto_eval_column_dict.append(["n_shot", ColumnContent, ColumnContent("n_shot", "str", True)])
         | 
| 31 | 
             
            #Scores
         | 
| 32 | 
             
            auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
         | 
| 33 | 
            +
            auto_eval_column_dict.append(["average_old", ColumnContent, ColumnContent("Average old", "number", False)])
         | 
| 34 | 
             
            auto_eval_column_dict.append(["average_g", ColumnContent, ColumnContent("Avg g", "number", True)])
         | 
| 35 | 
             
            auto_eval_column_dict.append(["average_mc", ColumnContent, ColumnContent("Avg mc", "number", True)])
         | 
| 36 | 
             
            for task in Tasks:
         | 
    	
        src/leaderboard/read_evals.py
    CHANGED
    
    | @@ -157,10 +157,11 @@ class EvalResult: | |
| 157 | 
             
                    g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
         | 
| 158 | 
             
                    mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
         | 
| 159 | 
             
                    all_tasks = g_tasks + mc_tasks
         | 
|  | |
| 160 |  | 
| 161 | 
             
                    baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
         | 
| 162 |  | 
| 163 | 
            -
                     | 
| 164 | 
             
                    # average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
         | 
| 165 | 
             
                    # average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
         | 
| 166 | 
             
                    # print('XXXXXXXXXXXX')
         | 
| @@ -249,6 +250,11 @@ class EvalResult: | |
| 249 | 
             
                    except AttributeError:
         | 
| 250 | 
             
                        print(f"AttributeError revision")
         | 
| 251 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 252 | 
             
                    try:
         | 
| 253 | 
             
                        data_dict[AutoEvalColumn.average.name] = average
         | 
| 254 | 
             
                    except KeyError:
         | 
|  | |
| 157 | 
             
                    g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"]
         | 
| 158 | 
             
                    mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"]
         | 
| 159 | 
             
                    all_tasks = g_tasks + mc_tasks
         | 
| 160 | 
            +
                    all_tasks_wo_polqa = [task for task in all_tasks if 'polqa' not in task]
         | 
| 161 |  | 
| 162 | 
             
                    baselines = {task.value.benchmark: task.value.baseline*100 for task in Tasks}
         | 
| 163 |  | 
| 164 | 
            +
                    average_old = sum([v for task, v in self.results.items() if v is not None and task in all_tasks_wo_polqa]) / len(all_tasks_wo_polqa)
         | 
| 165 | 
             
                    # average_g = sum([v for task, v in self.results.items() if v is not None and task in g_tasks]) / len(g_tasks)
         | 
| 166 | 
             
                    # average_mc = sum([v for task, v in self.results.items() if v is not None and task in mc_tasks]) / len(mc_tasks)
         | 
| 167 | 
             
                    # print('XXXXXXXXXXXX')
         | 
|  | |
| 250 | 
             
                    except AttributeError:
         | 
| 251 | 
             
                        print(f"AttributeError revision")
         | 
| 252 |  | 
| 253 | 
            +
                    try:
         | 
| 254 | 
            +
                        data_dict[AutoEvalColumn.average_old.name] = average_old
         | 
| 255 | 
            +
                    except KeyError:
         | 
| 256 | 
            +
                        print(f"Could not find average_old")
         | 
| 257 | 
            +
             | 
| 258 | 
             
                    try:
         | 
| 259 | 
             
                        data_dict[AutoEvalColumn.average.name] = average
         | 
| 260 | 
             
                    except KeyError:
         | 
 
			
