edbeeching
		
	committed on
		
		
					Commit 
							
							·
						
						fcb01e3
	
1
								Parent(s):
							
							b2c063a
								
updates table to include revision
Browse files
    	
        app.py
    CHANGED
    
    | 
         @@ -46,8 +46,8 @@ def load_results(model, benchmark, metric): 
     | 
|
| 46 | 
         
             
                return mean_acc, data["config"]["model_args"]
         
     | 
| 47 | 
         | 
| 48 | 
         | 
| 49 | 
         
            -
            COLS = [" 
     | 
| 50 | 
         
            -
            TYPES = ["str", 
     | 
| 51 | 
         | 
| 52 | 
         
             
            EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
         
     | 
| 53 | 
         
             
            EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
         
     | 
| 
         @@ -59,7 +59,7 @@ def get_leaderboard(): 
     | 
|
| 59 | 
         
             
                all_data = get_eval_results_dicts()
         
     | 
| 60 | 
         
             
                dataframe = pd.DataFrame.from_records(all_data)
         
     | 
| 61 | 
         
             
                dataframe = dataframe.sort_values(by=['total ⬆️'], ascending=False)
         
     | 
| 62 | 
         
            -
                
         
     | 
| 63 | 
         
             
                dataframe = dataframe[COLS]
         
     | 
| 64 | 
         
             
                return dataframe
         
     | 
| 65 | 
         | 
| 
         | 
|
| 46 | 
         
             
                return mean_acc, data["config"]["model_args"]
         
     | 
| 47 | 
         | 
| 48 | 
         | 
| 49 | 
         
            +
            COLS = ["base_model", "revision", "8bit", "total ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthQA (0-shot) ⬆️"]
         
     | 
| 50 | 
         
            +
            TYPES = ["markdown","str", "bool", "number", "number", "number", "number", "number", ]
         
     | 
| 51 | 
         | 
| 52 | 
         
             
            EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
         
     | 
| 53 | 
         
             
            EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
         
     | 
| 
         | 
|
| 59 | 
         
             
                all_data = get_eval_results_dicts()
         
     | 
| 60 | 
         
             
                dataframe = pd.DataFrame.from_records(all_data)
         
     | 
| 61 | 
         
             
                dataframe = dataframe.sort_values(by=['total ⬆️'], ascending=False)
         
     | 
| 62 | 
         
            +
                print(dataframe)
         
     | 
| 63 | 
         
             
                dataframe = dataframe[COLS]
         
     | 
| 64 | 
         
             
                return dataframe
         
     | 
| 65 | 
         | 
    	
        utils.py
    CHANGED
    
    | 
         @@ -50,6 +50,7 @@ class EvalResult: 
     | 
|
| 50 | 
         
             
                eval_name : str
         
     | 
| 51 | 
         
             
                org : str
         
     | 
| 52 | 
         
             
                model : str
         
     | 
| 
         | 
|
| 53 | 
         
             
                is_8bit : bool
         
     | 
| 54 | 
         
             
                results : dict
         
     | 
| 55 | 
         | 
| 
         @@ -60,8 +61,11 @@ class EvalResult: 
     | 
|
| 60 | 
         
             
                    else:
         
     | 
| 61 | 
         
             
                        base_model =f"{self.model}"
         
     | 
| 62 | 
         
             
                    data_dict = {}
         
     | 
| 
         | 
|
| 63 | 
         
             
                    data_dict["eval_name"] = self.eval_name
         
     | 
| 
         | 
|
| 64 | 
         
             
                    data_dict["base_model"] = make_clickable_model(base_model)
         
     | 
| 
         | 
|
| 65 | 
         
             
                    data_dict["total ⬆️"] = round(sum([v for k,v in self.results.items()]),3)
         
     | 
| 66 | 
         
             
                    data_dict["# params"] = get_n_params(base_model)
         
     | 
| 67 | 
         | 
| 
         @@ -83,21 +87,22 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, dict]: 
     | 
|
| 83 | 
         | 
| 84 | 
         
             
                path_split = json_filepath.split("/")
         
     | 
| 85 | 
         
             
                org = None
         
     | 
| 86 | 
         
            -
                model = path_split[- 
     | 
| 87 | 
         
             
                is_8bit = path_split[-2] == "8bit"
         
     | 
| 88 | 
         
            -
                 
     | 
| 
         | 
|
| 89 | 
         
             
                    # handles gpt2 type models that don't have an org
         
     | 
| 90 | 
         
            -
                    result_key = f"{path_split[-3]}_{path_split[-2]}"
         
     | 
| 91 | 
         
            -
                else:
         
     | 
| 92 | 
         
             
                    result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
         
     | 
| 93 | 
         
            -
             
     | 
| 
         | 
|
| 
         | 
|
| 94 | 
         | 
| 95 | 
         
             
                eval_result = None
         
     | 
| 96 | 
         
             
                for benchmark, metric  in zip(BENCHMARKS, METRICS):
         
     | 
| 97 | 
         
             
                    if benchmark in json_filepath:
         
     | 
| 98 | 
         
             
                        accs = np.array([v[metric] for k, v in data["results"].items()])
         
     | 
| 99 | 
         
             
                        mean_acc = round(np.mean(accs),3)
         
     | 
| 100 | 
         
            -
                        eval_result = EvalResult(result_key, org, model, is_8bit, {benchmark:mean_acc})
         
     | 
| 101 | 
         | 
| 102 | 
         
             
                return result_key, eval_result
         
     | 
| 103 | 
         | 
| 
         | 
|
| 50 | 
         
             
                eval_name : str
         
     | 
| 51 | 
         
             
                org : str
         
     | 
| 52 | 
         
             
                model : str
         
     | 
| 53 | 
         
            +
                revision : str
         
     | 
| 54 | 
         
             
                is_8bit : bool
         
     | 
| 55 | 
         
             
                results : dict
         
     | 
| 56 | 
         | 
| 
         | 
|
| 61 | 
         
             
                    else:
         
     | 
| 62 | 
         
             
                        base_model =f"{self.model}"
         
     | 
| 63 | 
         
             
                    data_dict = {}
         
     | 
| 64 | 
         
            +
                    
         
     | 
| 65 | 
         
             
                    data_dict["eval_name"] = self.eval_name
         
     | 
| 66 | 
         
            +
                    data_dict["8bit"] = self.is_8bit
         
     | 
| 67 | 
         
             
                    data_dict["base_model"] = make_clickable_model(base_model)
         
     | 
| 68 | 
         
            +
                    data_dict["revision"] = self.revision
         
     | 
| 69 | 
         
             
                    data_dict["total ⬆️"] = round(sum([v for k,v in self.results.items()]),3)
         
     | 
| 70 | 
         
             
                    data_dict["# params"] = get_n_params(base_model)
         
     | 
| 71 | 
         | 
| 
         | 
|
| 87 | 
         | 
| 88 | 
         
             
                path_split = json_filepath.split("/")
         
     | 
| 89 | 
         
             
                org = None
         
     | 
| 90 | 
         
            +
                model = path_split[-4]
         
     | 
| 91 | 
         
             
                is_8bit = path_split[-2] == "8bit"
         
     | 
| 92 | 
         
            +
                revision = path_split[-3]
         
     | 
| 93 | 
         
            +
                if len(path_split)== 6:
         
     | 
| 94 | 
         
             
                    # handles gpt2 type models that don't have an org
         
     | 
| 
         | 
|
| 
         | 
|
| 95 | 
         
             
                    result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
         
     | 
| 96 | 
         
            +
                else:
         
     | 
| 97 | 
         
            +
                    result_key = f"{path_split[-5]}_{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
         
     | 
| 98 | 
         
            +
                    org = path_split[-5]
         
     | 
| 99 | 
         | 
| 100 | 
         
             
                eval_result = None
         
     | 
| 101 | 
         
             
                for benchmark, metric  in zip(BENCHMARKS, METRICS):
         
     | 
| 102 | 
         
             
                    if benchmark in json_filepath:
         
     | 
| 103 | 
         
             
                        accs = np.array([v[metric] for k, v in data["results"].items()])
         
     | 
| 104 | 
         
             
                        mean_acc = round(np.mean(accs),3)
         
     | 
| 105 | 
         
            +
                        eval_result = EvalResult(result_key, org, model, revision, is_8bit, {benchmark:mean_acc})
         
     | 
| 106 | 
         | 
| 107 | 
         
             
                return result_key, eval_result
         
     | 
| 108 | 
         |