edbeeching committed · Commit b2c063a · Parent(s): 59c748f

adds revision option
Files changed:
- .gitignore (+2 -1)
- app.py (+23 -52)
- utils.py (+24 -16)
    	
.gitignore CHANGED
@@ -1,2 +1,3 @@
 evals/
-venv/
+venv/
+__pycache__/
    	
app.py CHANGED
@@ -8,7 +8,7 @@ import json
 from apscheduler.schedulers.background import BackgroundScheduler
 import pandas as pd
 import datetime
-from utils import get_eval_results_dicts, make_clickable_model
+from utils import get_eval_results_dicts, make_clickable_model, get_n_params
 
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
@@ -45,53 +45,16 @@ def load_results(model, benchmark, metric):
     mean_acc = np.mean(accs)
     return mean_acc, data["config"]["model_args"]
 
-def get_n_params(base_model):
-
-    # config = AutoConfig.from_pretrained(model_name)
-
-    # # Retrieve the number of parameters from the configuration
-    # try:
-    #     num_params = config.n_parameters
-    # except AttributeError:
-    #     print(f"Error: The number of parameters is not available in the config for the model '{model_name}'.")
-    #     return None
-
-    # return num_params
-
-    now = datetime.datetime.now()
-    time_string = now.strftime("%Y-%m-%d %H:%M:%S")
-    return time_string
 
-COLS = ["eval_name",
-TYPES = ["str",
+COLS = ["eval_name",  "total ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthQA (0-shot) ⬆️", "base_model"]
+TYPES = ["str",  "number", "number", "number", "number", "number","markdown", ]
 
-EVAL_COLS = ["model",
-EVAL_TYPES = ["markdown","str",
+EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
+EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
 def get_leaderboard():
     if repo:
         print("pulling changes")
         repo.git_pull()
-    # entries = [entry for entry in os.listdir("evals") if not (entry.startswith('.') or entry=="eval_requests" or entry=="evals")]
-    # model_directories = [entry for entry in entries if os.path.isdir(os.path.join("evals", entry))]
-    # all_data = []
-    # for model in model_directories:
-    #     model_data = {"base_model": None, "eval_name": model}
-
-    #     for benchmark, metric in zip(BENCHMARKS, METRICS):
-    #         value, base_model = load_results(model, benchmark, metric)
-    #         model_data[BENCH_TO_NAME[benchmark]] = round(value,3)
-    #         if base_model is not None: # in case the last benchmark failed
-    #             model_data["base_model"] = base_model
-
-    #     model_data["total ⬆️"] = round(sum(model_data[benchmark] for benchmark in BENCH_TO_NAME.values()),3)
-
-    #     if model_data["base_model"] is not None:
-    #         model_data["base_model"] = make_clickable_model(model_data["base_model"])
-
-    #     model_data["# params"] = get_n_params(model_data["base_model"])
-
-    #     if model_data["base_model"] is not None:
-    #         all_data.append(model_data)
 
     all_data = get_eval_results_dicts()
     dataframe = pd.DataFrame.from_records(all_data)
@@ -116,6 +79,7 @@ def get_eval_table():
 
             data["# params"] = get_n_params(data["model"])
             data["model"] = make_clickable_model(data["model"])
+            data["revision"] = data.get("revision", "main")
 
 
             all_evals.append(data)
@@ -127,7 +91,7 @@ def get_eval_table():
                 with open(file_path) as fp:
                     data = json.load(fp)
 
-                data["# params"] = get_n_params(data["model"])
+                #data["# params"] = get_n_params(data["model"])
                 data["model"] = make_clickable_model(data["model"])
                 all_evals.append(data)
 
@@ -139,9 +103,9 @@ def get_eval_table():
 leaderboard = get_leaderboard()
 eval_queue = get_eval_table()
 
-def is_model_on_hub(model_name) -> bool:
+def is_model_on_hub(model_name, revision) -> bool:
     try:
-        config = AutoConfig.from_pretrained(model_name)
+        config = AutoConfig.from_pretrained(model_name, revision=revision)
         return True
 
     except Exception as e:
@@ -151,15 +115,19 @@ def is_model_on_hub(model_name) -> bool:
 
 
 
-def add_new_eval(model:str, private:bool, is_8_bit_eval: bool, is_delta_weight:bool):
+def add_new_eval(model:str, revision:str, private:bool, is_8_bit_eval: bool, is_delta_weight:bool):
     # check the model actually exists before adding the eval
-    if 
+    if revision == "":
+        revision = "main"
+    print("revision", revision)
+    if not is_model_on_hub(model, revision):
         print(model, "not found on hub")
         return
     print("adding new eval")
 
     eval_entry = {
         "model" : model,
+        "revision" : revision,
         "private" : private,
         "8bit_eval" : is_8_bit_eval,
         "is_delta_weight" : is_delta_weight,
@@ -227,14 +195,17 @@ with block:
         # with gr.Row():
         #     gr.Markdown(f"""# Submit a new model for evaluation""")
         with gr.Row():
-
-
-
-
+            with gr.Column():
+                model_name_textbox = gr.Textbox(label="Model name")
+                revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
+            with gr.Column():
+                is_8bit_toggle = gr.Checkbox(False, label="8 bit eval")
+                private = gr.Checkbox(False, label="Private")
+                is_delta_weight = gr.Checkbox(False, label="Delta weights")
 
         with gr.Row():
             submit_button = gr.Button("Submit Eval")
-            submit_button.click(add_new_eval, [model_name_textbox, is_8bit_toggle, private, is_delta_weight])
+            submit_button.click(add_new_eval, [model_name_textbox, revision_name_textbox, is_8bit_toggle, private, is_delta_weight])
 
 
 
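Taken together, the app.py changes thread a revision string from a new textbox through add_new_eval and into the hub existence check, falling back to "main" when the field is left empty. Below is a minimal, self-contained sketch of that flow, not the Space itself: the widget and function names mirror the diff, while the status output, the queue handling, and the exact wiring are illustrative placeholders.

    import gradio as gr
    from transformers import AutoConfig

    def is_model_on_hub(model_name: str, revision: str) -> bool:
        # Loading only the config is enough to confirm the repo and revision exist.
        try:
            AutoConfig.from_pretrained(model_name, revision=revision)
            return True
        except Exception:
            return False

    def add_new_eval(model: str, revision: str, private: bool, is_8_bit_eval: bool, is_delta_weight: bool):
        # An empty textbox falls back to the default branch, as in the commit.
        revision = revision or "main"
        if not is_model_on_hub(model, revision):
            return f"{model} (revision {revision}) not found on hub"
        eval_entry = {
            "model": model,
            "revision": revision,
            "private": private,
            "8bit_eval": is_8_bit_eval,
            "is_delta_weight": is_delta_weight,
        }
        return f"queued: {eval_entry}"

    with gr.Blocks() as demo:
        with gr.Row():
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model name")
                revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
            with gr.Column():
                is_8bit_toggle = gr.Checkbox(False, label="8 bit eval")
                private = gr.Checkbox(False, label="Private")
                is_delta_weight = gr.Checkbox(False, label="Delta weights")
        with gr.Row():
            submit_button = gr.Button("Submit Eval")
            status = gr.Textbox(label="status")  # illustrative output, not in the diff
            submit_button.click(
                add_new_eval,
                [model_name_textbox, revision_name_textbox, private, is_8bit_toggle, is_delta_weight],
                status,
            )

    demo.launch()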
    	
utils.py CHANGED
@@ -3,7 +3,7 @@ import shutil
 import numpy as np
 import gradio as gr
 from huggingface_hub import Repository, HfApi
-from transformers import AutoConfig
+from transformers import AutoConfig, AutoModel
 import json
 from apscheduler.schedulers.background import BackgroundScheduler
 import pandas as pd
@@ -15,18 +15,6 @@ from typing import List, Tuple, Dict
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
 LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
 
-# repo=None
-# if H4_TOKEN:
-#     print("pulling repo")
-#     # try:
-#     #     shutil.rmtree("./evals/")
-#     # except:
-#     #     pass
-
-#     repo = Repository(
-#         local_dir="./evals/", clone_from=LMEH_REPO, use_auth_token=H4_TOKEN, repo_type="dataset"
-#     )
-#     repo.git_pull()
 METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
 BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
 BENCH_TO_NAME = {
@@ -42,6 +30,21 @@ def make_clickable_model(model_name):
     link = "https://huggingface.co/" + model_name
     return f'<a target="_blank" href="{link}" style="color: blue; text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
+def get_n_params(base_model):
+    return "unknown"
+
+    # WARNING: High memory usage
+
+    # Retrieve the number of parameters from the configuration
+    try:
+        config = AutoConfig.from_pretrained(base_model, use_auth_token=True, low_cpu_mem_usage=True)
+        n_params = AutoModel.from_config(config).num_parameters()
+    except Exception as e:
+        print(f"Error:{e} The number of parameters is not available in the config for the model '{base_model}'.")
+        return "unknown"
+
+    return str(n_params)
+
 @dataclass
 class EvalResult:
     eval_name : str
@@ -50,12 +53,17 @@ class EvalResult:
     is_8bit : bool
     results : dict
 
-    def to_dict(self):
+    def to_dict(self):
+
+        if self.org is not None:
+            base_model =f"{self.org}/{self.model}"
+        else:
+            base_model =f"{self.model}"
         data_dict = {}
         data_dict["eval_name"] = self.eval_name
-        data_dict["base_model"] = make_clickable_model(
+        data_dict["base_model"] = make_clickable_model(base_model)
         data_dict["total ⬆️"] = round(sum([v for k,v in self.results.items()]),3)
-        data_dict["# params"] = 
+        data_dict["# params"] = get_n_params(base_model)
 
         for benchmark in BENCHMARKS:
             if not benchmark in self.results.keys():
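The committed get_n_params short-circuits to "unknown": the dead code below its early return would materialise the full model just to count parameters, which is what the "High memory usage" warning refers to. One way to obtain the count without allocating real weights is to build the architecture on empty (meta) tensors with accelerate's init_empty_weights. The sketch below shows that alternative; it is not part of this commit, and accelerate is assumed to be installed.

    from accelerate import init_empty_weights
    from transformers import AutoConfig, AutoModel

    def get_n_params(base_model: str) -> str:
        """Return the parameter count as a string, or "unknown" on failure."""
        try:
            config = AutoConfig.from_pretrained(base_model)
            # Instantiate the architecture with meta tensors: shapes are known,
            # but no memory is allocated for the weights themselves.
            with init_empty_weights():
                model = AutoModel.from_config(config)
            return str(sum(p.numel() for p in model.parameters()))
        except Exception as e:
            print(f"Could not determine the number of parameters for '{base_model}': {e}")
            return "unknown"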