	Fix elo ratings model links
Files changed:
- app.py +1 -1
- elo_utils.py +59 -11
    	
app.py CHANGED

@@ -205,7 +205,7 @@ def get_leaderboard_df():
 def get_evaluation_queue_df():
     if repo:
         print("Pulling changes for the evaluation queue.")
-        …
+        repo.git_pull()
 
     entries = [
         entry
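For context, this hunk completes a sync-before-read pattern: the Space keeps a local git clone of the evaluation-queue dataset and pulls it before building the queue DataFrame. A minimal sketch of that flow, assuming `repo` is a `huggingface_hub.Repository` (the repo id and local directory below are hypothetical, not taken from this commit):

from huggingface_hub import Repository

# Hypothetical id and path; the real ones are configured elsewhere in app.py.
repo = Repository(local_dir="evals/", clone_from="my-org/eval-queue")

# Refresh the local clone so the queue is read from up-to-date files --
# exactly the step the hunk above adds before iterating over entries.
repo.git_pull()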
    	
elo_utils.py CHANGED

@@ -8,10 +8,37 @@ from datasets import load_dataset
 
 from content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
 from utils import make_clickable_model
-from visualizations import (
-    …
-    …
-    …
+from visualizations import (
+    get_bootstrap_result,
+    switch_model_a_b,
+    visualize_battle_count,
+    visualize_bootstrap_scores,
+    visualize_pairwise_win_fraction,
+    visualize_rating_count,
+)
+
+
+KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
+VICUNA_LINK = "https://huggingface.co/HuggingFaceH4/stable-vicuna-13b-2904"
+OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
+DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
+MODEL_PAGE = "https://huggingface.co/models"
+
+
+def make_clickable_model_elo(model_name):
+    link = ""
+    if model_name == "dolly-12b":
+        link = DOLLY_LINK
+    elif model_name == "vicuna-13b":
+        link = VICUNA_LINK
+    elif model_name == "koala-13b":
+        link = KOALA_LINK
+    elif model_name == "oasst-12b":
+        link = OASST_LINK
+    else:
+        link = MODEL_PAGE
+
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
 @dataclass
@@ -26,7 +53,7 @@ class EloEvalResult:
     def to_dict(self):
         base_model = f"{self.model}"
         data_dict = {}
-        data_dict["Model"] = make_clickable_model(base_model)
+        data_dict["Model"] = make_clickable_model_elo(base_model)
         data_dict["GPT-4 (all)"] = self.gpt_4_all
         data_dict["Human (all)"] = self.human_all
         data_dict["Human (instruct)"] = self.human_instruct
@@ -61,7 +88,13 @@ def create_eval_df(df, tie_allowed):
         }
 
         if tie_allowed:
-            response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie"
+            response["win"] = (
+                "model_a"
+                if response["rating"] < 4
+                else "model_b"
+                if response["rating"] > 5
+                else "tie"
+            )
         else:
             response["win"] = "model_a" if response["rating"] < 5 else "model_b"
 
@@ -84,7 +117,13 @@ def create_eval_df_for_gpt(df, tie_allowed):
         }
 
        if tie_allowed:
-            response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie"
+            response["win"] = (
+                "model_a"
+                if response["rating"] < 4
+                else "model_b"
+                if response["rating"] > 5
+                else "tie"
+            )
         else:
             response["win"] = "model_a" if response["rating"] < 5 else "model_b"
 
@@ -124,13 +163,20 @@ def get_elo_results(df_instruct, df_code_instruct, tie_allowed):
     df_all = pd.concat([df_instruct, df_code_instruct])
 
     df_gpt_4 = load_dataset(
-        "gpt_4_evals/data/", …
+        "gpt_4_evals/data/",
+        split="train",
+        revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846",
     ).to_pandas()
 
     dfs = [df_instruct, df_code_instruct, df_all]
-    elo_ratings = [convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed)) for df in dfs]
+    elo_ratings = [
+        convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed))
+        for df in dfs
+    ]
 
-    gpt_4_elo_ratings = convert_rating_from_float_to_int(create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed))
+    gpt_4_elo_ratings = convert_rating_from_float_to_int(
+        create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed)
+    )
     elo_ratings.append(gpt_4_elo_ratings)
 
     results = [
@@ -166,7 +212,9 @@ def get_elo_plots(df_instruct, df_code_instruct, tie_allowed):
 
     BOOTSTRAP_ROUNDS = 1000
     if "bootstrap_elo_lu" not in globals():
-        bootstrap_elo_lu = get_bootstrap_result(game_switch, compute_elo, BOOTSTRAP_ROUNDS)
+        bootstrap_elo_lu = get_bootstrap_result(
+            game_switch, compute_elo, BOOTSTRAP_ROUNDS
+        )
 
     plot_3 = visualize_bootstrap_scores(bootstrap_elo_lu, PLOT_3_TITLE)
 
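Two pieces of the hunks above are worth spelling out: the leaderboard's Model column now links through make_clickable_model_elo, and the tie-aware conditional in create_eval_df / create_eval_df_for_gpt buckets ratings into wins and ties. A quick sketch of that rating-to-winner mapping (win_label is a hypothetical stand-in for the inline conditional, not a function in elo_utils.py):

# Hypothetical helper mirroring the tie-aware conditional shown in the diff:
# ratings below 4 -> model_a wins, 4-5 -> tie, above 5 -> model_b wins.
def win_label(rating, tie_allowed=True):
    if tie_allowed:
        return "model_a" if rating < 4 else "model_b" if rating > 5 else "tie"
    # Without ties, 5 is the single cutoff, as in the unchanged else branch.
    return "model_a" if rating < 5 else "model_b"

assert win_label(3) == "model_a"
assert win_label(4) == win_label(5) == "tie"
assert win_label(7) == "model_b"

For the links, make_clickable_model_elo("koala-13b") returns an anchor pointing at KOALA_LINK, while any model name outside the four hard-coded ones falls back to the generic MODEL_PAGE (https://huggingface.co/models).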
 
			
