Update app.py
app.py CHANGED
```diff
@@ -24,7 +24,7 @@ def make_leaderboard_md(elo_results):
 - [MT-Bench](https://arxiv.org/abs/2306.05685) - a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
 - [MMLU](https://arxiv.org/abs/2009.03300) (5-shot) - a test to measure a model's multitask accuracy on 57 tasks.
 
-💻 We use [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to compute MT-bench scores (single-answer grading on a scale of 10)
+💻 We use [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to compute MT-bench scores (single-answer grading on a scale of 10). The Arena Elo ratings are computed by this [notebook]({notebook_url}). The MMLU scores are computed by [InstructEval](https://github.com/declare-lab/instruct-eval) and [Chain-of-Thought Hub](https://github.com/FranxYao/chain-of-thought-hub). Higher values are better for all benchmarks. Empty cells mean not available.
 """
     return leaderboard_md
 
```
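The `{notebook_url}` placeholder in the added line only resolves if `leaderboard_md` is built with string interpolation (an f-string or an equivalent `.format()` call). Below is a minimal sketch of that shape, not the actual file: the function name and `return leaderboard_md` come from the hunk context, while the f-string assumption, the URL value, the abbreviated body, and the unused `elo_results` argument are illustrative assumptions.

```python
# Minimal sketch of how the leaderboard markdown could be assembled.
# Only the function name, the interpolation idea, and `return leaderboard_md`
# are taken from the diff; everything else here is a placeholder.
notebook_url = "https://example.com/elo-notebook"  # hypothetical, not the real link

def make_leaderboard_md(elo_results):
    # The real function presumably also consumes `elo_results`; it is
    # ignored here to keep the sketch short.
    leaderboard_md = f"""
💻 We use [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) to compute MT-bench scores (single-answer grading on a scale of 10). The Arena Elo ratings are computed by this [notebook]({notebook_url}).
"""
    return leaderboard_md

print(make_leaderboard_md(elo_results=None))
```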
```diff
@@ -173,7 +173,6 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file):
             "Model",
             "Arena Elo rating",
             "MT-bench (score)",
-            "MT-bench (win rate %)",
             "MMLU",
             "License",
         ]
@@ -191,7 +190,7 @@ def build_leaderboard_tab(elo_results_file, leaderboard_table_file):
 
         gr.Dataframe(
             headers=headers,
-            datatype=["markdown", "number", "number", "number", "number", "str"],
+            datatype=["markdown", "number", "number", "number", "str"],
             value=values,
             elem_id="leaderboard_dataframe",
         )
```
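These two hunks are a matched pair: dropping "MT-bench (win rate %)" from `headers` leaves five columns, so `datatype` also shrinks to five entries, one per column in the same order (the pre-change line, truncated in this view, presumably carried a sixth entry for the win-rate column). The sketch below shows the resulting `gr.Dataframe` call in a self-contained form; the placeholder row in `values` and the `demo` wrapper are purely illustrative, since the real app builds `values` from `leaderboard_table_file` and `elo_results_file`.

```python
import gradio as gr

# Five headers -> five datatype entries, in the same column order.
headers = ["Model", "Arena Elo rating", "MT-bench (score)", "MMLU", "License"]

# Purely illustrative placeholder row; the real app derives `values` from
# its data files rather than hard-coding anything.
values = [["**example-model**", 1000, 5.0, 50.0, "N/A"]]

with gr.Blocks() as demo:
    gr.Dataframe(
        headers=headers,
        # "markdown" lets the Model cell render links and bold text; the
        # three scores are numbers; License stays a plain string.
        datatype=["markdown", "number", "number", "number", "str"],
        value=values,
        elem_id="leaderboard_dataframe",
    )

if __name__ == "__main__":
    demo.launch()
```

Editing `headers` and `datatype` together, as this commit does, keeps column labels and cell formatting from silently drifting out of sync.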