	implements search bar
app.py CHANGED
@@ -81,16 +81,9 @@ COLS = [
     "HellaSwag (10-shot) ⬆️",
     "MMLU (5-shot) ⬆️",
     "TruthfulQA (0-shot) ⬆️",
+    "model_name_for_query",  # dummy column to implement search bar (hidden by custom CSS)
 ]
-TYPES = [
-    "markdown",
-    "str",
-    "number",
-    "number",
-    "number",
-    "number",
-    "number",
-]
+TYPES = ["markdown", "str", "number", "number", "number", "number", "number", "str"]
 
 if not IS_PUBLIC:
     COLS.insert(2, "8bit")
@@ -115,7 +108,7 @@ def has_nan_values(df, columns):
     return df[columns].isna().any(axis=1)
 
 
-def get_leaderboard():
+def get_leaderboard_df():
     if repo:
         print("Pulling evaluation results for the leaderboard.")
         repo.git_pull()
@@ -132,6 +125,7 @@ def get_leaderboard():
             "HellaSwag (10-shot) ⬆️": 95.3,
             "MMLU (5-shot) ⬆️": 86.4,
             "TruthfulQA (0-shot) ⬆️": 59.0,
+            "model_name_for_query": "GPT-4",
         }
         all_data.append(gpt4_values)
         gpt35_values = {
@@ -143,6 +137,7 @@ def get_leaderboard():
             "HellaSwag (10-shot) ⬆️": 85.5,
             "MMLU (5-shot) ⬆️": 70.0,
             "TruthfulQA (0-shot) ⬆️": 47.0,
+            "model_name_for_query": "GPT-3.5",
         }
         all_data.append(gpt35_values)
 
@@ -155,6 +150,7 @@ def get_leaderboard():
         "HellaSwag (10-shot) ⬆️": 25.0,
         "MMLU (5-shot) ⬆️": 25.0,
         "TruthfulQA (0-shot) ⬆️": 25.0,
+        "model_name_for_query": "baseline",
     }
 
     all_data.append(base_line)
@@ -168,7 +164,7 @@ def get_leaderboard():
     return df
 
 
-def get_eval_table():
+def get_evaluation_queue_df():
     if repo:
         print("Pulling changes for the evaluation queue.")
         repo.git_pull()
@@ -216,8 +212,13 @@ def get_eval_table():
     return df_finished[EVAL_COLS], df_running[EVAL_COLS], df_pending[EVAL_COLS]
 
 
-leaderboard = get_leaderboard()
-finished_eval_queue, running_eval_queue, pending_eval_queue = get_eval_table()
+original_df = get_leaderboard_df()
+leaderboard_df = original_df.copy()
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df()
 
 
 def is_model_on_hub(model_name, revision) -> bool:
@@ -294,9 +295,18 @@ def add_new_eval(
 
 
 def refresh():
-    leaderboard = get_leaderboard()
-    finished_eval_queue, running_eval_queue, pending_eval_queue = get_eval_table()
-    return leaderboard, finished_eval_queue, running_eval_queue, pending_eval_queue
+    leaderboard_df = get_leaderboard_df()
+    (
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    ) = get_evaluation_queue_df()
+    return (
+        leaderboard_df,
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    )
 
 
 custom_css = """
@@ -324,8 +334,20 @@ custom_css = """
     margin: 6px;
     transform: scale(1.3);
 }
+
+/* Hides the final column */
+table td:last-child,
+table th:last-child {
+    display: none;
+}
 """
 
+
+def search_table(df, query):
+    filtered_df = df[df["model_name_for_query"].str.contains(query, case=False)]
+    return filtered_df
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -343,22 +365,35 @@ with demo:
         with gr.Accordion("✨ CHANGELOG", open=False):
             changelog = gr.Markdown(CHANGELOG_TEXT, elem_id="changelog-text")
 
+    search_bar = gr.Textbox(label="Search bar")
+
     leaderboard_table = gr.components.Dataframe(
-        value=leaderboard, headers=COLS, datatype=TYPES, max_rows=5
+        value=leaderboard_df, headers=COLS, datatype=TYPES, max_rows=5
+    )
+
+    # Dummy leaderboard for handling the case when the user uses backspace key
+    hidden_leaderboard_table_for_search = gr.components.Dataframe(
+        value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
+    )
+
+    search_bar.change(
+        search_table,
+        [hidden_leaderboard_table_for_search, search_bar],
+        leaderboard_table,
     )
 
     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
     with gr.Accordion("✅ Finished Evaluations", open=False):
         finished_eval_table = gr.components.Dataframe(
-            value=finished_eval_queue,
+            value=finished_eval_queue_df,
             headers=EVAL_COLS,
             datatype=EVAL_TYPES,
             max_rows=5,
         )
     with gr.Accordion("🔄 Running Evaluation Queue", open=False):
         running_eval_table = gr.components.Dataframe(
-            value=running_eval_queue,
+            value=running_eval_queue_df,
             headers=EVAL_COLS,
             datatype=EVAL_TYPES,
             max_rows=5,
@@ -366,7 +401,7 @@ with demo:
 
     with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
         pending_eval_table = gr.components.Dataframe(
-            value=pending_eval_queue,
+            value=pending_eval_queue_df,
             headers=EVAL_COLS,
             datatype=EVAL_TYPES,
             max_rows=5,
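
Taken together, the app.py changes implement the search in three pieces: a plain-text "model_name_for_query" column is appended to the leaderboard data, the custom CSS hides that final column from the rendered table, and search_bar.change() filters a hidden, unfiltered copy of the DataFrame into the visible one. The sketch below shows the same pattern as a self-contained script; it assumes a Gradio 3.x-era API and invents its sample rows, so treat it as an illustration rather than the Space's actual code.

import gradio as gr
import pandas as pd

# Invented sample data; the real leaderboard builds this from eval results.
SAMPLE_DF = pd.DataFrame(
    {
        "Model": ["<a href='#'>llama</a>", "<a href='#'>falcon</a>"],
        "Average ⬆️": [62.4, 58.1],
        "model_name_for_query": ["llama", "falcon"],  # plain text, hidden by CSS
    }
)

def search_table(df, query):
    # Same filter as the commit: case-insensitive substring match on the dummy column.
    return df[df["model_name_for_query"].str.contains(query, case=False)]

# Hide the last (dummy) column, as the commit's custom_css does.
css = "table td:last-child, table th:last-child { display: none; }"

with gr.Blocks(css=css) as demo:
    search_bar = gr.Textbox(label="Search bar")
    leaderboard_table = gr.components.Dataframe(value=SAMPLE_DF)
    # Filtering always starts from this hidden, full copy, so deleting
    # characters (backspace) widens the results again instead of narrowing
    # an already-filtered table further.
    hidden_table = gr.components.Dataframe(value=SAMPLE_DF, visible=False)
    search_bar.change(search_table, [hidden_table, search_bar], leaderboard_table)

if __name__ == "__main__":
    demo.launch()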
    	
utils.py CHANGED
@@ -71,6 +71,8 @@ class EvalResult:
         data_dict["eval_name"] = self.eval_name
         data_dict["8bit"] = self.is_8bit
         data_dict["Model"] = make_clickable_model(base_model)
+        # dummy column to implement search bar (hidden by custom CSS)
+        data_dict["model_name_for_query"] = base_model
         data_dict["Revision"] = self.revision
         data_dict["Average ⬆️"] = round(
             sum([v for k, v in self.results.items()]) / 4.0, 1
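
The utils.py hunk shows why a separate query column is needed at all: the visible "Model" cell holds the output of make_clickable_model(base_model), i.e. link markup rather than a bare name, so matching the user's query against it would also hit URL and HTML fragments. The snippet below illustrates the distinction with a hypothetical stand-in for make_clickable_model (its real definition lives elsewhere in utils.py and may differ):

# Hypothetical stand-in for make_clickable_model, for illustration only.
def make_clickable_model(model_name: str) -> str:
    link = f"https://huggingface.co/{model_name}"
    return f'<a target="_blank" href="{link}">{model_name}</a>'

base_model = "org/model"
data_dict = {
    "Model": make_clickable_model(base_model),  # rendered as a link in the table
    "model_name_for_query": base_model,         # bare name, searched but hidden
}
# A query like "huggingface" would wrongly match every "Model" cell's URL;
# matching against the bare name keeps the search on model names only.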