Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	rename the github link
Browse files- ZeroEval-main/result_dirs/zebra-grid.summary.json +44 -0
 - _about_us.md +1 -1
 - _header.md +1 -1
 - app.py +2 -2
 - constants.py +1 -1
 - data_utils.py +1 -1
 - update_data.sh +3 -3
 
    	
        ZeroEval-main/result_dirs/zebra-grid.summary.json
    CHANGED
    
    | 
         @@ -175,6 +175,17 @@ 
     | 
|
| 175 | 
         
             
                "Total Puzzles": 1000,
         
     | 
| 176 | 
         
             
                "Reason Lens": "855.72"
         
     | 
| 177 | 
         
             
              },
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 178 | 
         
             
              {
         
     | 
| 179 | 
         
             
                "Model": "gpt-4-turbo-2024-04-09",
         
     | 
| 180 | 
         
             
                "Mode": "sampling",
         
     | 
| 
         @@ -186,6 +197,17 @@ 
     | 
|
| 186 | 
         
             
                "Total Puzzles": 1000,
         
     | 
| 187 | 
         
             
                "Reason Lens": "1165.90"
         
     | 
| 188 | 
         
             
              },
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 189 | 
         
             
              {
         
     | 
| 190 | 
         
             
                "Model": "gemini-1.5-pro-exp-0801",
         
     | 
| 191 | 
         
             
                "Mode": "greedy",
         
     | 
| 
         @@ -472,6 +494,17 @@ 
     | 
|
| 472 | 
         
             
                "Total Puzzles": 1000,
         
     | 
| 473 | 
         
             
                "Reason Lens": "849.84"
         
     | 
| 474 | 
         
             
              },
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 475 | 
         
             
              {
         
     | 
| 476 | 
         
             
                "Model": "Meta-Llama-3-8B-Instruct",
         
     | 
| 477 | 
         
             
                "Mode": "greedy",
         
     | 
| 
         @@ -604,6 +637,17 @@ 
     | 
|
| 604 | 
         
             
                "Total Puzzles": 1000,
         
     | 
| 605 | 
         
             
                "Reason Lens": "718.43"
         
     | 
| 606 | 
         
             
              },
         
     | 
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 
         | 
|
| 607 | 
         
             
              {
         
     | 
| 608 | 
         
             
                "Model": "gemma-2-2b-it",
         
     | 
| 609 | 
         
             
                "Mode": "greedy",
         
     | 
| 
         | 
|
| 175 | 
         
             
                "Total Puzzles": 1000,
         
     | 
| 176 | 
         
             
                "Reason Lens": "855.72"
         
     | 
| 177 | 
         
             
              },
         
     | 
| 178 | 
         
            +
              {
         
     | 
| 179 | 
         
            +
                "Model": "Qwen2.5-72B-Instruct",
         
     | 
| 180 | 
         
            +
                "Mode": "greedy",
         
     | 
| 181 | 
         
            +
                "Puzzle Acc": "26.60",
         
     | 
| 182 | 
         
            +
                "Cell Acc": "40.92",
         
     | 
| 183 | 
         
            +
                "No answer": "11.90",
         
     | 
| 184 | 
         
            +
                "Easy Puzzle Acc": "76.43",
         
     | 
| 185 | 
         
            +
                "Hard Puzzle Acc": "7.22",
         
     | 
| 186 | 
         
            +
                "Total Puzzles": 1000,
         
     | 
| 187 | 
         
            +
                "Reason Lens": "1795.90"
         
     | 
| 188 | 
         
            +
              },
         
     | 
| 189 | 
         
             
              {
         
     | 
| 190 | 
         
             
                "Model": "gpt-4-turbo-2024-04-09",
         
     | 
| 191 | 
         
             
                "Mode": "sampling",
         
     | 
| 
         | 
|
| 197 | 
         
             
                "Total Puzzles": 1000,
         
     | 
| 198 | 
         
             
                "Reason Lens": "1165.90"
         
     | 
| 199 | 
         
             
              },
         
     | 
| 200 | 
         
            +
              {
         
     | 
| 201 | 
         
            +
                "Model": "Qwen2.5-32B-Instruct",
         
     | 
| 202 | 
         
            +
                "Mode": "greedy",
         
     | 
| 203 | 
         
            +
                "Puzzle Acc": "26.10",
         
     | 
| 204 | 
         
            +
                "Cell Acc": "43.39",
         
     | 
| 205 | 
         
            +
                "No answer": "6.30",
         
     | 
| 206 | 
         
            +
                "Easy Puzzle Acc": "77.50",
         
     | 
| 207 | 
         
            +
                "Hard Puzzle Acc": "6.11",
         
     | 
| 208 | 
         
            +
                "Total Puzzles": 1000,
         
     | 
| 209 | 
         
            +
                "Reason Lens": "1333.07"
         
     | 
| 210 | 
         
            +
              },
         
     | 
| 211 | 
         
             
              {
         
     | 
| 212 | 
         
             
                "Model": "gemini-1.5-pro-exp-0801",
         
     | 
| 213 | 
         
             
                "Mode": "greedy",
         
     | 
| 
         | 
|
| 494 | 
         
             
                "Total Puzzles": 1000,
         
     | 
| 495 | 
         
             
                "Reason Lens": "849.84"
         
     | 
| 496 | 
         
             
              },
         
     | 
| 497 | 
         
            +
              {
         
     | 
| 498 | 
         
            +
                "Model": "Qwen2.5-7B-Instruct",
         
     | 
| 499 | 
         
            +
                "Mode": "greedy",
         
     | 
| 500 | 
         
            +
                "Puzzle Acc": "12.00",
         
     | 
| 501 | 
         
            +
                "Cell Acc": "30.67",
         
     | 
| 502 | 
         
            +
                "No answer": "9.50",
         
     | 
| 503 | 
         
            +
                "Easy Puzzle Acc": "38.93",
         
     | 
| 504 | 
         
            +
                "Hard Puzzle Acc": "1.53",
         
     | 
| 505 | 
         
            +
                "Total Puzzles": 1000,
         
     | 
| 506 | 
         
            +
                "Reason Lens": "850.93"
         
     | 
| 507 | 
         
            +
              },
         
     | 
| 508 | 
         
             
              {
         
     | 
| 509 | 
         
             
                "Model": "Meta-Llama-3-8B-Instruct",
         
     | 
| 510 | 
         
             
                "Mode": "greedy",
         
     | 
| 
         | 
|
| 637 | 
         
             
                "Total Puzzles": 1000,
         
     | 
| 638 | 
         
             
                "Reason Lens": "718.43"
         
     | 
| 639 | 
         
             
              },
         
     | 
| 640 | 
         
            +
              {
         
     | 
| 641 | 
         
            +
                "Model": "Qwen2.5-3B-Instruct",
         
     | 
| 642 | 
         
            +
                "Mode": "greedy",
         
     | 
| 643 | 
         
            +
                "Puzzle Acc": "4.80",
         
     | 
| 644 | 
         
            +
                "Cell Acc": "11.44",
         
     | 
| 645 | 
         
            +
                "No answer": "56.70",
         
     | 
| 646 | 
         
            +
                "Easy Puzzle Acc": "17.14",
         
     | 
| 647 | 
         
            +
                "Hard Puzzle Acc": "0.00",
         
     | 
| 648 | 
         
            +
                "Total Puzzles": 1000,
         
     | 
| 649 | 
         
            +
                "Reason Lens": "906.58"
         
     | 
| 650 | 
         
            +
              },
         
     | 
| 651 | 
         
             
              {
         
     | 
| 652 | 
         
             
                "Model": "gemma-2-2b-it",
         
     | 
| 653 | 
         
             
                "Mode": "greedy",
         
     | 
    	
        _about_us.md
    CHANGED
    
    | 
         @@ -10,6 +10,6 @@ We are from [AllenAI](https://allenai.org/) (AI2), a non-profit research organiz 
     | 
|
| 10 | 
         
             
            ### Contact
         
     | 
| 11 | 
         | 
| 12 | 
         
             
            Please contact us in the following ways:
         
     | 
| 13 | 
         
            -
            - Github Issues/PRs: [https://github.com/ 
     | 
| 14 | 
         
             
            - Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org
         
     | 
| 15 | 
         | 
| 
         | 
|
| 10 | 
         
             
            ### Contact
         
     | 
| 11 | 
         | 
| 12 | 
         
             
            Please contact us in the following ways:
         
     | 
| 13 | 
         
            +
            - Github Issues/PRs: [https://github.com/WildEval/ZeroEval/](https://github.com/WildEval/ZeroEval/) 
         
     | 
| 14 | 
         
             
            - Other questions: Please contact Yuchen with email: yuchenl[at]allenai[dot]org
         
     | 
| 15 | 
         | 
    	
        _header.md
    CHANGED
    
    | 
         @@ -2,5 +2,5 @@ 
     | 
|
| 2 | 
         | 
| 3 | 
         
             
            # 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
         
     | 
| 4 | 
         
             
            <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) |  -->
         
     | 
| 5 | 
         
            -
            [📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/ 
     | 
| 6 | 
         | 
| 
         | 
|
| 2 | 
         | 
| 3 | 
         
             
            # 🦓 ZebraLogic: Benchmarking the Logical Reasoning Ability of Language Models
         
     | 
| 4 | 
         
             
            <!-- [📑 FnF Paper](https://arxiv.org/abs/2305.18654) |  -->
         
     | 
| 5 | 
         
            +
            [📰 Blog](https://huggingface.co/blog/yuchenlin/zebra-logic) [💻 GitHub](https://github.com/WildEval/ZeroEval) | [🤗 HuggingFace](https://huggingface.co/collections/allenai/zebra-logic-bench-6697137cbaad0b91e635e7b0) | [🐦 X](https://twitter.com/billyuchenlin/) | [💬 Discussion](https://huggingface.co/spaces/allenai/ZebraLogicBench-Leaderboard/discussions) | Updated: **{LAST_UPDATED}**
         
     | 
| 6 | 
         | 
    	
        app.py
    CHANGED
    
    | 
         @@ -135,8 +135,8 @@ def _tab_explore(): 
     | 
|
| 135 | 
         | 
| 136 | 
         
             
            def _tab_submit():
         
     | 
| 137 | 
         
             
                markdown_text = """
         
     | 
| 138 | 
         
            -
                Please create an issue on our [Github](https://github.com/ 
     | 
| 139 | 
         
            -
                If you would like to do local testing, please read our code [here](https://github.com/ 
     | 
| 140 | 
         
             
                and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
         
     | 
| 141 | 
         
             
                """
         
     | 
| 142 | 
         | 
| 
         | 
|
| 135 | 
         | 
| 136 | 
         
             
            def _tab_submit():
         
     | 
| 137 | 
         
             
                markdown_text = """
         
     | 
| 138 | 
         
            +
                Please create an issue on our [Github](https://github.com/WildEval/ZeroEval/) repository to talk about your model. Then, we can test it for you and report the results here on the Leaderboard.
         
     | 
| 139 | 
         
            +
                If you would like to do local testing, please read our code [here](https://github.com/WildEval/ZeroEval/blob/main/src/evaluation/zebra_grid_eval.py) 
         
     | 
| 140 | 
         
             
                and apply for the access for the [private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) that contains the truth solutions.
         
     | 
| 141 | 
         
             
                """
         
     | 
| 142 | 
         | 
    	
        constants.py
    CHANGED
    
    | 
         @@ -4,7 +4,7 @@ from collections import OrderedDict 
     | 
|
| 4 | 
         
             
            DEFAULT_K = "∞"
         
     | 
| 5 | 
         
             
            # DEFAULT_K = "1500"
         
     | 
| 6 | 
         | 
| 7 | 
         
            -
            banner_url = "https://github.com/ 
     | 
| 8 | 
         
             
            BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
         
     | 
| 9 | 
         | 
| 10 | 
         
             
            # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
         
     | 
| 
         | 
|
| 4 | 
         
             
            DEFAULT_K = "∞"
         
     | 
| 5 | 
         
             
            # DEFAULT_K = "1500"
         
     | 
| 6 | 
         | 
| 7 | 
         
            +
            banner_url = "https://github.com/WildEval/ZeroEval/blob/main/docs/zebra/zebra_banner.png?raw=true" # the same repo here.
         
     | 
| 8 | 
         
             
            BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 70vw; min-width: 300px; max-width: 1000px;border: 3px solid gray; border-color: gray black;"> </div>'
         
     | 
| 9 | 
         | 
| 10 | 
         
             
            # TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🦁 AI2 WildBench Leaderboard </b> </body> </html>"
         
     | 
    	
        data_utils.py
    CHANGED
    
    | 
         @@ -49,7 +49,7 @@ def load_all_data(): 
     | 
|
| 49 | 
         
             
                    model_summary = json.load(f)
         
     | 
| 50 | 
         
             
                model_names = [model["Model"] for model in model_summary]
         
     | 
| 51 | 
         
             
                for model_name in model_names:
         
     | 
| 52 | 
         
            -
                    download_url = f"https://raw.githubusercontent.com/ 
     | 
| 53 | 
         
             
                    output_file = os.path.join(result_dir, f"{model_name}.json")
         
     | 
| 54 | 
         
             
                    # mkdir -p result_dir if not exists 
         
     | 
| 55 | 
         
             
                    os.makedirs(result_dir, exist_ok=True)
         
     | 
| 
         | 
|
| 49 | 
         
             
                    model_summary = json.load(f)
         
     | 
| 50 | 
         
             
                model_names = [model["Model"] for model in model_summary]
         
     | 
| 51 | 
         
             
                for model_name in model_names:
         
     | 
| 52 | 
         
            +
                    download_url = f"https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/{model_name}.json"
         
     | 
| 53 | 
         
             
                    output_file = os.path.join(result_dir, f"{model_name}.json")
         
     | 
| 54 | 
         
             
                    # mkdir -p result_dir if not exists 
         
     | 
| 55 | 
         
             
                    os.makedirs(result_dir, exist_ok=True)
         
     | 
    	
        update_data.sh
    CHANGED
    
    | 
         @@ -1,5 +1,5 @@ 
     | 
|
| 1 | 
         
            -
            # download the file from https://raw.githubusercontent.com/ 
     | 
| 2 | 
         
             
            # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
         
     | 
| 3 | 
         
             
            mkdir -p ZeroEval-main/result_dirs/zebra-grid/
         
     | 
| 4 | 
         
            -
            wget https://raw.githubusercontent.com/ 
     | 
| 5 | 
         
            -
            wget https://raw.githubusercontent.com/ 
     | 
| 
         | 
|
| 1 | 
         
            +
            # download the file from https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json
         
     | 
| 2 | 
         
             
            # and put it to ZeroEval-main/result_dirs/zebra-grid.summary.json
         
     | 
| 3 | 
         
             
            mkdir -p ZeroEval-main/result_dirs/zebra-grid/
         
     | 
| 4 | 
         
            +
            wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid.summary.json -O ZeroEval-main/result_dirs/zebra-grid.summary.json
         
     | 
| 5 | 
         
            +
            wget https://raw.githubusercontent.com/WildEval/ZeroEval/main/result_dirs/zebra-grid/deepseek-chat.json -O ZeroEval-main/result_dirs/zebra-grid/deepseek-chat.json
         
     |