fix-memory-requirements-for-cpu (#36)

feat(cpu): fix the displaying of memory requirements for intel cpu (1fbcd42f775a19529dee237d0b6ee72e6c5ab8b2)

Files changed:
- app.py +4 -7
- hardware.yaml +1 -1
- src/llm_perf.py +27 -7
app.py CHANGED

@@ -67,13 +67,10 @@ with demo:
                 search_bar, columns_checkboxes, leaderboard_table = (
                     create_leaderboard_table(open_llm_perf_df)
                 )
-
-
-
-
-                lat_score_mem_plot = create_lat_score_mem_plot(
-                    open_llm_perf_df
-                )
+            with gr.TabItem("Find Your Best Model 🧭", id=1):
+                lat_score_mem_plot = create_lat_score_mem_plot(
+                    open_llm_perf_df
+                )
             ###################### ATTENTIONS SPEEDUP TAB #######################
             # with gr.TabItem("Attention 👁", id=2):
             #     attn_prefill_plot, attn_decode_plot = create_attn_plots(
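The app.py hunk moves the latency/score/memory plot out of the leaderboard tab's trailing lines and into its own "Find Your Best Model 🧭" tab. Below is a minimal, self-contained Gradio sketch of that tab layout; the sample frame and the make_plot helper are stand-ins for the app's real data loading and create_lat_score_mem_plot, not the actual implementation:

import gradio as gr
import pandas as pd
import plotly.express as px

# Stand-in data; the real app builds this frame from benchmark CSVs.
df = pd.DataFrame({
    "Prefill (s)": [0.12, 0.34, 0.56],
    "Open LLM Score (%)": [61.2, 58.7, 70.1],
    "Memory (MB)": [5_120, 9_216, 14_336],
})

def make_plot(frame):
    # Hypothetical equivalent of create_lat_score_mem_plot:
    # latency vs. score, with marker size encoding memory.
    return px.scatter(frame, x="Prefill (s)", y="Open LLM Score (%)", size="Memory (MB)")

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Leaderboard 🏅", id=0):
            gr.Dataframe(df)
        # The pattern from the diff: the plot lives in its own tab.
        with gr.TabItem("Find Your Best Model 🧭", id=1):
            gr.Plot(make_plot(df))

if __name__ == "__main__":
    demo.launch()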
    	
hardware.yaml CHANGED

@@ -39,7 +39,7 @@
 - machine: 32vCPU-C7i
   description: Intel-Xeon-SPR-385W 🖥️
   detail: |
-    We tested the [32vCPU AWS C7i](https://aws.amazon.com/ec2/instance-types/c7i/) instance for the benchmark.
+    We tested the [32vCPU AWS C7i](https://aws.amazon.com/ec2/instance-types/c7i/) instance for the benchmark. The memory requirement is the max RAM consumption during the decode phase.
   hardware_provider: intel
   hardware_type: cpu
   subsets:
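Since detail: uses a YAML literal block scalar (|), the appended sentence simply becomes part of one multi-line string shown on the hardware's description card. A quick hedged check with PyYAML, assuming hardware.yaml sits at the repo root and is a top-level list of machine entries, as the hunk's structure suggests:

import yaml

# Path assumed; the leaderboard repo appears to keep this file at its root.
with open("hardware.yaml") as f:
    hardware = yaml.safe_load(f)

# Find the Intel CPU machine entry and print its (now longer) detail text.
c7i = next(h for h in hardware if h["machine"] == "32vCPU-C7i")
print(c7i["detail"])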
    	
src/llm_perf.py CHANGED

@@ -15,7 +15,6 @@ COLUMNS_MAPPING = {
     "report.per_token.latency.p50": "Per Token (s)",
     "report.decode.throughput.value": "Decode (tokens/s)",
     "report.decode.efficiency.value": "Energy (tokens/kWh)",
-    "report.decode.memory.max_allocated": "Memory (MB)",
     # deployment settings
     "config.backend.name": "Backend 🏭",
     "config.backend.torch_dtype": "Precision 📥",
@@ -28,6 +27,15 @@ COLUMNS_MAPPING = {
     "Average ⬆️": "Open LLM Score (%)",
     "#Params (B)": "Params (B)",
 }
+
+CUDA_COLUMNS_MAPPING = COLUMNS_MAPPING | {
+    "report.decode.memory.max_allocated": "Memory (MB)",
+}
+
+INTEL_COLUMNS_MAPPING = COLUMNS_MAPPING | {
+    "report.decode.memory.max_ram": "Memory (MB)",
+}
+
 SORTING_COLUMNS = ["Open LLM Score (%)", "Decode (tokens/s)", "Prefill (s)"]
 SORTING_ASCENDING = [False, True, False]

@@ -39,9 +47,10 @@ def get_raw_llm_perf_df(
     for subset in subsets:
         for backend in backends:
             try:
+                url = f"hf://datasets/optimum-benchmark/llm-perf-leaderboard/perf-df-{backend}-{hardware_type}-{subset}-{machine}.csv"
                 dfs.append(
                     pd.read_csv(
-
+                        url
                     )
                 )
             except Exception:
@@ -70,7 +79,7 @@ def get_raw_llm_perf_df(
     return llm_perf_df


-def processed_llm_perf_df(llm_perf_df):
+def processed_llm_perf_df(llm_perf_df, hardware_type: str):
     # some assertions
     assert llm_perf_df["config.scenario.input_shapes.batch_size"].nunique() == 1
     assert llm_perf_df["config.scenario.input_shapes.sequence_length"].nunique() == 1
@@ -105,15 +114,23 @@ def processed_llm_perf_df(llm_perf_df):
             "report.decode.throughput.value": 3,
             "report.decode.efficiency.value": 3,
             "report.decode.memory.max_allocated": 3,
+            "report.decode.memory.max_ram": 3,
             "Average ⬆️": 3,
             "prefill+decode": 3,
             "#Params (B)": 3,
         }
     )
+
     # filter columns
-
-
-
+    if hardware_type == "cuda":
+        llm_perf_df = llm_perf_df[list(CUDA_COLUMNS_MAPPING.keys())]
+        llm_perf_df.rename(columns=CUDA_COLUMNS_MAPPING, inplace=True)
+    elif hardware_type == "cpu":
+        llm_perf_df = llm_perf_df[list(INTEL_COLUMNS_MAPPING.keys())]
+        llm_perf_df.rename(columns=INTEL_COLUMNS_MAPPING, inplace=True)
+    else:
+        raise ValueError(f"Hardware type {hardware_type} not supported")
+
     # sort by metric
     llm_perf_df.sort_values(
         by=SORTING_COLUMNS,
@@ -121,6 +138,9 @@ def processed_llm_perf_df(llm_perf_df):
         inplace=True,
     )

+    assert llm_perf_df["Memory (MB)"].notna().any(), "The dataset should contain at least one memory value, otherwise this implies that all the benchmarks have failed (contains only a traceback)"
+    assert llm_perf_df.columns.is_unique, "All columns should be unique"
+
     return llm_perf_df

@@ -137,7 +157,7 @@ def get_llm_perf_df(
     else:
         print(f"Dataset machine {machine} not found, downloading...")
         llm_perf_df = get_raw_llm_perf_df(machine, subsets, backends, hardware_type)
-        llm_perf_df = processed_llm_perf_df(llm_perf_df)
+        llm_perf_df = processed_llm_perf_df(llm_perf_df, hardware_type)
         llm_perf_df.to_csv(
             f"{DATASET_DIRECTORY}/llm-perf-leaderboard-{machine}.csv", index=False
         )
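The substance of the fix: CUDA runs report device memory as report.decode.memory.max_allocated, while the Intel CPU runs report host memory as report.decode.memory.max_ram, so the shared COLUMNS_MAPPING is now extended per hardware type via PEP 584 dict union (|) and both metrics surface under the same "Memory (MB)" header. A standalone sketch of that select-and-rename pattern, with toy column values invented for illustration:

import pandas as pd

BASE_MAPPING = {"model": "Model"}

# PEP 584 dict union: each hardware type extends the shared mapping with
# its own raw memory metric, but both display as "Memory (MB)".
CUDA_MAPPING = BASE_MAPPING | {"report.decode.memory.max_allocated": "Memory (MB)"}
CPU_MAPPING = BASE_MAPPING | {"report.decode.memory.max_ram": "Memory (MB)"}

def select_and_rename(df: pd.DataFrame, hardware_type: str) -> pd.DataFrame:
    mapping = {"cuda": CUDA_MAPPING, "cpu": CPU_MAPPING}.get(hardware_type)
    if mapping is None:
        raise ValueError(f"Hardware type {hardware_type} not supported")
    # Keep only the mapped raw columns, then rename them to display headers.
    return df[list(mapping)].rename(columns=mapping)

# Toy frame carrying both raw memory columns (values invented).
raw = pd.DataFrame({
    "model": ["llama", "mistral"],
    "report.decode.memory.max_allocated": [10_240.0, 14_336.0],
    "report.decode.memory.max_ram": [12_288.0, 16_384.0],
})

print(select_and_rename(raw, "cpu"))   # max_ram shown as "Memory (MB)"
print(select_and_rename(raw, "cuda"))  # max_allocated shown as "Memory (MB)"

The two new assertions in the diff guard the same path: if every benchmark in a subset failed (the CSV contains only tracebacks), the renamed "Memory (MB)" column would be all-NaN, and the duplicate-column check catches a mapping that accidentally assigns two raw columns the same display name.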

