Spaces:

optimum
/

llm-perf-leaderboard

Running

App Files Files Community

IlyasMoutawwakil HF Staff committed on Nov 20, 2023

Commit

ab5f5f1

1 Parent(s): 988dbd8

update

Browse files

Files changed (13) hide show

app.py +71 -344
huggy_bench.png → logo.png +0 -0
pyproject.toml +21 -0
script.py +0 -14
src/{assets/css_html_js.py → assets.py} +3 -3
src/bettertransformer.py +148 -0
src/control_panel.py +168 -0
src/flashattentionv2.py +148 -0
src/latency_score_memory.py +67 -0
src/leaderboard.py +60 -0
src/llm_perf.py +127 -0
src/{assets/text_content.py → text.py} +32 -18
src/utils.py +21 -28

app.py CHANGED Viewed

@@ -1,371 +1,98 @@
 import os
 import gradio as gr
-import pandas as pd
-import plotly.express as px
-from huggingface_hub.file_download import hf_hub_download
-from src.utils import process_model_name, process_model_arch
-from src.assets.css_html_js import custom_css
-from src.assets.text_content import (
     TITLE,
-    ABOUT_TEXT,
-    INTRODUCTION_TEXT,
-    EXAMPLE_CONFIG_TEXT,
     CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
 )
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
-LOGO_URL = "https://huggingface.co/spaces/optimum/llm-perf-leaderboard/resolve/main/huggy_bench.png"
-LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
-ALL_COLUMNS_MAPPING = {
-    "Model": "Model 🤗",
-    "Arch": "Arch 🏛️",
-    "Size": "Params (B) 📏",
-    # deployment settings
-    "backend.name": "Backend 🏭",
-    "backend.torch_dtype": "Dtype 📥",
-    "optimization": "Optimization 🛠️",
-    "quantization": "Quantization 🗜️",
-    # measurements
-    "Score": "Open LLM Score (%) ⬆️",
-    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
-    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
-    "forward.latency(s)": "Prefill Latency (s) ⬇️",
-    "generate.latency(s)": "E2E Latency (s) ⬇️",
-    "generate.max_memory_allocated(MB)": "Allocated Memory (MB) ⬇️",
-    "generate.max_memory_reserved(MB)": "Reserved Memory (MB) ⬇️",
-    "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
-    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬇️",
-}
-SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
-SORTING_ASCENDING = [False, False]
-ALL_COLUMNS_DATATYPES = [
-    # open llm
-    "markdown",
-    "markdown",
-    "number",
-    # deployment settings
-    "str",
-    "str",
-    "str",
-    "str",
-    # measurements
-    "number",
-    "number",
-    "number",
-    "number",
-    "number",
-    "number",
-    "number",
-    "number",
-    "number",
-    "number",
-]
-# download data
-hf_hub_download(
-    repo_id="optimum/llm-perf-dataset",
-    filename="open-llm.csv",
-    local_dir="dataset",
-    repo_type="dataset",
-    token=HF_TOKEN,
-)
-OPEN_LLM_DF = pd.read_csv("dataset/open-llm.csv")
 MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB 🖥️"}
-MACHINE_TO_PERF = {}
-for machine in MACHINE_TO_HARDWARE:
-    hf_hub_download(
-        repo_id="optimum/llm-perf-dataset",
-        filename=f"{machine}/perf-report.csv",
-        local_dir="dataset",
-        repo_type="dataset",
-        token=HF_TOKEN,
-    )
-    MACHINE_TO_PERF[machine] = pd.read_csv(f"dataset/{machine}/perf-report.csv")
-def get_benchmark_df(machine="hf-dgx-01"):
-    # merge on model
-    machine_perf_df = MACHINE_TO_PERF[machine].copy()
-    merged_df = OPEN_LLM_DF.merge(machine_perf_df, left_on="Model", right_on="model")
-    # transpose energy consumption
-    merged_df["generate.energy_consumption(tokens/kWh)"] = (
-        1 / merged_df["generate.energy_consumption(kWh/token)"].fillna(1)
-    ).astype(int)
-    # fix nan values
-    merged_df.loc[
-        merged_df["generate.energy_consumption(tokens/kWh)"] == 1,
-        "generate.energy_consumption(tokens/kWh)",
-    ] = pd.NA
-    # add optimization column
-    merged_df["optimization"] = merged_df[
-        ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
-    ].apply(
-        lambda x: "BetterTransformer"
-        if x["backend.to_bettertransformer"]
-        else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None"),
-        axis=1,
-    )
-    # add quantization scheme
-    merged_df["quantization"] = merged_df[
-        ["backend.quantization_scheme", "backend.quantization_config.exllama_config.version"]
-    ].apply(
-        lambda x: "BnB.4bit"
-        if x["backend.quantization_scheme"] == "bnb"
-        else (
-            "GPTQ.4bit+ExllamaV1"
-            if (x["backend.quantization_scheme"] == "gptq")
-            and (x["backend.quantization_config.exllama_config.version"] == 1)
-            else (
-                "GPTQ.4bit+ExllamaV2"
-                if (x["backend.quantization_scheme"] == "gptq")
-                and (x["backend.quantization_config.exllama_config.version"] == 2)
-                else "None"
-            )
-        ),
-        axis=1,
-    )
-    # add decode throughput
-    merged_df["decode.throughput(tokens/s)"] = (
-        1000 / (merged_df["generate.latency(s)"] - merged_df["forward.latency(s)"])
-    ).round(2)
-    # sort by metric
-    merged_df.sort_values(by=SORTING_COLUMN, ascending=SORTING_ASCENDING, inplace=True)
-    # filter columns
-    merged_df = merged_df[list(ALL_COLUMNS_MAPPING.keys())]
-    # rename columns
-    merged_df.rename(columns=ALL_COLUMNS_MAPPING, inplace=True)
-    return merged_df
-def get_benchmark_table(bench_df):
-    copy_df = bench_df.copy()
-    # transform
-    copy_df["Model 🤗"] = copy_df["Model 🤗"].apply(process_model_name)
-    copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
-    # process quantization
-    copy_df["Open LLM Score (%) ⬆️"] = copy_df.apply(
-        lambda x: f"{x['Open LLM Score (%) ⬆️']}**"
-        if x["Quantization 🗜️"] in ["BnB.4bit", "GPTQ.4bit"]
-        else x["Open LLM Score (%) ⬆️"],
-        axis=1,
-    )
-    return copy_df
-def get_benchmark_chart(bench_df):
-    copy_df = bench_df.copy()
-    # transform
-    copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
-    # plot
-    fig = px.scatter(
-        copy_df,
-        y="Open LLM Score (%) ⬆️",
-        x="E2E Latency (s) ⬇️",
-        size="Allocated Memory (MB) ⬇️",
-        color="Arch 🏛️",
-        custom_data=list(ALL_COLUMNS_MAPPING.values()),
-        color_discrete_sequence=px.colors.qualitative.Light24,
-    )
-    fig.update_layout(
-        title={
-            "text": "Latency vs. Score vs. Memory",
-            "y": 0.95,
-            "x": 0.5,
-            "xanchor": "center",
-            "yanchor": "top",
-        },
-        xaxis_title="Per 1000 Tokens Latency (s)",
-        yaxis_title="Open LLM Score (%)",
-        legend_title="LLM Architecture",
-        width=1200,
-        height=600,
-    )
-    fig.update_traces(
-        hovertemplate="<br>".join(
-            [
-                f"<b>{column}:</b> %{{customdata[{i}]}}"
-                for i, column in enumerate(ALL_COLUMNS_MAPPING.values())
-            ]
-        )
-    )
-    return fig
-def filter_query(
-    text,
-    backends,
-    datatypes,
-    optimizations,
-    quantizations,
-    score,
-    memory,
-    machine,
-):
-    raw_df = get_benchmark_df(machine=machine)
-    filtered_df = raw_df[
-        raw_df["Model 🤗"].str.contains(text, case=False)
-        & raw_df["Backend 🏭"].isin(backends)
-        & raw_df["Dtype 📥"].isin(datatypes)
-        & raw_df["Optimization 🛠️"].isin(optimizations)
-        & raw_df["Quantization 🗜️"].isin(quantizations)
-        & (raw_df["Open LLM Score (%) ⬆️"] >= score)
-        & (raw_df["Allocated Memory (MB) ⬇️"] <= memory)
-    ]
-    filtered_table = get_benchmark_table(filtered_df)
-    filtered_chart = get_benchmark_chart(filtered_df)
-    return filtered_table, filtered_chart
-# Demo interface
 demo = gr.Blocks(css=custom_css)
 with demo:
-    # logo
     gr.HTML(f'<img src="{LOGO_URL}">', elem_classes="logo")
-    # leaderboard title
-    gr.HTML(TITLE)
-    # introduction text
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="descriptive-text")
-    with gr.Tabs(elem_classes="leaderboard-tabs"):
-        machine_placeholders = {}
-        machine_tables = {}
-        machine_plots = {}
-        ####################### HARDWARE TABS #######################
-        for i, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
-            # dummy placeholder of the machine name
-            machine_placeholders[machine] = gr.Textbox(value=machine, visible=False)
-            with gr.TabItem(hardware, id=i):
-                with gr.Tabs(elem_classes="machine-tabs"):
-                    # placeholder for full dataframe
-                    machine_df = get_benchmark_df(machine=machine)
                     with gr.TabItem("Leaderboard 🏅", id=0):
-                        gr.HTML(
-                            "👉 Scroll to the right 👉 for additional columns.",
-                            elem_id="descriptive-text",
-                        )
-                        # Original leaderboard table
-                        machine_tables[machine] = gr.components.Dataframe(
-                            value=get_benchmark_table(machine_df),
-                            headers=list(ALL_COLUMNS_MAPPING.values()),
-                            datatype=ALL_COLUMNS_DATATYPES,
-                            elem_id="machine-table",
-                        )
-                    with gr.TabItem("Plot 📊", id=1):
-                        gr.HTML(
-                            "👆 Hover over the points 👆 for additional information.",
-                            elem_id="descriptive-text",
-                        )
-                        # Original leaderboard plot
-                        machine_plots[machine] = gr.components.Plot(
-                            value=get_benchmark_chart(machine_df),
-                            elem_id="machine-plot",
-                            show_label=False,
-                        )
-        ###################### CONTROL PANEL #######################
-        with gr.TabItem("Control Panel 🎛️", id=2):
-            gr.HTML(
-                "Use this control panel to filter the leaderboard's table and plot.",  # noqa: E501
-                elem_id="descriptive-text",
-            )
-            with gr.Row():
-                with gr.Column():
-                    search_bar = gr.Textbox(
-                        label="Model 🤗",
-                        info="🔍 Search for a model name",
-                        elem_id="search-bar",
-                    )
-            with gr.Row():
-                with gr.Column(scale=1):
-                    score_slider = gr.Slider(
-                        label="Open LLM Score (%) 📈",
-                        info="🎚️ Slide to minimum Open LLM score",
-                        value=0,
-                        elem_id="threshold-slider",
-                    )
-                with gr.Column(scale=1):
-                    memory_slider = gr.Slider(
-                        label="Peak Memory (MB) 📈",
-                        info="🎚️ Slide to maximum Peak Memory",
-                        minimum=0,
-                        maximum=80 * 1024,
-                        value=80 * 1024,
-                        elem_id="memory-slider",
-                    )
-                with gr.Column(scale=1):
-                    backend_checkboxes = gr.CheckboxGroup(
-                        label="Backends 🏭",
-                        choices=["pytorch", "onnxruntime"],
-                        value=["pytorch", "onnxruntime"],
-                        info="☑️ Select the backends",
-                        elem_id="backend-checkboxes",
-                    )
-            with gr.Row():
-                with gr.Column(scale=1):
-                    datatype_checkboxes = gr.CheckboxGroup(
-                        label="Load Dtypes 📥",
-                        choices=["float32", "float16"],
-                        value=["float32", "float16"],
-                        info="☑️ Select the load dtypes",
-                        elem_id="dtype-checkboxes",
-                    )
-                with gr.Column(scale=1):
-                    optimization_checkboxes = gr.CheckboxGroup(
-                        label="Optimizations 🛠️",
-                        choices=["None", "BetterTransformer", "FlashAttentionV2"],
-                        value=["None", "BetterTransformer", "FlashAttentionV2"],
-                        info="☑️ Select the optimization",
-                        elem_id="optimization-checkboxes",
-                    )
-                with gr.Column(scale=1):
-                    quantization_checkboxes = gr.CheckboxGroup(
-                        label="Quantizations 🗜️",
-                        choices=["None", "BnB.4bit", "GPTQ.4bit"],
-                        value=["None", "BnB.4bit", "GPTQ.4bit"],
-                        info="☑️ Select the quantization schemes",
-                        elem_id="quantization-checkboxes",
-                    )
-            with gr.Row():
-                filter_button = gr.Button(
-                    value="Filter 🚀",
-                    elem_id="filter-button",
-                )
-            for machine in MACHINE_TO_HARDWARE:
-                filter_button.click(
-                    filter_query,
-                    [
-                        search_bar,
-                        backend_checkboxes,
-                        datatype_checkboxes,
-                        optimization_checkboxes,
-                        quantization_checkboxes,
-                        score_slider,
-                        memory_slider,
-                        machine_placeholders[machine],
-                    ],
-                    [machine_tables[machine], machine_plots[machine]],
                 )
         ####################### ABOUT TAB #######################
         with gr.TabItem("About 📖", id=3):
-            gr.HTML(ABOUT_TEXT, elem_classes="descriptive-text")
-            gr.Markdown(EXAMPLE_CONFIG_TEXT, elem_classes="descriptive-text")
-    ####################### CITATION #######################
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
                 elem_id="citation-button",
                 show_copy_button=True,
             )
-# Launch demo
-demo.queue().launch()

 import os
 import gradio as gr
+from src.control_panel import create_control_panel, create_control_callback
+from src.latency_score_memory import create_lat_score_mem_plot
+from src.leaderboard import create_leaderboard_table
+from src.flashattentionv2 import create_fa2_plots
+from src.bettertransformer import create_bt_plots
+from src.llm_perf import get_llm_perf_df
+from src.assets import custom_css
+from src.text import (
     TITLE,
+    ABOUT,
+    INTRODUCTION,
+    EXAMPLE_CONFIG,
+    CITATION_BUTTON,
     CITATION_BUTTON_LABEL,
 )
+LOGO_URL = "https://huggingface.co/spaces/optimum/llm-perf-leaderboard/resolve/main/logo.png"
 MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB 🖥️"}
+HF_TOKEN = os.environ.get("HF_TOKEN", None)
 demo = gr.Blocks(css=custom_css)
 with demo:
+    gr.HTML(TITLE, elem_classes="title")
     gr.HTML(f'<img src="{LOGO_URL}">', elem_classes="logo")
+    gr.Markdown(INTRODUCTION, elem_classes="descriptive-text")
+    ####################### HARDWARE TABS #######################
+    with gr.Tabs(elem_classes="tabs"):
+        for id, (machine, hardware) in enumerate(MACHINE_TO_HARDWARE.items()):
+            with gr.TabItem(hardware, id=id):
+                ####################### CONTROL PANEL #######################
+                (
+                    filter_button,
+                    machine_textbox,
+                    search_bar,
+                    score_slider,
+                    memory_slider,
+                    backend_checkboxes,
+                    datatype_checkboxes,
+                    optimization_checkboxes,
+                    quantization_checkboxes,
+                ) = create_control_panel()
+                ####################### HARDWARE SUBTABS #######################
+                with gr.Tabs(elem_classes="subtabs"):
+                    llm_perf_df = get_llm_perf_df(machine=machine)
+                    ####################### LEADERBOARD TAB #######################
                     with gr.TabItem("Leaderboard 🏅", id=0):
+                        leaderboard_table = create_leaderboard_table(llm_perf_df)
+                    ####################### LAT. vs. SCORE vs. MEM. TAB #######################
+                    with gr.TabItem("Latency vs. Score vs. Memory 📊", id=1):
+                        lat_score_mem_plot = create_lat_score_mem_plot(llm_perf_df)
+                    ####################### BETTERTRANSFORMER SPEEDUP TAB #######################
+                    with gr.TabItem("BetterTransformer Speedup 📈", id=2):
+                        bt_prefill_plot, bt_decode_plot = create_bt_plots(llm_perf_df)
+                    with gr.TabItem("FlashAttentionV2 Speedup 📈", id=3):
+                        fa2_prefill_plot, fa2_decode_plot = create_fa2_plots(llm_perf_df)
+                ####################### CONTROL CALLBACK #######################
+                create_control_callback(
+                    filter_button,
+                    # inputs
+                    machine_textbox,
+                    search_bar,
+                    score_slider,
+                    memory_slider,
+                    backend_checkboxes,
+                    datatype_checkboxes,
+                    optimization_checkboxes,
+                    quantization_checkboxes,
+                    # outputs
+                    leaderboard_table,
+                    lat_score_mem_plot,
+                    bt_prefill_plot,
+                    bt_decode_plot,
+                    fa2_prefill_plot,
+                    fa2_decode_plot,
                 )
         ####################### ABOUT TAB #######################
         with gr.TabItem("About 📖", id=3):
+            gr.HTML(ABOUT, elem_classes="descriptive-text")
+            gr.Markdown(EXAMPLE_CONFIG, elem_classes="descriptive-text")
+    ####################### CITATION
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
+                value=CITATION_BUTTON,
                 label=CITATION_BUTTON_LABEL,
                 elem_id="citation-button",
                 show_copy_button=True,
             )
+if __name__ == "__main__":
+    # Launch demo
+    demo.queue().launch()

huggy_bench.png → logo.png RENAMED Viewed

File without changes

pyproject.toml ADDED Viewed

	@@ -0,0 +1,21 @@

+#  Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+[tool.black]
+line-length = 119
+target-version = ['py37']
+[tool.ruff]
+ignore = ["E501", "C901"]
+select = ["C", "E", "F", "I", "W"]

script.py DELETED Viewed

@@ -1,14 +0,0 @@
-from huggingface_hub import hf_hub_download
-import pandas as pd
-hf_hub_download(
-    repo_id="optimum/llm-perf-dataset",
-    filename="open-llm.csv",
-    local_dir="dataset",
-    repo_type="dataset",
-)
-open_llm = pd.read_csv("dataset/open-llm.csv")
-print(open_llm["Arch"].unique())
-print(open_llm[open_llm["Arch"] == "rwkv"]["Model"].unique())

src/{assets/css_html_js.py → assets.py} RENAMED Viewed

@@ -6,14 +6,14 @@ custom_css = """
     max-width: 100%
     object-fit: contain;
 }
-.descriptive-text {
     font-size: 16px !important;
 }
-.leaderboard-tabs button {
     font-size: 20px;
 }
-.hardware-tabs button {
     font-size: 20px;
 }

     max-width: 100%
     object-fit: contain;
 }
+.text {
     font-size: 16px !important;
 }
+.tabs button {
     font-size: 20px;
 }
+.subtabs button {
     font-size: 20px;
 }

src/bettertransformer.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import gradio as gr
+import pandas as pd
+import plotly.express as px
+from src.utils import process_arch
+BETTERTRANSFORMER_DATA = [
+    # open llm
+    "Model 🤗",
+    "Arch 🏛️",
+    "DType 📥",
+    "Backend 🏭",
+    "Params (B)",
+    "Open LLM Score (%)",
+    # deployment settings
+    "DType 📥",
+    "Backend 🏭",
+    "Quantization 🗜️",
+    # primary measurements
+    "Prefill Latency (s)",
+    "Prefill Latency (s) BetterTransformer",
+    "Decode Throughput (tokens/s)",
+    "Decode Throughput (tokens/s) BetterTransformer",
+    "E2E Throughput (tokens/s)",
+    "E2E Throughput (tokens/s) BetterTransformer",
+    # speedups
+    "Prefill Latency Speedup (%)",
+    "Decode Throughput Speedup (%)",
+]
+def get_bt_df(llm_perf_df):
+    bt_df = llm_perf_df.copy()
+    # process
+    bt_df["Arch 🏛️"] = bt_df["Arch 🏛️"].apply(process_arch)
+    # seperate original model experiments from BetterTransformer experiments
+    original_df = bt_df[bt_df["Optimization 🛠️"] == "None"]
+    bt_df = bt_df[bt_df["Optimization 🛠️"] == "BetterTransformer"]
+    # merge the two dataframes
+    bt_df = pd.merge(
+        original_df,
+        bt_df,
+        on=["Model 🤗", "Quantization 🗜️"],
+        suffixes=["", " BetterTransformer"],
+    )
+    # compute speedups
+    bt_df["Prefill Latency Speedup (%)"] = (
+        (bt_df["Prefill Latency (s)"] / bt_df["Prefill Latency (s) BetterTransformer"]) * 100
+    ).round(2)
+    bt_df["Decode Throughput Speedup (%)"] = (
+        (bt_df["Decode Throughput (tokens/s) BetterTransformer"] / bt_df["Decode Throughput (tokens/s)"]) * 100
+    ).round(2)
+    # filter speedups > 1000%
+    bt_df = bt_df[bt_df["Prefill Latency Speedup (%)"] < 1000]
+    bt_df = bt_df[bt_df["Decode Throughput Speedup (%)"] < 1000]
+    return bt_df
+def get_bt_decode_fig(llm_perf_df):
+    bt_df = get_bt_df(llm_perf_df)
+    # plot
+    decode_fig = px.box(
+        bt_df,
+        x="Arch 🏛️",
+        y="Decode Throughput Speedup (%)",
+        color_discrete_sequence=px.colors.qualitative.Light24,
+        custom_data=BETTERTRANSFORMER_DATA,
+        color="Quantization 🗜️",
+        points="all",
+    )
+    # add hover data
+    decode_fig.update_traces(
+        hovertemplate="<br>".join(
+            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
+        )
+    )
+    # add layout
+    decode_fig.update_layout(
+        title={
+            "text": "Decode Throughput Speedup per Architecture",
+            "y": 0.95,
+            "x": 0.5,
+            "xanchor": "center",
+            "yanchor": "top",
+        },
+        xaxis_title="LLM Architecture",
+        yaxis_title="Decode Speedup (%)",
+        legend_title="Quantization Scheme",
+        width=1200,
+        height=600,
+    )
+    return decode_fig
+def get_bt_prefill_fig(llm_perf_df):
+    bt_df = get_bt_df(llm_perf_df)
+    # plot
+    prefill_fig = px.box(
+        bt_df,
+        x="Arch 🏛️",
+        y="Prefill Latency Speedup (%)",
+        color_discrete_sequence=px.colors.qualitative.Light24,
+        custom_data=BETTERTRANSFORMER_DATA,
+        color="Quantization 🗜️",
+        points="all",
+    )
+    # add hover data
+    prefill_fig.update_traces(
+        hovertemplate="<br>".join(
+            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(BETTERTRANSFORMER_DATA)]
+        )
+    )
+    # add layout
+    prefill_fig.update_layout(
+        title={
+            "text": "Prefill Latency Speedup per Architecture",
+            "y": 0.95,
+            "x": 0.5,
+            "xanchor": "center",
+            "yanchor": "top",
+        },
+        xaxis_title="LLM Architecture",
+        yaxis_title="Prefill Speedup (%)",
+        legend_title="Quantization Scheme",
+        width=1200,
+        height=600,
+    )
+    return prefill_fig
+def create_bt_plots(llm_perf_df):
+    # descriptive text
+    gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
+    # get figures
+    prefill_fig = get_bt_prefill_fig(llm_perf_df)
+    decode_fig = get_bt_decode_fig(llm_perf_df)
+    # create plots
+    prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
+    decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
+    return prefill_plot, decode_plot

src/control_panel.py ADDED Viewed

	@@ -0,0 +1,168 @@

+import gradio as gr
+from src.llm_perf import get_llm_perf_df
+from src.leaderboard import get_leaderboard_df
+from src.latency_score_memory import get_lat_score_mem_fig
+from src.bettertransformer import get_bt_prefill_fig, get_bt_decode_fig
+from src.flashattentionv2 import get_fa2_prefill_fig, get_fa2_decode_fig
+def create_control_panel(machine: str = "hf-dgx-01"):
+    # descriptive text
+    gr.HTML("Use this control panel to filter this leaderboard.", elem_id="text")
+    # controls
+    machine_textbox = gr.Textbox(value=machine, visible=False)
+    with gr.Row():
+        with gr.Column():
+            search_bar = gr.Textbox(
+                label="Model 🤗",
+                info="🔍 Search for a model name",
+                elem_id="search-bar",
+            )
+    with gr.Row():
+        with gr.Column(scale=1):
+            score_slider = gr.Slider(
+                label="Open LLM Score (%) 📈",
+                info="🎚️ Slide to minimum Open LLM score",
+                value=0,
+                elem_id="threshold-slider",
+            )
+        with gr.Column(scale=1):
+            memory_slider = gr.Slider(
+                label="Peak Memory (MB) 📈",
+                info="🎚️ Slide to maximum Peak Memory",
+                minimum=0,
+                maximum=80 * 1024,
+                value=80 * 1024,
+                elem_id="memory-slider",
+            )
+        with gr.Column(scale=1):
+            backend_checkboxes = gr.CheckboxGroup(
+                label="Backends 🏭",
+                choices=["pytorch", "onnxruntime"],
+                value=["pytorch", "onnxruntime"],
+                info="☑️ Select the backends",
+                elem_id="backend-checkboxes",
+            )
+    with gr.Row():
+        with gr.Column(scale=1):
+            datatype_checkboxes = gr.CheckboxGroup(
+                label="DTypes 📥",
+                choices=["float32", "float16"],
+                value=["float32", "float16"],
+                info="☑️ Select the load data types",
+                elem_id="dtype-checkboxes",
+            )
+        with gr.Column(scale=1):
+            optimization_checkboxes = gr.CheckboxGroup(
+                label="Optimizations 🛠️",
+                choices=["None", "BetterTransformer", "FlashAttentionV2"],
+                value=["None", "BetterTransformer", "FlashAttentionV2"],
+                info="☑️ Select the optimization",
+                elem_id="optimization-checkboxes",
+            )
+        with gr.Column(scale=1):
+            quantization_checkboxes = gr.CheckboxGroup(
+                label="Quantizations 🗜️",
+                choices=["None", "BnB.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
+                value=["None", "BnB.4bit", "GPTQ.4bit", "GPTQ.4bit+ExllamaV1", "GPTQ.4bit+ExllamaV2"],
+                info="☑️ Select the quantization schemes",
+                elem_id="quantization-checkboxes",
+            )
+    with gr.Row():
+        filter_button = gr.Button(
+            value="Filter 🚀",
+            elem_id="filter-button",
+        )
+    return (
+        filter_button,
+        machine_textbox,
+        search_bar,
+        score_slider,
+        memory_slider,
+        backend_checkboxes,
+        datatype_checkboxes,
+        optimization_checkboxes,
+        quantization_checkboxes,
+    )
+def filter_fn(
+    machine,
+    model,
+    backends,
+    datatypes,
+    optimizations,
+    quantizations,
+    score,
+    memory,
+):
+    raw_df = get_llm_perf_df(machine=machine)
+    filtered_df = raw_df[
+        raw_df["Model 🤗"].str.contains(model, case=False)
+        & raw_df["Backend 🏭"].isin(backends)
+        & raw_df["DType 📥"].isin(datatypes)
+        & raw_df["Optimization 🛠️"].isin(optimizations)
+        & raw_df["Quantization 🗜️"].isin(quantizations)
+        & (raw_df["Open LLM Score (%)"] >= score)
+        & (raw_df["Allocated Memory (MB)"] <= memory)
+    ]
+    filtered_leaderboard_df = get_leaderboard_df(filtered_df)
+    filtered_lat_score_mem_fig = get_lat_score_mem_fig(filtered_df)
+    filtered_bt_prefill_fig = get_bt_prefill_fig(filtered_df)
+    filtered_bt_decode_fig = get_bt_decode_fig(filtered_df)
+    filtered_fa2_prefill_fig = get_fa2_prefill_fig(filtered_df)
+    filtered_fa2_decode_fig = get_fa2_decode_fig(filtered_df)
+    return [
+        filtered_leaderboard_df,
+        filtered_lat_score_mem_fig,
+        filtered_bt_prefill_fig,
+        filtered_bt_decode_fig,
+        filtered_fa2_prefill_fig,
+        filtered_fa2_decode_fig,
+    ]
+def create_control_callback(
+    # button
+    filter_button,
+    # inputs
+    machine_textbox,
+    search_bar,
+    score_slider,
+    memory_slider,
+    backend_checkboxes,
+    datatype_checkboxes,
+    optimization_checkboxes,
+    quantization_checkboxes,
+    # outputs
+    leaderboard_table,
+    lat_score_mem_plot,
+    bt_prefill_plot,
+    bt_decode_plot,
+    fa2_prefill_plot,
+    fa2_decode_plot,
+):
+    filter_button.click(
+        fn=filter_fn,
+        inputs=[
+            machine_textbox,
+            search_bar,
+            backend_checkboxes,
+            datatype_checkboxes,
+            optimization_checkboxes,
+            quantization_checkboxes,
+            score_slider,
+            memory_slider,
+        ],
+        outputs=[
+            leaderboard_table,
+            lat_score_mem_plot,
+            bt_prefill_plot,
+            bt_decode_plot,
+            fa2_prefill_plot,
+            fa2_decode_plot,
+        ],
+    )

src/flashattentionv2.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import gradio as gr
+import pandas as pd
+import plotly.express as px
+from src.utils import process_arch
+FLASHATTENTIONV2_DATA = [
+    # open llm
+    "Model 🤗",
+    "Arch 🏛️",
+    "DType 📥",
+    "Backend 🏭",
+    "Params (B)",
+    "Open LLM Score (%)",
+    # deployment settings
+    "DType 📥",
+    "Backend 🏭",
+    "Quantization 🗜️",
+    # primary measurements
+    "Prefill Latency (s)",
+    "Prefill Latency (s) FlashAttentionV2",
+    "Decode Throughput (tokens/s)",
+    "Decode Throughput (tokens/s) FlashAttentionV2",
+    "E2E Throughput (tokens/s)",
+    "E2E Throughput (tokens/s) FlashAttentionV2",
+    # speedups
+    "Prefill Latency Speedup (%)",
+    "Decode Throughput Speedup (%)",
+]
+def get_fa2_df(llm_perf_df):
+    fa2_df = llm_perf_df.copy()
+    # process
+    fa2_df["Arch 🏛️"] = fa2_df["Arch 🏛️"].apply(process_arch)
+    # seperate original model experiments from FlashAttentionV2 experiments
+    original_df = fa2_df[fa2_df["Optimization 🛠️"] == "None"]
+    fa2_df = fa2_df[fa2_df["Optimization 🛠️"] == "FlashAttentionV2"]
+    # merge the two dataframes
+    fa2_df = pd.merge(
+        original_df,
+        fa2_df,
+        on=["Model 🤗", "Quantization 🗜️"],
+        suffixes=["", " FlashAttentionV2"],
+    )
+    # compute speedups
+    fa2_df["Prefill Latency Speedup (%)"] = (
+        (fa2_df["Prefill Latency (s)"] / fa2_df["Prefill Latency (s) FlashAttentionV2"]) * 100
+    ).round(2)
+    fa2_df["Decode Throughput Speedup (%)"] = (
+        (fa2_df["Decode Throughput (tokens/s) FlashAttentionV2"] / fa2_df["Decode Throughput (tokens/s)"]) * 100
+    ).round(2)
+    # filter speedups > 1000%
+    fa2_df = fa2_df[fa2_df["Prefill Latency Speedup (%)"] < 1000]
+    fa2_df = fa2_df[fa2_df["Decode Throughput Speedup (%)"] < 1000]
+    return fa2_df
+def get_fa2_decode_fig(llm_perf_df):
+    fa2_df = get_fa2_df(llm_perf_df)
+    # plot
+    decode_fig = px.box(
+        fa2_df,
+        x="Arch 🏛️",
+        y="Decode Throughput Speedup (%)",
+        color_discrete_sequence=px.colors.qualitative.Light24,
+        custom_data=FLASHATTENTIONV2_DATA,
+        color="Quantization 🗜️",
+        points="all",
+    )
+    # add hover data
+    decode_fig.update_traces(
+        hovertemplate="<br>".join(
+            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
+        )
+    )
+    # add layout
+    decode_fig.update_layout(
+        title={
+            "text": "Decode Throughput Speedup per Architecture",
+            "y": 0.95,
+            "x": 0.5,
+            "xanchor": "center",
+            "yanchor": "top",
+        },
+        xaxis_title="LLM Architecture",
+        yaxis_title="Decode Speedup (%)",
+        legend_title="Quantization Scheme",
+        width=1200,
+        height=600,
+    )
+    return decode_fig
+def get_fa2_prefill_fig(llm_perf_df):
+    fa2_df = get_fa2_df(llm_perf_df)
+    # plot
+    prefill_fig = px.box(
+        fa2_df,
+        x="Arch 🏛️",
+        y="Prefill Latency Speedup (%)",
+        color_discrete_sequence=px.colors.qualitative.Light24,
+        custom_data=FLASHATTENTIONV2_DATA,
+        color="Quantization 🗜️",
+        points="all",
+    )
+    # add hover data
+    prefill_fig.update_traces(
+        hovertemplate="<br>".join(
+            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(FLASHATTENTIONV2_DATA)]
+        )
+    )
+    # add layout
+    prefill_fig.update_layout(
+        title={
+            "text": "Prefill Latency Speedup per Architecture",
+            "y": 0.95,
+            "x": 0.5,
+            "xanchor": "center",
+            "yanchor": "top",
+        },
+        xaxis_title="LLM Architecture",
+        yaxis_title="Prefill Speedup (%)",
+        legend_title="Quantization Scheme",
+        width=1200,
+        height=600,
+    )
+    return prefill_fig
+def create_fa2_plots(llm_perf_df):
+    # descriptive text
+    gr.HTML("👆 Hover over the points 👆 for additional information.", elem_id="text")
+    # get figures
+    prefill_fig = get_fa2_prefill_fig(llm_perf_df)
+    decode_fig = get_fa2_decode_fig(llm_perf_df)
+    # create plots
+    prefill_plot = gr.components.Plot(value=prefill_fig, elem_id="plot", show_label=False)
+    decode_plot = gr.components.Plot(value=decode_fig, elem_id="plot", show_label=False)
+    return prefill_plot, decode_plot

src/latency_score_memory.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import gradio as gr
+import plotly.express as px
+# Columns passed to the latency/score/memory scatter plot as plotly custom
+# data; the hover template prints one line per column, in this order.
+SCORE_MEMORY_LATENCY_DATA = [
+    "Model 🤗",
+    "Arch 🏛️",
+    "Params (B)",
+    "DType 📥",
+    "Backend 🏭",
+    "Open LLM Score (%)",
+    "Prefill Latency (s)",
+    "Decode Throughput (tokens/s)",
+    "Allocated Memory (MB)",
+    "E2E Latency (s)",
+    "E2E Throughput (tokens/s)",
+]
+def get_lat_score_mem_fig(llm_perf_df):
+    copy_df = llm_perf_df.copy()
+    # plot
+    fig = px.scatter(
+        copy_df,
+        x="E2E Latency (s)",
+        y="Open LLM Score (%)",
+        size="Allocated Memory (MB)",
+        color="Arch 🏛️",
+        custom_data=SCORE_MEMORY_LATENCY_DATA,
+        color_discrete_sequence=px.colors.qualitative.Light24,
+    )
+    fig.update_traces(
+        hovertemplate="<br>".join(
+            [f"<b>{column}:</b> %{{customdata[{i}]}}" for i, column in enumerate(SCORE_MEMORY_LATENCY_DATA)]
+        )
+    )
+    fig.update_layout(
+        title={
+            "text": "Latency vs. Score vs. Memory",
+            "y": 0.95,
+            "x": 0.5,
+            "xanchor": "center",
+            "yanchor": "top",
+        },
+        xaxis_title="Per 1000 Tokens Latency (s)",
+        yaxis_title="Open LLM Score (%)",
+        legend_title="LLM Architecture",
+        width=1200,
+        height=600,
+    )
+    return fig
+def create_lat_score_mem_plot(llm_perf_df):
+    # descriptive text
+    gr.HTML("👆 Hover over the points 👆 for additional information. ",elem_id="text")
+    # get figure
+    fig = get_lat_score_mem_fig(llm_perf_df)
+    # create plot
+    plot = gr.components.Plot(
+        value=fig,
+        elem_id="plot",
+        show_label=False,
+    )
+    return plot

src/leaderboard.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import gradio as gr
+from src.utils import model_hyperlink, process_score
+# Leaderboard columns in display order, each mapped to the gradio datatype of
+# that column; the keys become the table headers and the values the
+# ``datatype`` list of the gradio Dataframe.
+LEADERBOARD_COLUMN_TO_DATATYPE = {
+    # open llm
+    "Model 🤗" :"markdown",
+    "Arch 🏛️" :"markdown",
+    "Params (B)": "number",
+    "Open LLM Score (%)": "number",
+    # deployment settings
+    "DType 📥" :"str",
+    "Backend 🏭" :"str",
+    "Optimization 🛠️" :"str",
+    "Quantization 🗜️" :"str",
+    # primary measurements
+    "Prefill Latency (s)": "number",
+    "Decode Throughput (tokens/s)": "number",
+    "Allocated Memory (MB)": "number",
+    "Energy (tokens/kWh)": "number",
+    # additional measurements
+    "E2E Latency (s)": "number",
+    "E2E Throughput (tokens/s)": "number",
+    "Reserved Memory (MB)": "number",
+    "Used Memory (MB)": "number",
+}
+def process_model(model_name):
+    link = f"https://huggingface.co/{model_name}"
+    return model_hyperlink(link, model_name)
+def get_leaderboard_df(llm_perf_df):
+    df = llm_perf_df.copy()
+    # transform for leaderboard
+    df["Model 🤗"] = df["Model 🤗"].apply(process_model)
+    # process quantization for leaderboard
+    df["Open LLM Score (%)"] = df.apply(
+        lambda x: process_score(x["Open LLM Score (%)"], x["Quantization 🗜️"]),
+        axis=1,
+    )
+    return df
+def create_leaderboard_table(llm_perf_df):
+    # descriptive text
+    gr.HTML("👉 Scroll to the right 👉 for additional columns.", elem_id="text")
+    # get dataframe
+    leaderboard_df = get_leaderboard_df(llm_perf_df)
+    # create table
+    leaderboard_table = gr.components.Dataframe(
+        value=leaderboard_df,
+        datatype=list(LEADERBOARD_COLUMN_TO_DATATYPE.values()),
+        headers=list(LEADERBOARD_COLUMN_TO_DATATYPE.keys()),
+        elem_id="table",
+    )
+    return leaderboard_table

src/llm_perf.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import os
+import pandas as pd
+from huggingface_hub import hf_hub_download
+# Hub dataset repository from which get_llm_df and get_perf_df download their CSV files.
+LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
+# Token passed to hf_hub_download, read from the HF_TOKEN environment variable (None when unset).
+HF_TOKEN = os.environ.get("HF_TOKEN", None)
+# Raw column name -> display column name; get_llm_perf_df keeps only the
+# columns listed here and renames them accordingly.
+COLUMNS_MAPPING = {
+    "Model": "Model 🤗",
+    "Arch": "Arch 🏛️",
+    "Size": "Params (B)",
+    "Score": "Open LLM Score (%)",
+    # deployment settings
+    "backend.name": "Backend 🏭",
+    "backend.torch_dtype": "DType 📥",
+    "optimization": "Optimization 🛠️",
+    "quantization": "Quantization 🗜️",
+    # primary measurements
+    "forward.latency(s)": "Prefill Latency (s)",
+    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
+    "generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
+    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
+    # additional measurements
+    "generate.latency(s)": "E2E Latency (s)",
+    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
+    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
+    "generate.max_memory_used(MB)": "Used Memory (MB)",
+}
+# Display columns the final dataframe is sorted by, in priority order.
+SORTING_COLUMNS = [
+    "Open LLM Score (%)",
+    "Prefill Latency (s)",
+    "Decode Throughput (tokens/s)",
+]
+# Sort direction of each SORTING_COLUMNS entry (True means ascending).
+SORTING_ASCENDING = [False, True, False]
+def get_llm_df():
+    hf_hub_download(
+        repo_id=LLM_PERF_DATASET_REPO,
+        filename="open-llm.csv",
+        local_dir="dataset",
+        repo_type="dataset",
+        token=HF_TOKEN,
+    )
+    llm_df = pd.read_csv("dataset/open-llm.csv")
+    return llm_df
+def get_perf_df(machine: str = "hf-dgx-01"):
+    hf_hub_download(
+        repo_id=LLM_PERF_DATASET_REPO,
+        filename=f"{machine}/perf-report.csv",
+        local_dir="dataset",
+        repo_type="dataset",
+        token=HF_TOKEN,
+    )
+    perf_df = pd.read_csv(f"dataset/{machine}/perf-report.csv")
+    return perf_df
+def get_llm_perf_df(machine: str = "hf-dgx-01"):
+    # get dataframes
+    llm_df = get_llm_df()
+    perf_df = get_perf_df(machine=machine)
+    llm_perf_df = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")
+    # some assertions
+    assert llm_perf_df["benchmark.input_shapes.batch_size"].nunique() == 1
+    assert llm_perf_df["benchmark.input_shapes.sequence_length"].nunique() == 1
+    assert llm_perf_df["benchmark.new_tokens"].nunique() == 1
+    # transpose energy consumption
+    llm_perf_df["generate.energy_consumption(tokens/kWh)"] = (
+        1 / llm_perf_df["generate.energy_consumption(kWh/token)"].fillna(1)
+    ).astype(int)
+    # fix nan values
+    llm_perf_df.loc[
+        llm_perf_df["generate.energy_consumption(tokens/kWh)"] == 1,
+        "generate.energy_consumption(tokens/kWh)",
+    ] = pd.NA
+    # add optimization column
+    llm_perf_df["optimization"] = llm_perf_df[["backend.to_bettertransformer", "backend.use_flash_attention_2"]].apply(
+        lambda x: "BetterTransformer"
+        if x["backend.to_bettertransformer"]
+        else ("FlashAttentionV2" if x["backend.use_flash_attention_2"] else "None"),
+        axis=1,
+    )
+    # add quantization scheme
+    llm_perf_df["quantization"] = llm_perf_df[
+        [
+            "backend.quantization_scheme",
+            "backend.quantization_config.exllama_config.version",
+        ]
+    ].apply(
+        lambda x: "BnB.4bit"
+        if x["backend.quantization_scheme"] == "bnb"
+        else (
+            "GPTQ.4bit+ExllamaV1"
+            if (x["backend.quantization_scheme"] == "gptq")
+            and (x["backend.quantization_config.exllama_config.version"] == 1)
+            else (
+                "GPTQ.4bit+ExllamaV2"
+                if (x["backend.quantization_scheme"] == "gptq")
+                and (x["backend.quantization_config.exllama_config.version"] == 2)
+                else "None"
+            )
+        ),
+        axis=1,
+    )
+    # add decode throughput
+    llm_perf_df["decode.throughput(tokens/s)"] = (
+        1000 / (llm_perf_df["generate.latency(s)"] - llm_perf_df["forward.latency(s)"])
+    ).round(2)
+    # filter columns
+    llm_perf_df = llm_perf_df[list(COLUMNS_MAPPING.keys())]
+    # rename columns
+    llm_perf_df.rename(columns=COLUMNS_MAPPING, inplace=True)
+    # sort by metric
+    llm_perf_df.sort_values(
+        by=SORTING_COLUMNS,
+        ascending=SORTING_ASCENDING,
+        inplace=True,
+    )
+    return llm_perf_df

src/{assets/text_content.py → text.py} RENAMED Viewed

@@ -1,6 +1,6 @@
 TITLE = """<h1 align="center" id="space-title">🤗 LLM-Perf Leaderboard 🏋️</h1>"""
-INTRODUCTION_TEXT = f"""
 The 🤗 LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency, throughput, memory & energy) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
 Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
@@ -8,7 +8,7 @@ Anyone from the community can request a model or a hardware/backend/optimization
 - Hardware/Backend/Optimization performance requests should be made in the [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) to assess their relevance and feasibility.
 """
-ABOUT_TEXT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
 <ul>
     <li>To avoid communication-dependent results, only one GPU is used.</li>
     <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
@@ -18,11 +18,26 @@ ABOUT_TEXT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
 </ul>
 """
-EXAMPLE_CONFIG_TEXT = """
 Here's an example of the configuration file used to benchmark the models with Optimum-Benchmark:
 ```yaml
 defaults:
-  - backend: pytorch # default backend
   - benchmark: inference # default benchmark
   - experiment # inheriting from experiment config
   - _self_ # for hydra 1.1 compatibility
@@ -31,39 +46,38 @@ defaults:
 hydra:
   run:
-    dir: llm-experiments/{experiment_name}
   job:
     chdir: true
-experiment_name: {experiment_name}
-model: {model}
-device: cuda
 backend:
-  no_weights: true
-  torch_dtype: float16
-  bettertransformer: true
-  quantization_scheme: gptq
 benchmark:
   memory: true
   energy: true
   new_tokens: 1000
   input_shapes:
     batch_size: 1
     sequence_length: 256
 ```
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
-CITATION_BUTTON_TEXT = r"""@misc{llm-perf-leaderboard,
   author = {Ilyas Moutawwakil, Régis Pierrard},
   title = {LLM-Perf Leaderboard},
   year = {2023},

 TITLE = """<h1 align="center" id="space-title">🤗 LLM-Perf Leaderboard 🏋️</h1>"""
+INTRODUCTION = """
 The 🤗 LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency, throughput, memory & energy) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
 Anyone from the community can request a model or a hardware/backend/optimization configuration for automated benchmarking:
 - Hardware/Backend/Optimization performance requests should be made in the [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) to assess their relevance and feasibility.
 """
+ABOUT = """<h3>About the 🤗 LLM-Perf Leaderboard 🏋️</h3>
 <ul>
     <li>To avoid communication-dependent results, only one GPU is used.</li>
     <li>Score is the average evaluation score obtained from the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard">🤗 Open LLM Leaderboard</a>.</li>
 </ul>
 """
+EXAMPLE_CONFIG = """
 Here's an example of the configuration file used to benchmark the models with Optimum-Benchmark:
 ```yaml
 defaults:
+  - backend: pytorch
+  - _base_ # inheriting from base config
+  - _self_ # for hydra 1.1 compatibility
+experiment_name: pytorch+cuda+float16+bettertransformer
+device: cuda
+backend:
+  no_weights: true
+  torch_dtype: float16
+  to_bettertransformer: true
+```
+Where the base config is:
+```yaml
+defaults:
   - benchmark: inference # default benchmark
   - experiment # inheriting from experiment config
   - _self_ # for hydra 1.1 compatibility
 hydra:
   run:
+    dir: ???
   job:
     chdir: true
+    env_set:
+      CUDA_VISIBLE_DEVICES: 0
+      CUDA_DEVICE_ORDER: PCI_BUS_ID
+model: ???
+experiment_name: ???
 backend:
+  initial_isolation_check: true
+  continous_isolation_check: true
 benchmark:
+  duration: 10
   memory: true
   energy: true
   new_tokens: 1000
   input_shapes:
     batch_size: 1
     sequence_length: 256
+hub_kwargs:
+  trust_remote_code: true
 ```
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
+CITATION_BUTTON = r"""@misc{llm-perf-leaderboard,
   author = {Ilyas Moutawwakil, Régis Pierrard},
   title = {LLM-Perf Leaderboard},
   year = {2023},

src/utils.py CHANGED Viewed

@@ -1,22 +1,3 @@
-from huggingface_hub import HfApi, Repository
-import gradio as gr
-import json
-def change_tab(query_param):
-    query_param = query_param.replace("'", '"')
-    query_param = json.loads(query_param)
-    if (
-        isinstance(query_param, dict)
-        and "tab" in query_param
-        and query_param["tab"] == "plot"
-    ):
-        return gr.Tabs.update(selected=1)
-    else:
-        return gr.Tabs.update(selected=0)
 LLM_MODEL_ARCHS = {
     "stablelm_epoch": "🔴 StableLM-Epoch",
     "stablelm_alpha": "🔴 StableLM-Alpha",
@@ -24,8 +5,8 @@ LLM_MODEL_ARCHS = {
     "RefinedWebModel": "🦅 Falcon",
     "gpt_bigcode": "⭐ StarCoder",
     "RefinedWeb": "🦅 Falcon",
-    "baichuan": "🌊 Baichuan 百川", # river
-    "internlm": "🧑‍🎓 InternLM 书生", # scholar
     "mistral": "Ⓜ️ Mistral",
     "codegen": "♾️ CodeGen",
     "chatglm": "💬 ChatGLM",
@@ -34,7 +15,7 @@ LLM_MODEL_ARCHS = {
     "llama": "🦙 LLaMA",
     "rwkv": "🐦‍⬛ RWKV",
     "mpt": "🧱 MPT",
-    "Yi": "🫂 Yi 人", # people
     # suggest something
     "gpt_neox": "GPT-NeoX",
     "gpt_neo": "GPT-Neo",
@@ -50,13 +31,25 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-def process_model_name(model_name):
-    link = f"https://huggingface.co/{model_name}"
-    return model_hyperlink(link, model_name)
-def process_model_arch(model_arch):
     if model_arch in LLM_MODEL_ARCHS:
         return LLM_MODEL_ARCHS[model_arch]
     else:
         return model_arch

 LLM_MODEL_ARCHS = {
     "stablelm_epoch": "🔴 StableLM-Epoch",
     "stablelm_alpha": "🔴 StableLM-Alpha",
     "RefinedWebModel": "🦅 Falcon",
     "gpt_bigcode": "⭐ StarCoder",
     "RefinedWeb": "🦅 Falcon",
+    "baichuan": "🌊 Baichuan 百川",  # river
+    "internlm": "🧑‍🎓 InternLM 书生",  # scholar
     "mistral": "Ⓜ️ Mistral",
     "codegen": "♾️ CodeGen",
     "chatglm": "💬 ChatGLM",
     "llama": "🦙 LLaMA",
     "rwkv": "🐦‍⬛ RWKV",
     "mpt": "🧱 MPT",
+    "Yi": "🫂 Yi 人" , # people
     # suggest something
     "gpt_neox": "GPT-NeoX",
     "gpt_neo": "GPT-Neo",
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+def process_arch(model_arch):
     if model_arch in LLM_MODEL_ARCHS:
         return LLM_MODEL_ARCHS[model_arch]
     else:
         return model_arch
+def process_score(score, quantization):
+    if quantization != "None":
+        return f"{score:.2f}*"
+    else:
+        return f"{score:.2f} "
+# def change_tab(query_param):
+#     query_param = query_param.replace("'", '"')
+#     query_param = json.loads(query_param)
+#     if isinstance(query_param, dict) and "tab" in query_param and query_param["tab"] == "plot":
+#         return gr.Tabs.update(selected=1)
+#     else:
+#         return gr.Tabs.update(selected=0)