from contextlib import ExitStack
from dataclasses import dataclass
from typing import List, Optional

import click
import gradio as gr
import pandas as pd

from parse_results import build_results


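# Display configuration for a single metric plot: chart title, axis titles and, for
# latency metrics, the list of percentiles that can be selected.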
@dataclass
class PlotConfig:
    x_title: str
    y_title: str
    title: str
    percentiles: Optional[List[float]] = None


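# Build and serve the Gradio dashboard: load benchmark results, render one line plot per
# metric plus a summary table, and launch the web server on the given port.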
def run(from_results_dir, datasource, port):
    css = '''
    .summary span {
        font-size: 10px;
        padding-top: 0;
        padding-bottom: 0;
    }
    '''

    summary_desc = '''
    ## Summary
    This table shows the average of the metrics for each model and QPS rate.

    The metrics are:
    * Inter token latency: Time to generate a new output token for each user querying the system.
      It translates to the “speed” perceived by the end-user. We aim for at least 300 words per minute (average reading speed), so ITL < 150ms.
    * Time to First Token: Time the user has to wait before seeing the first token of the answer.
      Lower waiting times are essential for real-time interactions, less so for offline workloads.
    * End-to-end latency: The overall time the system took to generate the full response to the user.
    * Throughput: The number of tokens per second the system can generate across all requests.
    * Successful requests: The number of requests the system was able to honor in the benchmark timeframe.
    * Error rate: The percentage of requests that ended in error, as the system could not process them in time or failed to process them.
    '''

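    # Rough arithmetic behind the ITL < 150ms target above (assuming ~1.3 tokens per word,
    # an approximation): 300 words/min ≈ 400 tokens/min ≈ 6.7 tokens/s ≈ 150 ms/token.
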
    df_bench = pd.DataFrame()
    line_plots_bench = []
    column_mappings = {'inter_token_latency_ms_p90': 'ITL P90 (ms)', 'time_to_first_token_ms_p90': 'TTFT P90 (ms)',
                       'e2e_latency_ms_p90': 'E2E P90 (ms)', 'token_throughput_secs': 'Throughput (tokens/s)',
                       'successful_requests': 'Successful requests', 'error_rate': 'Error rate (%)', 'model': 'Model',
                       'rate': 'QPS', 'run_id': 'Run ID'}
    default_df = pd.DataFrame.from_dict(
        {"rate": [1, 2], "inter_token_latency_ms_p90": [10, 20],
         "version": ["default", "default"],
         "model": ["default", "default"]})

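    # Initial page load renders the same outputs as a regular update.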
    def load_demo(model_bench, percentiles):
        return update_bench(model_bench, percentiles)

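    # Refresh every plot and the summary table for the selected model. For metrics that
    # expose percentiles, the plotted column is swapped to '<metric>_<percentile>', falling
    # back to 0 when that column is absent from the results.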
    def update_bench(model, percentiles):
        res = []
        for plot in line_plots_bench:
            if plot['config'].percentiles:
                k = plot['metric'] + '_' + str(percentiles)
                df_bench[plot['metric']] = df_bench[k] if k in df_bench.columns else 0
            res.append(df_bench[(df_bench['model'] == model)])

        return res + [summary_table()]

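    # Average the P90 metrics per (model, run_id, rate), then coerce to numeric and round
    # the columns shown in the summary table.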
    def summary_table() -> pd.DataFrame:
        data = df_bench.groupby(['model', 'run_id', 'rate']).agg(
            {'inter_token_latency_ms_p90': 'mean', 'time_to_first_token_ms_p90': 'mean',
             'e2e_latency_ms_p90': 'mean', 'token_throughput_secs': 'mean',
             'successful_requests': 'mean', 'error_rate': 'mean'}).reset_index()
        data = data[
            ['run_id', 'model', 'rate', 'inter_token_latency_ms_p90', 'time_to_first_token_ms_p90',
             'e2e_latency_ms_p90', 'token_throughput_secs']]
        data = data.rename(columns=column_mappings)

        numeric_cols_to_ensure = [
            'ITL P90 (ms)', 'TTFT P90 (ms)', 'E2E P90 (ms)',
            'Throughput (tokens/s)', 'QPS'
        ]
        for col in numeric_cols_to_ensure:
            if col in data.columns:
                data[col] = pd.to_numeric(data[col], errors='coerce')

        rounding_rules = {
            'ITL P90 (ms)': 2,
            'TTFT P90 (ms)': 2,
            'E2E P90 (ms)': 2,
            'Throughput (tokens/s)': 2,
            'QPS': 0
        }
        for col, decimals in rounding_rules.items():
            if col in data.columns:
                data[col] = data[col].round(decimals)

        return data

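    # Read raw benchmark results from Parquet, dropping warmup/throughput runs and keeping
    # only constant-arrival-rate executions.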
    def load_bench_results(source) -> pd.DataFrame:
        data = pd.read_parquet(source)
        data = data[(data['id'] != 'warmup') & (data['id'] != 'throughput')]
        data = data[data['executor_type'] == 'ConstantArrivalRate']
        return data

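    # Brush selection on any plot: zoom every plot to the selected QPS range, with the
    # y-range observed for its metric within that selection.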
    def select_region(selection: gr.SelectData, model):
        min_w, max_w = selection.index
        data = df_bench[(df_bench['model'] == model) & (df_bench['rate'] >= min_w) & (
                df_bench['rate'] <= max_w)]
        res = []
        for plot in line_plots_bench:
            metric = plot["metric"]
            y_min = data[metric].min()
            y_max = data[metric].max()
            res.append(gr.LinePlot(x_lim=[min_w, max_w], y_lim=[y_min, y_max]))
        return res

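    # Double-click on a plot clears the zoom on all plots.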
    def reset_region():
        res = []
        for _ in line_plots_bench:
            res.append(gr.LinePlot(x_lim=None, y_lim=None))
        return res

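    # Dispatch on the datasource scheme: file:// and s3:// sources are passed through to
    # the loader unchanged, anything else is rejected.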
    def load_datasource(datasource, fn):
        if datasource.startswith('file://'):
            return fn(datasource)
        elif datasource.startswith('s3://'):
            return fn(datasource)
        else:
            raise ValueError(f"Unknown datasource: {datasource}")

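    # Optionally consolidate raw result files into benchmarks.parquet, then load the
    # benchmark dataframe from the configured datasource.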
    if from_results_dir is not None:
        build_results(from_results_dir, 'benchmarks.parquet', None)

    df_bench = load_datasource(datasource, load_bench_results)

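    # One PlotConfig per metric; latency metrics also expose selectable percentiles.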
    metrics = {
        "inter_token_latency_ms": PlotConfig(title="Inter Token Latency (lower is better)", x_title="QPS",
                                             y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]),
        "time_to_first_token_ms": PlotConfig(title="TTFT (lower is better)", x_title="QPS",
                                             y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]),
        "e2e_latency_ms": PlotConfig(title="End to End Latency (lower is better)", x_title="QPS",
                                     y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]),
        "token_throughput_secs": PlotConfig(title="Request Output Throughput (higher is better)", x_title="QPS",
                                            y_title="Tokens/s"),
        "successful_requests": PlotConfig(title="Successful requests (higher is better)", x_title="QPS",
                                          y_title="Count"),
        "error_rate": PlotConfig(title="Error rate", x_title="QPS", y_title="%"),
        "prompt_tokens": PlotConfig(title="Prompt tokens", x_title="QPS", y_title="Count"),
        "decoded_tokens": PlotConfig(title="Decoded tokens", x_title="QPS", y_title="Count")
    }

    models = df_bench["model"].unique()
    run_ids = df_bench["run_id"].unique()

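    # Collect every percentile referenced by the plot configs, format them as radio labels
    # ('p50' ... 'p99') and add 'avg' as the default choice.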
    percentiles = set()
    for k, v in metrics.items():
        if v.percentiles:
            percentiles.update(v.percentiles)
    percentiles = map(lambda p: f'p{int(float(p) * 100)}', percentiles)
    percentiles = sorted(list(percentiles))
    percentiles.append('avg')

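    # Dashboard layout: header, summary table, model and percentile selectors, then two
    # line plots per row (the ExitStack opens a fresh gr.Row every other metric).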
    with gr.Blocks(css=css, title="Inference Benchmarker") as demo:
        with gr.Row():
            gr.Markdown("# Inference-benchmarker 🤗\n## Benchmarks results")
        with gr.Row():
            gr.Markdown(summary_desc)
        with gr.Row():
            table = gr.DataFrame(
                pd.DataFrame(),
                elem_classes=["summary"],
            )
        with gr.Row():
            details_desc = gr.Markdown("## Details")
        with gr.Row():
            model = gr.Dropdown(list(models), label="Select model", value=models[0])
        with gr.Row():
            percentiles_bench = gr.Radio(percentiles, label="", value="avg")
        i = 0
        with ExitStack() as stack:
            for k, v in metrics.items():
                if i % 2 == 0:
                    stack.close()
                    gs = stack.enter_context(gr.Row())
                line_plots_bench.append(
                    {"component": gr.LinePlot(default_df, label=f'{v.title}', x="rate", y=k,
                                              y_title=v.y_title, x_title=v.x_title,
                                              color="run_id"),
                     "model": model.value,
                     "metric": k,
                     "config": v},
                )
                i += 1

        for component in [model, percentiles_bench]:
            component.change(update_bench, [model, percentiles_bench],
                             [item["component"] for item in line_plots_bench] + [table])
        gr.on([plot["component"].select for plot in line_plots_bench], select_region, [model],
              outputs=[item["component"] for item in line_plots_bench])
        gr.on([plot["component"].double_click for plot in line_plots_bench], reset_region, None,
              outputs=[item["component"] for item in line_plots_bench])
        demo.load(load_demo, [model, percentiles_bench],
                  [item["component"] for item in line_plots_bench] + [table])

    demo.launch(server_port=port, server_name="0.0.0.0")


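# CLI entry point. Thanks to auto_envvar_prefix below, every option can also be set via
# an environment variable, e.g. DASHBOARD_PORT or DASHBOARD_DATASOURCE.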
@click.command()
@click.option('--from-results-dir', default=None, help='Load inference-benchmarker results from a directory')
@click.option('--datasource', default='file://benchmarks.parquet', help='Parquet datasource to load (file:// or s3:// URL)')
@click.option('--port', default=7860, help='Port to run the dashboard on')
def main(from_results_dir, datasource, port):
    run(from_results_dir, datasource, port)


if __name__ == '__main__':
    main(auto_envvar_prefix='DASHBOARD')