loghugging25 committed on
Commit
f7cde70
·
1 Parent(s): 1e69227

initial commit

Browse files
app.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from contextlib import ExitStack
from dataclasses import dataclass
from typing import List, Optional

import click
import gradio as gr
import pandas as pd

from parse_results import build_results
10
+
11
+
12
@dataclass
class PlotConfig:
    """Display settings for one metric plot of the dashboard."""

    # Axis captions and the plot heading.
    x_title: str
    y_title: str
    title: str
    # Percentile fractions (e.g. 0.9 for p90) available for this metric;
    # None means the metric has no per-percentile columns.
    percentiles: Optional[List[float]] = None
18
+
19
+
20
# Stylesheet handed to gr.Blocks; targets the summary table (elem class "summary").
_DASHBOARD_CSS = '''
.summary span {
font-size: 10px;
padding-top:0;
padding-bottom:0;
}
'''

# Markdown shown above the summary table.
_SUMMARY_DESC = '''
## Summary
This table shows the average of the metrics for each model and QPS rate.

The metrics are:
* Inter token latency: Time to generate a new output token for each user querying the system.
It translates as the “speed” perceived by the end-user. We aim for at least 300 words per minute (average reading speed), so ITL<150ms
* Time to First Token: Time the user has to wait before seeing the first token of its answer.
Lower waiting time are essential for real-time interactions, less so for offline workloads.
* End-to-end latency: The overall time the system took to generate the full response to the user.
* Throughput: The number of tokens per second the system can generate across all requests
* Successful requests: The number of requests the system was able to honor in the benchmark timeframe
* Error rate: The percentage of requests that ended up in error, as the system could not process them in time or failed to process them.

'''


def run(from_results_dir, datasource, port):
    """Build the benchmark dashboard and serve it with Gradio.

    Args:
        from_results_dir: Directory of raw benchmark JSON results. When not
            None, ``build_results`` is called first to (re)generate
            ``benchmarks.parquet`` from it.
        datasource: Location of the Parquet file to display; must start with
            ``file://`` or ``s3://``.
        port: Port passed to ``demo.launch``; the server is bound to 0.0.0.0.

    Raises:
        ValueError: If ``datasource`` uses an unsupported scheme.
    """
    # One dict per plot: {"component", "model", "metric", "config"}.
    line_plots_bench = []
    column_mappings = {
        'inter_token_latency_ms_p90': 'ITL P90 (ms)',
        'time_to_first_token_ms_p90': 'TTFT P90 (ms)',
        'e2e_latency_ms_p90': 'E2E P90 (ms)',
        'token_throughput_secs': 'Throughput (tokens/s)',
        'successful_requests': 'Successful requests',
        'error_rate': 'Error rate (%)',
        'model': 'Model',
        'rate': 'QPS',
        'run_id': 'Run ID',
    }
    # Placeholder data given to every plot until the load event fills it in.
    default_df = pd.DataFrame.from_dict(
        {"rate": [1, 2], "inter_token_latency_ms_p90": [10, 20],
         "version": ["default", "default"],
         "model": ["default", "default"]})

    def load_demo(model_bench, percentiles):
        """Initial page load: same refresh as a model/percentile change."""
        return update_bench(model_bench, percentiles)

    def update_bench(model, percentiles):
        """Return one data frame per plot, followed by the summary table."""
        res = []
        for plot in line_plots_bench:
            if plot['config'].percentiles:
                # Copy the selected "<metric>_<percentile>" column into the
                # plain metric column that the plot (and select_region) reads.
                k = plot['metric'] + '_' + str(percentiles)
                df_bench[plot['metric']] = df_bench[k] if k in df_bench.columns else 0
            res.append(df_bench[df_bench['model'] == model])
        return res + [summary_table()]

    def summary_table() -> pd.DataFrame:
        """Mean of the headline metrics per (model, run_id, rate), formatted."""
        shown = ['inter_token_latency_ms_p90', 'time_to_first_token_ms_p90',
                 'e2e_latency_ms_p90', 'token_throughput_secs']
        # NOTE(review): the summary text also lists successful requests and
        # error rate, but the table has never displayed them - confirm intent.
        data = df_bench.groupby(['model', 'run_id', 'rate'])[shown].mean().reset_index()
        data = data[['run_id', 'model', 'rate'] + shown].copy()
        for metric in shown:
            data[metric] = data[metric].map('{:.2f}'.format)
        return data.rename(columns=column_mappings)

    def load_bench_results(source) -> pd.DataFrame:
        """Read the Parquet file and keep only constant-arrival-rate steps."""
        data = pd.read_parquet(source)
        # remove warmup and throughput
        data = data[(data['id'] != 'warmup') & (data['id'] != 'throughput')]
        # only keep constant rate
        return data[data['executor_type'] == 'ConstantArrivalRate']

    def select_region(selection: gr.SelectData, model):
        """Zoom every plot onto the selected QPS range."""
        min_w, max_w = selection.index
        in_range = (df_bench['rate'] >= min_w) & (df_bench['rate'] <= max_w)
        data = df_bench[(df_bench['model'] == model) & in_range]
        res = []
        for plot in line_plots_bench:
            # find the y values for the selected region
            values = data[plot["metric"]]
            res.append(gr.LinePlot(x_lim=[min_w, max_w], y_lim=[values.min(), values.max()]))
        return res

    def reset_region():
        """Clear the axis limits of every plot."""
        return [gr.LinePlot(x_lim=None, y_lim=None) for _ in line_plots_bench]

    def load_datasource(datasource, fn):
        """Call ``fn`` on the datasource after validating its scheme."""
        if datasource.startswith(('file://', 's3://')):
            return fn(datasource)
        raise ValueError(f"Unknown datasource: {datasource}")

    if from_results_dir is not None:
        build_results(from_results_dir, 'benchmarks.parquet', None)
    # Load data
    df_bench = load_datasource(datasource, load_bench_results)

    # Define metrics
    latency_percentiles = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]
    metrics = {
        "inter_token_latency_ms": PlotConfig(title="Inter Token Latency (lower is better)", x_title="QPS",
                                             y_title="Time (ms)", percentiles=list(latency_percentiles)),
        "time_to_first_token_ms": PlotConfig(title="TTFT (lower is better)", x_title="QPS",
                                             y_title="Time (ms)", percentiles=list(latency_percentiles)),
        "e2e_latency_ms": PlotConfig(title="End to End Latency (lower is better)", x_title="QPS",
                                     y_title="Time (ms)", percentiles=list(latency_percentiles)),
        "token_throughput_secs": PlotConfig(title="Request Output Throughput (higher is better)", x_title="QPS",
                                            y_title="Tokens/s"),
        "successful_requests": PlotConfig(title="Successful requests (higher is better)", x_title="QPS",
                                          y_title="Count"),
        "error_rate": PlotConfig(title="Error rate", x_title="QPS", y_title="%"),
        "prompt_tokens": PlotConfig(title="Prompt tokens", x_title="QPS", y_title="Count"),
        "decoded_tokens": PlotConfig(title="Decoded tokens", x_title="QPS", y_title="Count"),
    }

    models = df_bench["model"].unique()

    # get all available percentiles
    available = set()
    for config in metrics.values():
        if config.percentiles:
            available.update(config.percentiles)
    # round() rather than int(): int(0.57 * 100) truncates to 56. Sorting the
    # numeric values keeps the labels in percentile order.
    percentiles = [f'p{round(float(p) * 100)}' for p in sorted(available)]
    percentiles.append('avg')

    with gr.Blocks(css=_DASHBOARD_CSS, title="Inference Benchmarker") as demo:
        with gr.Row():
            gr.Markdown("# Inference-benchmarker 🤗\n## Benchmarks results")
        with gr.Row():
            gr.Markdown(_SUMMARY_DESC)
        with gr.Row():
            table = gr.DataFrame(
                pd.DataFrame(),
                elem_classes=["summary"],
            )
        with gr.Row():
            gr.Markdown("## Details")
        with gr.Row():
            model = gr.Dropdown(list(models), label="Select model", value=models[0])
        with gr.Row():
            percentiles_bench = gr.Radio(percentiles, label="", value="avg")
        with ExitStack() as stack:
            for i, (k, v) in enumerate(metrics.items()):
                if i % 2 == 0:
                    # Two plots per row: close the current row, open a new one.
                    stack.close()
                    stack.enter_context(gr.Row())
                line_plots_bench.append(
                    {"component": gr.LinePlot(default_df, label=v.title, x="rate", y=k,
                                              y_title=v.y_title, x_title=v.x_title,
                                              color="run_id"),
                     "model": model.value,
                     "metric": k,
                     "config": v},
                )

        plot_components = [item["component"] for item in line_plots_bench]
        for component in [model, percentiles_bench]:
            component.change(update_bench, [model, percentiles_bench],
                             plot_components + [table])
        gr.on([plot.select for plot in plot_components], select_region, [model],
              outputs=plot_components)
        gr.on([plot.double_click for plot in plot_components], reset_region, None,
              outputs=plot_components)
        demo.load(load_demo, [model, percentiles_bench],
                  plot_components + [table])

    demo.launch(server_port=port, server_name="0.0.0.0")
198
+
199
+
200
# Command-line entry point of the dashboard. The option values are forwarded
# unchanged to run().
@click.command()
@click.option(
    '--from-results-dir',
    default=None,
    help='Load inference-benchmarker results from a directory',
)
@click.option(
    '--datasource',
    default='file://benchmarks.parquet',
    help='Load a Parquet file already generated',
)
@click.option(
    '--port',
    default=7860,
    help='Port to run the dashboard',
)
def main(from_results_dir, datasource, port):
    # Deliberately no docstring: click would display it as the --help text.
    run(from_results_dir=from_results_dir, datasource=datasource, port=port)


if __name__ == '__main__':
    # auto_envvar_prefix lets click also read options from DASHBOARD_* variables.
    main(auto_envvar_prefix='DASHBOARD')
parse_results.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ import pandas as pd
6
+
7
+
8
def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame:
    """Flatten benchmark result files into one row per benchmark step.

    Each file is a JSON document with a ``config`` object and a ``results``
    list. Every result is flattened with ``pd.json_normalize``; nested keys
    are joined with ``_`` and a leading ``config.`` prefix is stripped (so
    ``config.rate`` becomes ``rate``).

    Args:
        model: Unused; kept so existing callers keep working.
        data_files: Mapping of an arbitrary key to a JSON file path. Only the
            paths are read.

    Returns:
        A data frame with one row per result, plus ``model`` and ``run_id``
        taken from the file's config and any of ``engine``/``tp``/``version``/
        ``device`` present in ``config.meta``. Empty when there is no result.
    """
    meta_keys = ('engine', 'tp', 'version', 'device')
    frames = []
    # Load the results
    for filename in data_files.values():
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
        config = data['config']
        # "meta" may be null or absent; both mean "no extra metadata".
        meta = config.get('meta') or {}
        for result in data['results']:
            entry = pd.json_normalize(result).to_dict(orient='records')[0]
            for key in meta_keys:
                if key in meta:
                    entry[key] = meta[key]
            entry['model'] = config['model_name']
            entry['run_id'] = config['run_id']
            frame = pd.DataFrame(entry, index=[0])
            # drop the 'config.' prefix, then replace . with _ in column names
            frame.columns = [c.split('config.')[-1].replace('.', '_') for c in frame.columns]
            frames.append(frame)
    if not frames:
        return pd.DataFrame()
    # Concatenate once: growing a frame inside the loop is quadratic.
    return pd.concat(frames)
36
+
37
+
38
def build_results_df(results_dir) -> pd.DataFrame:
    """Collect every ``*.json`` result under ``results_dir`` into one frame.

    Looks at ``results_dir`` itself and at its immediate sub-directories (not
    deeper), and delegates the parsing of each directory to ``build_df``.

    Args:
        results_dir: Path of the directory holding the benchmark results.

    Returns:
        The concatenation of all parsed results; empty if nothing was found.
    """
    # list directories (sorted so the row order does not depend on the OS)
    subdirs = sorted(
        path for path in (os.path.join(results_dir, d) for d in os.listdir(results_dir))
        if os.path.isdir(path)
    )
    frames = []
    for directory in subdirs + [results_dir]:
        # list json files in results directory
        data_files = {}
        for filename in sorted(os.listdir(directory)):
            if filename.endswith('.json'):
                # Key by the full name minus the extension: splitting on '.'
                # made 'a.v1.json' and 'b.v1.json' collide on the key 'v1'.
                data_files[filename.removesuffix('.json')] = os.path.join(directory, filename)
        frame = build_df(os.path.basename(directory), data_files)
        if not frame.empty:
            frames.append(frame)
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames)
50
+
51
+
52
def build_results(results_dir, results_file, device):
    """Parse raw benchmark results and write them to a Parquet file.

    Adds three derived columns: ``error_rate`` (percentage of failed
    requests), ``prompt_tokens`` and ``decoded_tokens`` (tokens sent /
    generated per successful request).

    Args:
        results_dir: Directory containing the JSON result files.
        results_file: Destination passed to ``DataFrame.to_parquet``.
        device: Value used for the ``device`` column when the results do not
            already provide one; may be None.
    """
    df = build_results_df(results_dir)
    if 'device' not in df.columns:
        df['device'] = device
    successful = df['successful_requests']
    total = df['failed_requests'] + successful
    # A step with no request (or no successful request) has no meaningful
    # ratio: mask the zero denominators so the result is NaN instead of inf.
    df['error_rate'] = df['failed_requests'] / total.where(total != 0) * 100.0
    per_success = successful.where(successful != 0)
    df['prompt_tokens'] = df['total_tokens_sent'] / per_success
    df['decoded_tokens'] = df['total_tokens'] / per_success
    df.to_parquet(results_file)
60
+
61
+
62
if __name__ == '__main__':
    # Stand-alone use: convert a results directory into a Parquet file.
    parser = argparse.ArgumentParser()
    # Not marked required: a required option would never use its default.
    parser.add_argument('--results-dir', default='results', type=str,
                        help='Path to the source directory containing the results')
    parser.add_argument('--results-file', type=str, required=True,
                        help='Path to the results file to write to. Can be a S3 path')
    parser.add_argument('--device', type=str, required=True, help='GPU name used for benchmarking')
    args = parser.parse_args()
    build_results(args.results_dir, args.results_file, args.device)
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio==5.23.1
2
+ pandas==2.2.3
3
+ numpy==2.2.4
4
+ matplotlib==3.10.1
5
+ python-dateutil==2.9.0
6
+ pyyaml==6.0.2
7
+ fastapi==0.115.12
8
+ uvicorn==0.34.0
9
+ aiofiles==23.2.1
10
+ orjson==3.10.16
11
+ typing-extensions==4.13.0
12
+ anyio==4.9.0
results/RedHatAI_phi-4-FP8-dynamic_2025-05-21-09-15-05.json ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "max_vus": 800,
4
+ "duration_secs": 120,
5
+ "benchmark_kind": "Rate",
6
+ "warmup_duration_secs": 30,
7
+ "rates": [
8
+ 1.0,
9
+ 10.0,
10
+ 30.0,
11
+ 100.0
12
+ ],
13
+ "num_rates": 10,
14
+ "prompt_options": {
15
+ "num_tokens": 200,
16
+ "min_tokens": 180,
17
+ "max_tokens": 220,
18
+ "variance": 10
19
+ },
20
+ "decode_options": {
21
+ "num_tokens": 200,
22
+ "min_tokens": 180,
23
+ "max_tokens": 220,
24
+ "variance": 10
25
+ },
26
+ "tokenizer": "RedHatAI/phi-4-FP8-dynamic",
27
+ "model_name": "phi-4",
28
+ "profile": null,
29
+ "meta": null,
30
+ "run_id": "vLLM: RedHatAI/phi-4-FP8-dynamic (200 tokens)"
31
+ },
32
+ "results": [
33
+ {
34
+ "id": "warmup",
35
+ "executor_type": "ConstantVUs",
36
+ "config": {
37
+ "max_vus": 1,
38
+ "duration_secs": 30,
39
+ "rate": null
40
+ },
41
+ "total_requests": 7,
42
+ "total_tokens": 1401,
43
+ "token_throughput_secs": 41.207311909734074,
44
+ "duration_ms": 33998,
45
+ "time_to_first_token_ms": {
46
+ "p50": 30.74,
47
+ "p60": 30.848,
48
+ "p70": 31.032,
49
+ "p80": 31.367,
50
+ "p90": 600.369,
51
+ "p95": 1027.036,
52
+ "p99": 1368.37,
53
+ "avg": 233.964
54
+ },
55
+ "inter_token_latency_ms": {
56
+ "p50": 23.217,
57
+ "p60": 23.222,
58
+ "p70": 23.228,
59
+ "p80": 23.236,
60
+ "p90": 23.248,
61
+ "p95": 23.254,
62
+ "p99": 23.26,
63
+ "avg": 23.213
64
+ },
65
+ "failed_requests": 0,
66
+ "successful_requests": 7,
67
+ "request_rate": 0.2058894956232252,
68
+ "total_tokens_sent": 1400,
69
+ "e2e_latency_ms": {
70
+ "p50": 4743.409,
71
+ "p60": 4751.971,
72
+ "p70": 4775.205,
73
+ "p80": 4827.785,
74
+ "p90": 5318.839,
75
+ "p95": 5673.985,
76
+ "p99": 5958.102,
77
+ "avg": 4856.823
78
+ }
79
+ },
80
+ {
81
+ "id": "[email protected]/s",
82
+ "executor_type": "ConstantArrivalRate",
83
+ "config": {
84
+ "max_vus": 800,
85
+ "duration_secs": 120,
86
+ "rate": 1.0
87
+ },
88
+ "total_requests": 115,
89
+ "total_tokens": 22163,
90
+ "token_throughput_secs": 186.64991064360598,
91
+ "duration_ms": 118741,
92
+ "time_to_first_token_ms": {
93
+ "p50": 43.445,
94
+ "p60": 45.341,
95
+ "p70": 47.407,
96
+ "p80": 50.324,
97
+ "p90": 53.509,
98
+ "p95": 54.94,
99
+ "p99": 57.022,
100
+ "avg": 43.314
101
+ },
102
+ "inter_token_latency_ms": {
103
+ "p50": 24.082,
104
+ "p60": 24.1,
105
+ "p70": 24.124,
106
+ "p80": 24.146,
107
+ "p90": 24.21,
108
+ "p95": 24.288,
109
+ "p99": 24.376,
110
+ "avg": 24.09
111
+ },
112
+ "failed_requests": 0,
113
+ "successful_requests": 115,
114
+ "request_rate": 0.9684943249566704,
115
+ "total_tokens_sent": 23000,
116
+ "e2e_latency_ms": {
117
+ "p50": 4814.201,
118
+ "p60": 4873.26,
119
+ "p70": 4947.365,
120
+ "p80": 5011.934,
121
+ "p90": 5104.903,
122
+ "p95": 5182.844,
123
+ "p99": 5309.301,
124
+ "avg": 4665.197
125
+ }
126
+ },
127
+ {
128
+ "id": "[email protected]/s",
129
+ "executor_type": "ConstantArrivalRate",
130
+ "config": {
131
+ "max_vus": 800,
132
+ "duration_secs": 120,
133
+ "rate": 10.0
134
+ },
135
+ "total_requests": 1149,
136
+ "total_tokens": 217686,
137
+ "token_throughput_secs": 1837.4411468828155,
138
+ "duration_ms": 118472,
139
+ "time_to_first_token_ms": {
140
+ "p50": 55.249,
141
+ "p60": 57.796,
142
+ "p70": 60.296,
143
+ "p80": 63.162,
144
+ "p90": 66.14,
145
+ "p95": 67.799,
146
+ "p99": 70.85,
147
+ "avg": 55.52
148
+ },
149
+ "inter_token_latency_ms": {
150
+ "p50": 28.914,
151
+ "p60": 28.973,
152
+ "p70": 29.029,
153
+ "p80": 29.089,
154
+ "p90": 29.168,
155
+ "p95": 29.211,
156
+ "p99": 29.331,
157
+ "avg": 28.737
158
+ },
159
+ "failed_requests": 0,
160
+ "successful_requests": 1149,
161
+ "request_rate": 9.698464199665366,
162
+ "total_tokens_sent": 229800,
163
+ "e2e_latency_ms": {
164
+ "p50": 5707.118,
165
+ "p60": 5793.95,
166
+ "p70": 5885.254,
167
+ "p80": 5983.201,
168
+ "p90": 6126.889,
169
+ "p95": 6219.476,
170
+ "p99": 6386.803,
171
+ "avg": 5477.946
172
+ }
173
+ },
174
+ {
175
+ "id": "[email protected]/s",
176
+ "executor_type": "ConstantArrivalRate",
177
+ "config": {
178
+ "max_vus": 800,
179
+ "duration_secs": 120,
180
+ "rate": 30.0
181
+ },
182
+ "total_requests": 1889,
183
+ "total_tokens": 348708,
184
+ "token_throughput_secs": 2911.7479692043544,
185
+ "duration_ms": 119758,
186
+ "time_to_first_token_ms": {
187
+ "p50": 22192.744,
188
+ "p60": 26837.194,
189
+ "p70": 29205.612,
190
+ "p80": 33069.312,
191
+ "p90": 35968.562,
192
+ "p95": 36825.858,
193
+ "p99": 37298.867,
194
+ "avg": 19829.052
195
+ },
196
+ "inter_token_latency_ms": {
197
+ "p50": 64.987,
198
+ "p60": 66.093,
199
+ "p70": 67.344,
200
+ "p80": 72.108,
201
+ "p90": 90.713,
202
+ "p95": 98.38,
203
+ "p99": 177.348,
204
+ "avg": 69.926
205
+ },
206
+ "failed_requests": 0,
207
+ "successful_requests": 1889,
208
+ "request_rate": 15.77334593363796,
209
+ "total_tokens_sent": 377800,
210
+ "e2e_latency_ms": {
211
+ "p50": 33837.749,
212
+ "p60": 38364.805,
213
+ "p70": 42612.972,
214
+ "p80": 45779.935,
215
+ "p90": 48249.655,
216
+ "p95": 49268.594,
217
+ "p99": 50884.661,
218
+ "avg": 32263.266
219
+ }
220
+ },
221
+ {
222
+ "id": "[email protected]/s",
223
+ "executor_type": "ConstantArrivalRate",
224
+ "config": {
225
+ "max_vus": 800,
226
+ "duration_secs": 120,
227
+ "rate": 100.0
228
+ },
229
+ "total_requests": 1923,
230
+ "total_tokens": 355495,
231
+ "token_throughput_secs": 2963.3510051149824,
232
+ "duration_ms": 119963,
233
+ "time_to_first_token_ms": {
234
+ "p50": 30849.07,
235
+ "p60": 32647.17,
236
+ "p70": 35695.762,
237
+ "p80": 36657.309,
238
+ "p90": 37063.893,
239
+ "p95": 37265.804,
240
+ "p99": 37693.244,
241
+ "avg": 25983.203
242
+ },
243
+ "inter_token_latency_ms": {
244
+ "p50": 64.756,
245
+ "p60": 66.434,
246
+ "p70": 68.803,
247
+ "p80": 83.204,
248
+ "p90": 96.295,
249
+ "p95": 103.874,
250
+ "p99": 163.895,
251
+ "avg": 73.033
252
+ },
253
+ "failed_requests": 0,
254
+ "successful_requests": 1923,
255
+ "request_rate": 16.02982878194099,
256
+ "total_tokens_sent": 384600,
257
+ "e2e_latency_ms": {
258
+ "p50": 44432.763,
259
+ "p60": 46273.082,
260
+ "p70": 47729.904,
261
+ "p80": 48714.768,
262
+ "p90": 49917.33,
263
+ "p95": 50686.527,
264
+ "p99": 51992.951,
265
+ "avg": 38685.294
266
+ }
267
+ }
268
+ ],
269
+ "start_time": "2025-05-21T09:04:59.479961191+00:00",
270
+ "end_time": "2025-05-21T09:15:05.115323148+00:00",
271
+ "system": {
272
+ "cpu": [
273
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu0@4699MHz",
274
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu1@4699MHz",
275
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu2@4699MHz",
276
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu3@4699MHz",
277
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu4@4699MHz",
278
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu5@4699MHz",
279
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu6@4699MHz",
280
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu7@4699MHz",
281
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu8@4699MHz",
282
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu9@4699MHz",
283
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu10@4699MHz",
284
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu11@4699MHz",
285
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu12@4699MHz",
286
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu13@4699MHz",
287
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu14@4699MHz",
288
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu15@4699MHz"
289
+ ],
290
+ "memory": "83.47 GB",
291
+ "os_name": "Debian GNU/Linux",
292
+ "os_version": "11",
293
+ "kernel": "5.15.167.4-microsoft-standard-WSL2",
294
+ "hostname": "computer"
295
+ }
296
+ }
results/RedHatAI_phi-4-FP8-dynamic_2025-05-21-13-56-47.json ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "max_vus": 800,
4
+ "duration_secs": 120,
5
+ "benchmark_kind": "Rate",
6
+ "warmup_duration_secs": 30,
7
+ "rates": [
8
+ 1.0,
9
+ 10.0,
10
+ 30.0,
11
+ 100.0
12
+ ],
13
+ "num_rates": 10,
14
+ "prompt_options": {
15
+ "num_tokens": 8000,
16
+ "min_tokens": 7980,
17
+ "max_tokens": 8020,
18
+ "variance": 10
19
+ },
20
+ "decode_options": {
21
+ "num_tokens": 8000,
22
+ "min_tokens": 7980,
23
+ "max_tokens": 8020,
24
+ "variance": 10
25
+ },
26
+ "tokenizer": "RedHatAI/phi-4-FP8-dynamic",
27
+ "model_name": "phi-4",
28
+ "profile": null,
29
+ "meta": null,
30
+ "run_id": "vLLM: RedHatAI/phi-4-FP8-dynamic (8000 tokens)"
31
+ },
32
+ "results": [
33
+ {
34
+ "id": "warmup",
35
+ "executor_type": "ConstantVUs",
36
+ "config": {
37
+ "max_vus": 1,
38
+ "duration_secs": 30,
39
+ "rate": null
40
+ },
41
+ "total_requests": 2,
42
+ "total_tokens": 1643,
43
+ "token_throughput_secs": 38.490013255851395,
44
+ "duration_ms": 42686,
45
+ "time_to_first_token_ms": {
46
+ "p50": 1276.801,
47
+ "p60": 1388.913,
48
+ "p70": 1501.026,
49
+ "p80": 1613.139,
50
+ "p90": 1725.252,
51
+ "p95": 1781.309,
52
+ "p99": 1826.154,
53
+ "avg": 1276.801
54
+ },
55
+ "inter_token_latency_ms": {
56
+ "p50": 24.424,
57
+ "p60": 24.432,
58
+ "p70": 24.44,
59
+ "p80": 24.448,
60
+ "p90": 24.456,
61
+ "p95": 24.46,
62
+ "p99": 24.463,
63
+ "avg": 24.424
64
+ },
65
+ "failed_requests": 0,
66
+ "successful_requests": 2,
67
+ "request_rate": 0.0468533332390157,
68
+ "total_tokens_sent": 16000,
69
+ "e2e_latency_ms": {
70
+ "p50": 21343.075,
71
+ "p60": 21391.438,
72
+ "p70": 21439.801,
73
+ "p80": 21488.164,
74
+ "p90": 21536.527,
75
+ "p95": 21560.709,
76
+ "p99": 21580.054,
77
+ "avg": 21343.075
78
+ }
79
+ },
80
+ {
81
+ "id": "[email protected]/s",
82
+ "executor_type": "ConstantArrivalRate",
83
+ "config": {
84
+ "max_vus": 800,
85
+ "duration_secs": 120,
86
+ "rate": 1.0
87
+ },
88
+ "total_requests": 90,
89
+ "total_tokens": 55892,
90
+ "token_throughput_secs": 478.696852515677,
91
+ "duration_ms": 116758,
92
+ "time_to_first_token_ms": {
93
+ "p50": 118.856,
94
+ "p60": 124.707,
95
+ "p70": 131.654,
96
+ "p80": 135.562,
97
+ "p90": 145.529,
98
+ "p95": 150.366,
99
+ "p99": 715.649,
100
+ "avg": 128.611
101
+ },
102
+ "inter_token_latency_ms": {
103
+ "p50": 45.758,
104
+ "p60": 46.229,
105
+ "p70": 46.314,
106
+ "p80": 46.373,
107
+ "p90": 46.483,
108
+ "p95": 46.581,
109
+ "p99": 46.871,
110
+ "avg": 43.271
111
+ },
112
+ "failed_requests": 0,
113
+ "successful_requests": 90,
114
+ "request_rate": 0.7708208102485317,
115
+ "total_tokens_sent": 720000,
116
+ "e2e_latency_ms": {
117
+ "p50": 27887.256,
118
+ "p60": 30188.411,
119
+ "p70": 31661.903,
120
+ "p80": 35685.812,
121
+ "p90": 45661.636,
122
+ "p95": 50093.628,
123
+ "p99": 59727.184,
124
+ "avg": 27093.895
125
+ }
126
+ },
127
+ {
128
+ "id": "[email protected]/s",
129
+ "executor_type": "ConstantArrivalRate",
130
+ "config": {
131
+ "max_vus": 800,
132
+ "duration_secs": 120,
133
+ "rate": 10.0
134
+ },
135
+ "total_requests": 97,
136
+ "total_tokens": 45779,
137
+ "token_throughput_secs": 385.8671945353039,
138
+ "duration_ms": 118639,
139
+ "time_to_first_token_ms": {
140
+ "p50": 264.625,
141
+ "p60": 314.639,
142
+ "p70": 341.786,
143
+ "p80": 416.021,
144
+ "p90": 502.604,
145
+ "p95": 608.336,
146
+ "p99": 712.908,
147
+ "avg": 278.878
148
+ },
149
+ "inter_token_latency_ms": {
150
+ "p50": 152.068,
151
+ "p60": 183.639,
152
+ "p70": 208.294,
153
+ "p80": 210.057,
154
+ "p90": 211.894,
155
+ "p95": 421.244,
156
+ "p99": 436.578,
157
+ "avg": 190.502
158
+ },
159
+ "failed_requests": 0,
160
+ "successful_requests": 97,
161
+ "request_rate": 0.8176045319890011,
162
+ "total_tokens_sent": 776000,
163
+ "e2e_latency_ms": {
164
+ "p50": 89809.719,
165
+ "p60": 90599.198,
166
+ "p70": 97086.861,
167
+ "p80": 97763.592,
168
+ "p90": 102705.608,
169
+ "p95": 105891.319,
170
+ "p99": 109209.372,
171
+ "avg": 80168.287
172
+ }
173
+ },
174
+ {
175
+ "id": "[email protected]/s",
176
+ "executor_type": "ConstantArrivalRate",
177
+ "config": {
178
+ "max_vus": 800,
179
+ "duration_secs": 120,
180
+ "rate": 30.0
181
+ },
182
+ "total_requests": 108,
183
+ "total_tokens": 48755,
184
+ "token_throughput_secs": 408.5182278415837,
185
+ "duration_ms": 119345,
186
+ "time_to_first_token_ms": {
187
+ "p50": 315.639,
188
+ "p60": 364.113,
189
+ "p70": 440.936,
190
+ "p80": 517.15,
191
+ "p90": 635.496,
192
+ "p95": 743.467,
193
+ "p99": 886.077,
194
+ "avg": 348.945
195
+ },
196
+ "inter_token_latency_ms": {
197
+ "p50": 172.827,
198
+ "p60": 189.057,
199
+ "p70": 196.538,
200
+ "p80": 201.266,
201
+ "p90": 442.975,
202
+ "p95": 465.991,
203
+ "p99": 473.842,
204
+ "avg": 207.845
205
+ },
206
+ "failed_requests": 0,
207
+ "successful_requests": 108,
208
+ "request_rate": 0.9049321835071489,
209
+ "total_tokens_sent": 864000,
210
+ "e2e_latency_ms": {
211
+ "p50": 89868.756,
212
+ "p60": 96902.23,
213
+ "p70": 98937.333,
214
+ "p80": 102789.849,
215
+ "p90": 109541.9,
216
+ "p95": 111388.456,
217
+ "p99": 114281.927,
218
+ "avg": 82072.638
219
+ }
220
+ },
221
+ {
222
+ "id": "[email protected]/s",
223
+ "executor_type": "ConstantArrivalRate",
224
+ "config": {
225
+ "max_vus": 800,
226
+ "duration_secs": 120,
227
+ "rate": 100.0
228
+ },
229
+ "total_requests": 125,
230
+ "total_tokens": 57918,
231
+ "token_throughput_secs": 485.359321343381,
232
+ "duration_ms": 119330,
233
+ "time_to_first_token_ms": {
234
+ "p50": 1154.434,
235
+ "p60": 1276.393,
236
+ "p70": 1440.368,
237
+ "p80": 1604.069,
238
+ "p90": 1768.54,
239
+ "p95": 1850.13,
240
+ "p99": 1919.678,
241
+ "avg": 1208.132
242
+ },
243
+ "inter_token_latency_ms": {
244
+ "p50": 166.875,
245
+ "p60": 166.884,
246
+ "p70": 167.245,
247
+ "p80": 188.28,
248
+ "p90": 350.172,
249
+ "p95": 417.485,
250
+ "p99": 437.566,
251
+ "avg": 186.06
252
+ },
253
+ "failed_requests": 0,
254
+ "successful_requests": 125,
255
+ "request_rate": 1.047513988188864,
256
+ "total_tokens_sent": 1000000,
257
+ "e2e_latency_ms": {
258
+ "p50": 82803.004,
259
+ "p60": 89976.229,
260
+ "p70": 90374.914,
261
+ "p80": 99727.225,
262
+ "p90": 108866.194,
263
+ "p95": 113444.528,
264
+ "p99": 116545.189,
265
+ "avg": 77917.015
266
+ }
267
+ }
268
+ ],
269
+ "start_time": "2025-05-21T13:41:44.260015742+00:00",
270
+ "end_time": "2025-05-21T13:56:47.150683889+00:00",
271
+ "system": {
272
+ "cpu": [
273
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu0@4699MHz",
274
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu1@4699MHz",
275
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu2@4699MHz",
276
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu3@4699MHz",
277
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu4@4699MHz",
278
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu5@4699MHz",
279
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu6@4699MHz",
280
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu7@4699MHz",
281
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu8@4699MHz",
282
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu9@4699MHz",
283
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu10@4699MHz",
284
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu11@4699MHz",
285
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu12@4699MHz",
286
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu13@4699MHz",
287
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu14@4699MHz",
288
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu15@4699MHz"
289
+ ],
290
+ "memory": "83.47 GB",
291
+ "os_name": "Debian GNU/Linux",
292
+ "os_version": "11",
293
+ "kernel": "5.15.167.4-microsoft-standard-WSL2",
294
+ "hostname": "computer"
295
+ }
296
+ }
results/microsoft_phi-4_2025-05-21-12-47-52.json ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "max_vus": 800,
4
+ "duration_secs": 120,
5
+ "benchmark_kind": "Rate",
6
+ "warmup_duration_secs": 30,
7
+ "rates": [
8
+ 1.0,
9
+ 10.0,
10
+ 30.0,
11
+ 100.0
12
+ ],
13
+ "num_rates": 10,
14
+ "prompt_options": {
15
+ "num_tokens": 200,
16
+ "min_tokens": 180,
17
+ "max_tokens": 220,
18
+ "variance": 10
19
+ },
20
+ "decode_options": {
21
+ "num_tokens": 200,
22
+ "min_tokens": 180,
23
+ "max_tokens": 220,
24
+ "variance": 10
25
+ },
26
+ "tokenizer": "microsoft/phi-4",
27
+ "model_name": "phi-4",
28
+ "profile": null,
29
+ "meta": null,
30
+ "run_id": "Ollama: unsloth/phi-4-GGUF:Q8_0 (200 tokens)"
31
+ },
32
+ "results": [
33
+ {
34
+ "id": "warmup",
35
+ "executor_type": "ConstantVUs",
36
+ "config": {
37
+ "max_vus": 1,
38
+ "duration_secs": 30,
39
+ "rate": null
40
+ },
41
+ "total_requests": 17,
42
+ "total_tokens": 2560,
43
+ "token_throughput_secs": 81.92346820970964,
44
+ "duration_ms": 31248,
45
+ "time_to_first_token_ms": {
46
+ "p50": 48.023,
47
+ "p60": 48.316,
48
+ "p70": 48.704,
49
+ "p80": 49.172,
50
+ "p90": 50.133,
51
+ "p95": 79.141,
52
+ "p99": 171.884,
53
+ "avg": 56.904
54
+ },
55
+ "inter_token_latency_ms": {
56
+ "p50": 11.835,
57
+ "p60": 11.849,
58
+ "p70": 11.866,
59
+ "p80": 11.888,
60
+ "p90": 11.999,
61
+ "p95": 12.031,
62
+ "p99": 12.057,
63
+ "avg": 11.863
64
+ },
65
+ "failed_requests": 0,
66
+ "successful_requests": 17,
67
+ "request_rate": 0.5440230310801031,
68
+ "total_tokens_sent": 3400,
69
+ "e2e_latency_ms": {
70
+ "p50": 2193.161,
71
+ "p60": 2256.189,
72
+ "p70": 2409.636,
73
+ "p80": 2503.287,
74
+ "p90": 2558.373,
75
+ "p95": 2565.267,
76
+ "p99": 2582.093,
77
+ "avg": 1837.986
78
+ }
79
+ },
80
+ {
81
+ "id": "[email protected]/s",
82
+ "executor_type": "ConstantArrivalRate",
83
+ "config": {
84
+ "max_vus": 800,
85
+ "duration_secs": 120,
86
+ "rate": 1.0
87
+ },
88
+ "total_requests": 68,
89
+ "total_tokens": 13393,
90
+ "token_throughput_secs": 113.50678834081126,
91
+ "duration_ms": 117992,
92
+ "time_to_first_token_ms": {
93
+ "p50": 23628.355,
94
+ "p60": 28364.866,
95
+ "p70": 33468.314,
96
+ "p80": 37116.28,
97
+ "p90": 42197.075,
98
+ "p95": 44792.584,
99
+ "p99": 46808.871,
100
+ "avg": 23527.531
101
+ },
102
+ "inter_token_latency_ms": {
103
+ "p50": 17.148,
104
+ "p60": 17.164,
105
+ "p70": 17.183,
106
+ "p80": 17.199,
107
+ "p90": 17.22,
108
+ "p95": 17.235,
109
+ "p99": 17.256,
110
+ "avg": 17.123
111
+ },
112
+ "failed_requests": 0,
113
+ "successful_requests": 68,
114
+ "request_rate": 0.5763056527421164,
115
+ "total_tokens_sent": 13600,
116
+ "e2e_latency_ms": {
117
+ "p50": 26918.292,
118
+ "p60": 31837.746,
119
+ "p70": 36426.629,
120
+ "p80": 40565.391,
121
+ "p90": 45507.834,
122
+ "p95": 48259.487,
123
+ "p99": 50280.92,
124
+ "avg": 26884.974
125
+ }
126
+ },
127
+ {
128
+ "id": "[email protected]/s",
129
+ "executor_type": "ConstantArrivalRate",
130
+ "config": {
131
+ "max_vus": 800,
132
+ "duration_secs": 120,
133
+ "rate": 10.0
134
+ },
135
+ "total_requests": 69,
136
+ "total_tokens": 13411,
137
+ "token_throughput_secs": 112.91469560470007,
138
+ "duration_ms": 118771,
139
+ "time_to_first_token_ms": {
140
+ "p50": 54889.419,
141
+ "p60": 66226.724,
142
+ "p70": 77657.43,
143
+ "p80": 87194.269,
144
+ "p90": 97361.153,
145
+ "p95": 102660.303,
146
+ "p99": 106894.626,
147
+ "avg": 54527.075
148
+ },
149
+ "inter_token_latency_ms": {
150
+ "p50": 17.284,
151
+ "p60": 17.295,
152
+ "p70": 17.305,
153
+ "p80": 17.328,
154
+ "p90": 17.385,
155
+ "p95": 17.394,
156
+ "p99": 17.447,
157
+ "avg": 17.279
158
+ },
159
+ "failed_requests": 0,
160
+ "successful_requests": 69,
161
+ "request_rate": 0.5809495188072705,
162
+ "total_tokens_sent": 13800,
163
+ "e2e_latency_ms": {
164
+ "p50": 58021.804,
165
+ "p60": 69751.13,
166
+ "p70": 80116.293,
167
+ "p80": 90587.03,
168
+ "p90": 100535.513,
169
+ "p95": 105903.68,
170
+ "p99": 110535.65,
171
+ "avg": 57868.946
172
+ }
173
+ },
174
+ {
175
+ "id": "[email protected]/s",
176
+ "executor_type": "ConstantArrivalRate",
177
+ "config": {
178
+ "max_vus": 800,
179
+ "duration_secs": 120,
180
+ "rate": 30.0
181
+ },
182
+ "total_requests": 70,
183
+ "total_tokens": 13581,
184
+ "token_throughput_secs": 113.61611267427078,
185
+ "duration_ms": 119534,
186
+ "time_to_first_token_ms": {
187
+ "p50": 56313.526,
188
+ "p60": 68465.8,
189
+ "p70": 78580.113,
190
+ "p80": 90639.114,
191
+ "p90": 102040.301,
192
+ "p95": 108031.928,
193
+ "p99": 112499.04,
194
+ "avg": 56639.341
195
+ },
196
+ "inter_token_latency_ms": {
197
+ "p50": 17.172,
198
+ "p60": 17.182,
199
+ "p70": 17.217,
200
+ "p80": 17.235,
201
+ "p90": 17.256,
202
+ "p95": 17.31,
203
+ "p99": 17.346,
204
+ "avg": 17.18
205
+ },
206
+ "failed_requests": 0,
207
+ "successful_requests": 70,
208
+ "request_rate": 0.5856069425814708,
209
+ "total_tokens_sent": 14000,
210
+ "e2e_latency_ms": {
211
+ "p50": 59683.651,
212
+ "p60": 71746.875,
213
+ "p70": 81953.181,
214
+ "p80": 94277.653,
215
+ "p90": 105378.271,
216
+ "p95": 111453.36,
217
+ "p99": 115949.496,
218
+ "avg": 59958.385
219
+ }
220
+ },
221
+ {
222
+ "id": "[email protected]/s",
223
+ "executor_type": "ConstantArrivalRate",
224
+ "config": {
225
+ "max_vus": 800,
226
+ "duration_secs": 120,
227
+ "rate": 100.0
228
+ },
229
+ "total_requests": 70,
230
+ "total_tokens": 13359,
231
+ "token_throughput_secs": 114.42379660997986,
232
+ "duration_ms": 116750,
233
+ "time_to_first_token_ms": {
234
+ "p50": 57218.949,
235
+ "p60": 67960.841,
236
+ "p70": 79764.715,
237
+ "p80": 91579.471,
238
+ "p90": 102620.956,
239
+ "p95": 107961.016,
240
+ "p99": 112866.279,
241
+ "avg": 56772.876
242
+ },
243
+ "inter_token_latency_ms": {
244
+ "p50": 17.171,
245
+ "p60": 17.189,
246
+ "p70": 17.201,
247
+ "p80": 17.215,
248
+ "p90": 17.245,
249
+ "p95": 17.299,
250
+ "p99": 17.353,
251
+ "avg": 17.179
252
+ },
253
+ "failed_requests": 0,
254
+ "successful_requests": 70,
255
+ "request_rate": 0.5995707584922966,
256
+ "total_tokens_sent": 14000,
257
+ "e2e_latency_ms": {
258
+ "p50": 60551.916,
259
+ "p60": 71380.408,
260
+ "p70": 83198.203,
261
+ "p80": 93909.886,
262
+ "p90": 105788.774,
263
+ "p95": 111364.807,
264
+ "p99": 115968.729,
265
+ "avg": 60037.39
266
+ }
267
+ }
268
+ ],
269
+ "start_time": "2025-05-21T12:32:04.299141299+00:00",
270
+ "end_time": "2025-05-21T12:47:52.695866821+00:00",
271
+ "system": {
272
+ "cpu": [
273
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu0@4699MHz",
274
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu1@4699MHz",
275
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu2@4699MHz",
276
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu3@4699MHz",
277
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu4@4699MHz",
278
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu5@4699MHz",
279
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu6@4699MHz",
280
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu7@4699MHz",
281
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu8@4699MHz",
282
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu9@4699MHz",
283
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu10@4699MHz",
284
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu11@4699MHz",
285
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu12@4699MHz",
286
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu13@4699MHz",
287
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu14@4699MHz",
288
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu15@4699MHz"
289
+ ],
290
+ "memory": "83.47 GB",
291
+ "os_name": "Debian GNU/Linux",
292
+ "os_version": "11",
293
+ "kernel": "5.15.167.4-microsoft-standard-WSL2",
294
+ "hostname": "computer"
295
+ }
296
+ }
results/microsoft_phi-4_2025-05-21-13-17-26.json ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config": {
3
+ "max_vus": 800,
4
+ "duration_secs": 120,
5
+ "benchmark_kind": "Rate",
6
+ "warmup_duration_secs": 30,
7
+ "rates": [
8
+ 1.0,
9
+ 10.0,
10
+ 30.0,
11
+ 100.0
12
+ ],
13
+ "num_rates": 10,
14
+ "prompt_options": {
15
+ "num_tokens": 200,
16
+ "min_tokens": 180,
17
+ "max_tokens": 220,
18
+ "variance": 10
19
+ },
20
+ "decode_options": {
21
+ "num_tokens": 200,
22
+ "min_tokens": 180,
23
+ "max_tokens": 220,
24
+ "variance": 10
25
+ },
26
+ "tokenizer": "microsoft/phi-4",
27
+ "model_name": "phi-4",
28
+ "profile": null,
29
+ "meta": null,
30
+ "run_id": "LM Studio: lmstudio-community/phi-4-GGUF:Q8_0 (200 tokens)"
31
+ },
32
+ "results": [
33
+ {
34
+ "id": "warmup",
35
+ "executor_type": "ConstantVUs",
36
+ "config": {
37
+ "max_vus": 1,
38
+ "duration_secs": 30,
39
+ "rate": null
40
+ },
41
+ "total_requests": 13,
42
+ "total_tokens": 2610,
43
+ "token_throughput_secs": 83.60700961692694,
44
+ "duration_ms": 31217,
45
+ "time_to_first_token_ms": {
46
+ "p50": 90.517,
47
+ "p60": 93.25,
48
+ "p70": 102.443,
49
+ "p80": 109.227,
50
+ "p90": 130.959,
51
+ "p95": 207.294,
52
+ "p99": 293.629,
53
+ "avg": 108.58
54
+ },
55
+ "inter_token_latency_ms": {
56
+ "p50": 11.513,
57
+ "p60": 11.519,
58
+ "p70": 11.534,
59
+ "p80": 11.548,
60
+ "p90": 11.559,
61
+ "p95": 11.574,
62
+ "p99": 11.589,
63
+ "avg": 11.472
64
+ },
65
+ "failed_requests": 0,
66
+ "successful_requests": 13,
67
+ "request_rate": 0.41643338123373574,
68
+ "total_tokens_sent": 2600,
69
+ "e2e_latency_ms": {
70
+ "p50": 2419.372,
71
+ "p60": 2423.796,
72
+ "p70": 2432.426,
73
+ "p80": 2458.236,
74
+ "p90": 2525.006,
75
+ "p95": 2596.86,
76
+ "p99": 2667.757,
77
+ "avg": 2401.195
78
+ }
79
+ },
80
+ {
81
+ "id": "[email protected]/s",
82
+ "executor_type": "ConstantArrivalRate",
83
+ "config": {
84
+ "max_vus": 800,
85
+ "duration_secs": 120,
86
+ "rate": 1.0
87
+ },
88
+ "total_requests": 52,
89
+ "total_tokens": 9915,
90
+ "token_throughput_secs": 84.1224984364473,
91
+ "duration_ms": 117863,
92
+ "time_to_first_token_ms": {
93
+ "p50": 31149.018,
94
+ "p60": 38159.307,
95
+ "p70": 44798.95,
96
+ "p80": 51599.01,
97
+ "p90": 58334.517,
98
+ "p95": 61414.588,
99
+ "p99": 63915.289,
100
+ "avg": 32379.62
101
+ },
102
+ "inter_token_latency_ms": {
103
+ "p50": 11.473,
104
+ "p60": 11.501,
105
+ "p70": 11.517,
106
+ "p80": 11.529,
107
+ "p90": 11.563,
108
+ "p95": 11.598,
109
+ "p99": 11.775,
110
+ "avg": 11.254
111
+ },
112
+ "failed_requests": 0,
113
+ "successful_requests": 52,
114
+ "request_rate": 0.4411870820670963,
115
+ "total_tokens_sent": 10400,
116
+ "e2e_latency_ms": {
117
+ "p50": 33388.263,
118
+ "p60": 40395.415,
119
+ "p70": 47230.795,
120
+ "p80": 53979.194,
121
+ "p90": 60382.07,
122
+ "p95": 63519.032,
123
+ "p99": 66184.234,
124
+ "avg": 34556.301
125
+ }
126
+ },
127
+ {
128
+ "id": "[email protected]/s",
129
+ "executor_type": "ConstantArrivalRate",
130
+ "config": {
131
+ "max_vus": 800,
132
+ "duration_secs": 120,
133
+ "rate": 10.0
134
+ },
135
+ "total_requests": 51,
136
+ "total_tokens": 10041,
137
+ "token_throughput_secs": 84.04049965954646,
138
+ "duration_ms": 119478,
139
+ "time_to_first_token_ms": {
140
+ "p50": 55889.645,
141
+ "p60": 67098.347,
142
+ "p70": 78905.359,
143
+ "p80": 90289.182,
144
+ "p90": 101201.112,
145
+ "p95": 106805.272,
146
+ "p99": 111193.127,
147
+ "avg": 56139.066
148
+ },
149
+ "inter_token_latency_ms": {
150
+ "p50": 11.487,
151
+ "p60": 11.498,
152
+ "p70": 11.51,
153
+ "p80": 11.536,
154
+ "p90": 11.584,
155
+ "p95": 11.638,
156
+ "p99": 11.883,
157
+ "avg": 11.474
158
+ },
159
+ "failed_requests": 0,
160
+ "successful_requests": 51,
161
+ "request_rate": 0.4268564368725096,
162
+ "total_tokens_sent": 10200,
163
+ "e2e_latency_ms": {
164
+ "p50": 58084.912,
165
+ "p60": 69432.711,
166
+ "p70": 81080.254,
167
+ "p80": 92442.614,
168
+ "p90": 103527.041,
169
+ "p95": 108999.672,
170
+ "p99": 113397.637,
171
+ "avg": 58387.662
172
+ }
173
+ },
174
+ {
175
+ "id": "[email protected]/s",
176
+ "executor_type": "ConstantArrivalRate",
177
+ "config": {
178
+ "max_vus": 800,
179
+ "duration_secs": 120,
180
+ "rate": 30.0
181
+ },
182
+ "total_requests": 51,
183
+ "total_tokens": 9889,
184
+ "token_throughput_secs": 84.08188681268076,
185
+ "duration_ms": 117611,
186
+ "time_to_first_token_ms": {
187
+ "p50": 55982.506,
188
+ "p60": 68000.692,
189
+ "p70": 79600.152,
190
+ "p80": 91108.706,
191
+ "p90": 101995.453,
192
+ "p95": 107929.312,
193
+ "p99": 112340.212,
194
+ "avg": 56754.648
195
+ },
196
+ "inter_token_latency_ms": {
197
+ "p50": 11.503,
198
+ "p60": 11.515,
199
+ "p70": 11.531,
200
+ "p80": 11.564,
201
+ "p90": 11.589,
202
+ "p95": 11.633,
203
+ "p99": 11.795,
204
+ "avg": 11.477
205
+ },
206
+ "failed_requests": 0,
207
+ "successful_requests": 51,
208
+ "request_rate": 0.43363092602353315,
209
+ "total_tokens_sent": 10200,
210
+ "e2e_latency_ms": {
211
+ "p50": 58352.067,
212
+ "p60": 70321.743,
213
+ "p70": 81960.377,
214
+ "p80": 93288.338,
215
+ "p90": 104277.554,
216
+ "p95": 110084.734,
217
+ "p99": 114675.842,
218
+ "avg": 58969.412
219
+ }
220
+ },
221
+ {
222
+ "id": "[email protected]/s",
223
+ "executor_type": "ConstantArrivalRate",
224
+ "config": {
225
+ "max_vus": 800,
226
+ "duration_secs": 120,
227
+ "rate": 100.0
228
+ },
229
+ "total_requests": 57,
230
+ "total_tokens": 9983,
231
+ "token_throughput_secs": 83.83914212119033,
232
+ "duration_ms": 119073,
233
+ "time_to_first_token_ms": {
234
+ "p50": 60425.652,
235
+ "p60": 73426.16,
236
+ "p70": 83375.468,
237
+ "p80": 96034.495,
238
+ "p90": 104082.959,
239
+ "p95": 110616.366,
240
+ "p99": 114826.821,
241
+ "avg": 59050.64
242
+ },
243
+ "inter_token_latency_ms": {
244
+ "p50": 11.528,
245
+ "p60": 11.552,
246
+ "p70": 11.577,
247
+ "p80": 11.595,
248
+ "p90": 11.625,
249
+ "p95": 11.656,
250
+ "p99": 11.7,
251
+ "avg": 11.281
252
+ },
253
+ "failed_requests": 0,
254
+ "successful_requests": 57,
255
+ "request_rate": 0.4786968948119652,
256
+ "total_tokens_sent": 11400,
257
+ "e2e_latency_ms": {
258
+ "p50": 62519.008,
259
+ "p60": 74991.853,
260
+ "p70": 85562.76,
261
+ "p80": 96625.366,
262
+ "p90": 106351.421,
263
+ "p95": 112531.399,
264
+ "p99": 117196.304,
265
+ "avg": 61050.657
266
+ }
267
+ }
268
+ ],
269
+ "start_time": "2025-05-21T13:01:17.074891817+00:00",
270
+ "end_time": "2025-05-21T13:17:26.396424745+00:00",
271
+ "system": {
272
+ "cpu": [
273
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu0@4699MHz",
274
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu1@4699MHz",
275
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu2@4699MHz",
276
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu3@4699MHz",
277
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu4@4699MHz",
278
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu5@4699MHz",
279
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu6@4699MHz",
280
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu7@4699MHz",
281
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu8@4699MHz",
282
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu9@4699MHz",
283
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu10@4699MHz",
284
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu11@4699MHz",
285
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu12@4699MHz",
286
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu13@4699MHz",
287
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu14@4699MHz",
288
+ "AMD Ryzen 7 9800X3D 8-Core Processor cpu15@4699MHz"
289
+ ],
290
+ "memory": "83.47 GB",
291
+ "os_name": "Debian GNU/Linux",
292
+ "os_version": "11",
293
+ "kernel": "5.15.167.4-microsoft-standard-WSL2",
294
+ "hostname": "computer"
295
+ }
296
+ }