Upload folder using huggingface_hub
- .gitattributes +1 -0
- artifacts/gptoss_run/gptoss_results.json +12 -0
- artifacts/megablocks_run/megablocks_results.json +12 -0
- artifacts/visualization/small_moe_comparison.png +3 -0
- cells/__pycache__/config.cpython-312.pyc +0 -0
- cells/__pycache__/utils.cpython-312.pyc +0 -0
- cells/config.py +35 -0
- cells/gptoss_run.py +149 -0
- cells/megablocks_run.py +112 -0
- cells/nvidia_dump.py +21 -0
- cells/save_data.py +67 -0
- cells/utils.py +143 -0
- cells/visualization.py +168 -0
- index.html +0 -0
- small_compare.html +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+artifacts/visualization/small_moe_comparison.png filter=lfs diff=lfs merge=lfs -text
artifacts/gptoss_run/gptoss_results.json
ADDED
@@ -0,0 +1,12 @@
+{
+  "avg_time_ms": 62.308485079556704,
+  "throughput_tokens_per_sec": 65737.43519474348,
+  "memory_allocated_gb": 1.329831600189209,
+  "memory_cached_gb": 1.8359375,
+  "memory_increase_gb": 0.3795137405395508,
+  "device": "cuda",
+  "dtype": "torch.bfloat16",
+  "tokens": 4096,
+  "warmup_iters": 10,
+  "timing_iters": 50
+}
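
Note (editorial aside, not part of the commit): the stored throughput is consistent with the other fields, since 4096 tokens / 62.31 ms ≈ 65,737 tokens/s. A minimal check, assuming the file is read from the repository root:

import json

r = json.load(open("artifacts/gptoss_run/gptoss_results.json"))
# throughput_tokens_per_sec should equal tokens / avg_time in seconds
assert abs(r["tokens"] / (r["avg_time_ms"] / 1000.0) - r["throughput_tokens_per_sec"]) < 1.0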
artifacts/megablocks_run/megablocks_results.json
ADDED
@@ -0,0 +1,12 @@
+{
+  "avg_time_ms": 26.93254135781899,
+  "throughput_tokens_per_sec": 152083.67994618745,
+  "memory_allocated_gb": 2.2425241470336914,
+  "memory_cached_gb": 4.14453125,
+  "memory_increase_gb": 1.2922062873840332,
+  "device": "cuda",
+  "dtype": "torch.bfloat16",
+  "tokens": 4096,
+  "warmup_iters": 10,
+  "timing_iters": 50
+}
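
Comparing the two results files (editorial aside, not part of the commit): MegaBlocks averages 26.93 ms per iteration versus 62.31 ms for the GPT-OSS reference, roughly a 2.3x speedup on the same 4096-token batch, while allocating more memory (2.24 GB vs 1.33 GB). A quick sketch of the ratios, assuming both files are read from the repository root:

import json

gpt = json.load(open("artifacts/gptoss_run/gptoss_results.json"))
mega = json.load(open("artifacts/megablocks_run/megablocks_results.json"))
print(gpt["avg_time_ms"] / mega["avg_time_ms"])                              # ~2.31x lower latency
print(mega["throughput_tokens_per_sec"] / gpt["throughput_tokens_per_sec"])  # ~2.31x higher throughput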
artifacts/visualization/small_moe_comparison.png
ADDED
Binary image file, tracked with Git LFS.
cells/__pycache__/config.cpython-312.pyc
ADDED
Binary file (1.03 kB).
cells/__pycache__/utils.cpython-312.pyc
ADDED
Binary file (7.31 kB).
cells/config.py
ADDED
@@ -0,0 +1,35 @@
+# /// script
+# dependencies = [
+#     "torch",
+#     "numpy",
+# ]
+# ///
+
+"""Configuration for MoE benchmarks."""
+import torch
+
+# Model configuration
+NUM_EXPERTS = 128
+HIDDEN_SIZE = 1152
+TOP_K = 4
+
+# Benchmark configuration
+BATCH_SIZE = 8
+SEQ_LEN = 512
+DTYPE = "bfloat16"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Seeds for reproducibility
+WEIGHT_SEED = 999
+EXPERT_SEED = 777
+INPUT_SEED = 123
+GENERAL_SEED = 42
+
+print(f"Configuration:")
+print(f" Experts: {NUM_EXPERTS}")
+print(f" Hidden size: {HIDDEN_SIZE}")
+print(f" Top-k: {TOP_K}")
+print(f" Batch size: {BATCH_SIZE}")
+print(f" Sequence length: {SEQ_LEN}")
+print(f" Device: {DEVICE}")
+print(f" Dtype: {DTYPE}")
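
Note (editorial aside, not part of the commit): with these settings each benchmark iteration processes BATCH_SIZE * SEQ_LEN = 8 * 512 = 4096 tokens, which is exactly the "tokens": 4096 value recorded in both results files above.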
cells/gptoss_run.py
ADDED
@@ -0,0 +1,149 @@
+# /// script
+# dependencies = [
+#     "torch",
+#     "numpy",
+# ]
+# ///
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from utils import to_dtype, tensor_stats, set_seed, bench_context
+from config import (
+    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+)
+from pathlib import Path
+import os
+
+# Discover the upstream artifact directory from env
+data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+# list all the files in the directory
+print(f"Loading weights from: {data_dir}")
+print(f"Files in directory: {list(Path(data_dir).glob('*'))}")
+
+router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+print("Loaded shared weights from artifacts")
+print(f"Router weight sum: {router_weight.sum().item():.6f}")
+print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+print(f"Down sum: {down_proj.sum().item():.6f}")
+
+class GptOssRouter(nn.Module):
+    def __init__(self, router_weight, router_bias):
+        super().__init__()
+        self.top_k = TOP_K
+        self.num_experts = NUM_EXPERTS
+        self.hidden_dim = HIDDEN_SIZE
+        self.weight = nn.Parameter(router_weight.clone())
+        self.bias = nn.Parameter(router_bias.clone())
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+        router_logits = F.linear(hidden_states, self.weight, self.bias)
+        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+        return router_scores, router_indices
+
+class GptOssExperts(nn.Module):
+    def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.num_experts = NUM_EXPERTS
+        self.hidden_size = HIDDEN_SIZE
+        self.expert_dim = self.hidden_size
+        self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+        self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+        self.down_proj = nn.Parameter(down_proj.clone())
+        self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+        self.alpha = 1.702
+        self.limit = 7.0
+
+    def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
+        batch_size = hidden_states.shape[0]
+        hidden_states = hidden_states.reshape(-1, self.hidden_size)
+        num_experts = routing_weights.shape[1]
+
+        if hidden_states.device.type == "cpu" or self.training:
+            next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+            with torch.no_grad():
+                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
+                expert_mask = expert_mask.permute(2, 1, 0)
+                expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+
+            for expert_idx in expert_hit[:]:
+                expert_idx = expert_idx[0]
+                with torch.no_grad():
+                    _, token_idx = torch.where(expert_mask[expert_idx])
+                current_state = hidden_states[token_idx]
+                gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
+                gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+                gate = gate.clamp(min=None, max=self.limit)
+                up = up.clamp(min=-self.limit, max=self.limit)
+                glu = gate * torch.sigmoid(gate * self.alpha)
+                gated_output = (up + 1) * glu
+                out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
+                weighted_output = out * routing_weights[token_idx, expert_idx, None]
+                next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+            next_states = next_states.view(batch_size, -1, self.hidden_size)
+        else:
+            hidden_states = hidden_states.repeat(num_experts, 1)
+            hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
+            gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
+            gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+            gate = gate.clamp(min=None, max=self.limit)
+            up = up.clamp(min=-self.limit, max=self.limit)
+            glu = gate * torch.sigmoid(gate * self.alpha)
+            next_states = torch.bmm(((up + 1) * glu), self.down_proj)
+            next_states = next_states + self.down_proj_bias[..., None, :]
+            next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
+            next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
+            next_states = next_states.sum(dim=0)
+        return next_states
+
+class GptOssMoEMLP(nn.Module):
+    def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+        super().__init__()
+        self.router = GptOssRouter(router_weight, router_bias)
+        self.experts = GptOssExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)
+
+    def forward(self, hidden_states):
+        router_scores, router_indices = self.router(hidden_states)
+        routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
+        return routed_out, router_scores
+
+# Run the model
+set_seed(GENERAL_SEED)
+
+device = torch.device(DEVICE)
+dtype = to_dtype(DTYPE)
+
+print("\n=== GPT-OSS Implementation ===")
+# Initialize model with loaded weights
+model = GptOssMoEMLP(
+    router_weight.to(device, dtype=dtype),
+    router_bias.to(device, dtype=dtype),
+    gate_up_proj.to(device, dtype=dtype),
+    gate_up_proj_bias.to(device, dtype=dtype),
+    down_proj.to(device, dtype=dtype),
+    down_proj_bias.to(device, dtype=dtype)
+).to(device=device, dtype=dtype)
+
+print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
+print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")
+
+# Benchmark the model using different input tensors on each iteration
+tokens = BATCH_SIZE * SEQ_LEN
+input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
+with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
+                   save_json="gptoss_results.json", input_shape=input_shape, input_seed_base=INPUT_SEED) as bench:
+    output, stats = bench(model)
+    print(f"\nOutput sum: {output[0].sum().item():.6f}")
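
As a usage illustration (a minimal sketch, not part of the commit): the GptOssMoEMLP defined above can also be exercised on CPU with freshly generated random weights instead of the saved artifacts, using the same shapes that cells/save_data.py produces.

import torch

E, H = 128, 1152                        # NUM_EXPERTS, HIDDEN_SIZE from cells/config.py
mlp = GptOssMoEMLP(
    torch.randn(E, H),                  # router weight
    torch.zeros(E),                     # router bias
    torch.randn(E, H, 2 * H) * 0.02,    # combined gate/up projection
    torch.zeros(E, 2 * H),
    torch.randn(E, H, H) * 0.02,        # down projection
    torch.zeros(E, H),
)
x = torch.randn(2, 16, H) * 0.1         # (batch, seq_len, hidden)
out, scores = mlp(x)
print(out.shape, scores.shape)          # torch.Size([2, 16, 1152]) torch.Size([32, 128])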
cells/megablocks_run.py
ADDED
@@ -0,0 +1,112 @@
+# /// script
+# dependencies = [
+#     "torch",
+#     "numpy",
+#     "kernels",
+# ]
+# ///
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from kernels import get_kernel, get_local_kernel
+from utils import to_dtype, tensor_stats, set_seed, bench_context
+from config import (
+    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+)
+from pathlib import Path
+from collections import namedtuple
+import os
+
+# Discover the upstream artifact directory from env
+data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+print(f"Loading weights from: {data_dir}")
+
+router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+print("Loaded shared weights from artifacts")
+print(f"Router weight sum: {router_weight.sum().item():.6f}")
+print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+print(f"Down sum: {down_proj.sum().item():.6f}")
+
+def build_megablocks_model(device: torch.device, dtype: torch.dtype):
+    # Download optimized kernels from the Hugging Face hub
+    megablocks = get_kernel("kernels-community/megablocks")
+
+    # megablocks = get_local_kernel(
+    #     Path("/home/ubuntu/Projects/megablocks-moe/build"), "megablocks")
+
+    model = megablocks.layers.MegaBlocksMoeMLP()
+
+    # Create attribute container for expert weights
+    model.experts = namedtuple(
+        "Experts", ["gate_up_proj", "gate_up_proj_bias", "down_proj", "down_proj_bias", "hidden_size"]
+    )
+
+    # Use loaded router weights for consistency
+    model.router = torch.nn.Linear(HIDDEN_SIZE, NUM_EXPERTS, device=device, dtype=dtype)
+    with torch.no_grad():
+        model.router.weight.copy_(router_weight.to(dtype))
+        model.router.bias.copy_(router_bias.to(dtype))
+
+    # Attach loaded expert weights to the experts container
+    e = model.experts
+    e.alpha = 1.702
+    e.capacity_factor = 4
+    e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device, dtype=dtype))
+    e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device, dtype=dtype))
+    e.down_proj = torch.nn.Parameter(down_proj.clone().to(device, dtype=dtype))
+    e.down_proj_bias = torch.nn.Parameter(down_proj_bias.clone().to(device, dtype=dtype))
+    e.hidden_size = HIDDEN_SIZE
+
+    # Log weight statistics for comparison
+    print(f"[MegaBlocks] Router weight sum: {model.router.weight.sum().item():.6f}")
+    print(f"[MegaBlocks] Gate/up projection shape: {tuple(e.gate_up_proj.shape)}, sum: {e.gate_up_proj.sum().item():.6f}")
+    print(f"[MegaBlocks] Down projection shape: {tuple(e.down_proj.shape)}, sum: {e.down_proj.sum().item():.6f}")
+
+    return model
+
+# Create a wrapper to match the interface of other implementations
+class MegaBlocksMoEWrapper(nn.Module):
+    def __init__(self, megablocks_model):
+        super().__init__()
+        self.model = megablocks_model
+
+    def forward(self, hidden_states):
+        # MegaBlocks expects input in the format (batch, seq_len, hidden_dim)
+        output, dummy_routing_weights = self.model(hidden_states)
+        # Return output and dummy routing weights for consistency with other implementations
+        # dummy_routing_weights = torch.zeros(
+        #     hidden_states.shape[0] * hidden_states.shape[1],
+        #     NUM_EXPERTS,
+        #     device=hidden_states.device,
+        #     dtype=hidden_states.dtype
+        # )
+        return output, dummy_routing_weights
+
+# Run the model
+set_seed(GENERAL_SEED)
+
+device = torch.device(DEVICE)
+dtype = to_dtype(DTYPE)
+
+print("\n=== MegaBlocks Implementation ===")
+# Build MegaBlocks model with loaded weights
+megablocks_model = build_megablocks_model(device, dtype)
+model = MegaBlocksMoEWrapper(megablocks_model).to(device=device, dtype=dtype)
+
+# Benchmark the model using different input tensors on each iteration
+tokens = BATCH_SIZE * SEQ_LEN
+input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
+with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
+                   save_json="megablocks_results.json", input_shape=input_shape, input_seed_base=INPUT_SEED) as bench:
+    output, stats = bench(model)
+    print(f"\nOutput sum: {output[0].sum().item():.6f}")
ADDED
@@ -0,0 +1,21 @@
|
+# /// script
+# dependencies = [
+#     "torch",
+# ]
+# ///
+
+"""Utility to dump NVIDIA GPU information."""
+import subprocess
+
+def nvidia_dump():
+    """Dump NVIDIA GPU information."""
+    try:
+        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, check=True)
+        print("NVIDIA GPU Information:")
+        print(result.stdout)
+    except FileNotFoundError:
+        print("nvidia-smi not found. Are you running on a machine with NVIDIA GPUs?")
+    except subprocess.CalledProcessError as e:
+        print(f"Error running nvidia-smi: {e}")
+
+nvidia_dump()
cells/save_data.py
ADDED
@@ -0,0 +1,67 @@
+# /// script
+# dependencies = [
+#     "torch",
+#     "numpy",
+# ]
+# ///
+
+"""Generate and save shared weights for consistent comparison."""
+import torch
+import numpy as np
+from pathlib import Path
+
+# Model configuration
+NUM_EXPERTS = 128
+HIDDEN_SIZE = 1152
+INTERMEDIATE_SIZE = 3072
+TOP_K = 4
+
+# Input configuration
+BATCH_SIZE = 1
+SEQ_LEN = 100
+DTYPE = "float32"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Seeds for reproducibility
+WEIGHT_SEED = 999
+EXPERT_SEED = 777
+INPUT_SEED = 123
+GENERAL_SEED = 42
+
+def set_seed(seed: int):
+    """Set seeds for reproducibility."""
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+# Generate shared weights for all implementations
+print("Generating shared weights...")
+
+# Router weights
+set_seed(WEIGHT_SEED)
+router_weight = torch.empty(NUM_EXPERTS, HIDDEN_SIZE)
+torch.nn.init.kaiming_uniform_(router_weight)
+router_bias = torch.zeros(NUM_EXPERTS)
+
+# Expert weights - using proper dimensions for gate/up combined projection
+set_seed(EXPERT_SEED)
+gate_up_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, 2 * HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+gate_up_proj_bias = torch.zeros(NUM_EXPERTS, 2 * HIDDEN_SIZE)
+down_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+down_proj_bias = torch.zeros(NUM_EXPERTS, HIDDEN_SIZE)
+
+# Save weights
+torch.save(router_weight, 'router_weight.pt')
+torch.save(router_bias, 'router_bias.pt')
+torch.save(gate_up_proj, 'gate_up_proj.pt')
+torch.save(gate_up_proj_bias, 'gate_up_proj_bias.pt')
+torch.save(down_proj, 'down_proj.pt')
+torch.save(down_proj_bias, 'down_proj_bias.pt')
+
+print(f"Saved weights:")
+print(f" Router: {tuple(router_weight.shape)}")
+print(f" Gate/Up proj: {tuple(gate_up_proj.shape)}")
+print(f" Down proj: {tuple(down_proj.shape)}")
+print(f" Hidden size: {HIDDEN_SIZE}")
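
One point worth illustrating (editorial aside, not part of the commit): the combined gate_up_proj saved here is consumed by cells/gptoss_run.py through interleaved columns of the last dimension (gate_up[..., ::2] and gate_up[..., 1::2]), not contiguous halves. A tiny sketch:

import torch

gate_up = torch.arange(8.0).reshape(1, 8)        # stand-in for one token's gate/up activations
gate, up = gate_up[..., ::2], gate_up[..., 1::2]
print(gate)  # tensor([[0., 2., 4., 6.]]) -> even columns feed the gating (sigmoid) path
print(up)    # tensor([[1., 3., 5., 7.]]) -> odd columns feed the "up" path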
cells/utils.py
ADDED
@@ -0,0 +1,143 @@
+# /// script
+# dependencies = [
+#     "torch",
+#     "numpy",
+# ]
+# ///
+
+"""Simple utilities for running the models."""
+import torch
+
+def to_dtype(dtype_str: str):
+    """Convert string to torch dtype."""
+    if dtype_str == "float16":
+        return torch.float16
+    if dtype_str == "bfloat16":
+        return torch.bfloat16
+    return torch.float32
+
+def tensor_stats(t: torch.Tensor) -> str:
+    """Generate stats string for a tensor."""
+    return (f"shape={tuple(t.shape)}, "
+            f"dtype={t.dtype}, "
+            f"device={t.device}, "
+            f"mean={t.mean().item():.6f}, "
+            f"std={t.std().item():.6f}")
+
+def set_seed(seed: int):
+    """Set seeds for reproducibility."""
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+"""Reusable benchmarking utilities for performance testing."""
+import time
+import numpy as np
+from contextlib import contextmanager
+from typing import Callable, Dict, Tuple, Any, Optional
+import torch
+import json
+
+def precise_timing(func: Callable[[], Any], warmup: int = 5, iters: int = 20,
+                   input_generator: Optional[Callable[[int], Any]] = None) -> Tuple[Any, float]:
+    """High precision timing function with warmup and optional input generation per iteration."""
+    # Warmup
+    for i in range(warmup):
+        if input_generator:
+            inputs = input_generator(i)
+            func(inputs)
+        else:
+            func()
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
+    start = time.perf_counter()
+    result = None
+    for i in range(iters):
+        if input_generator:
+            inputs = input_generator(i + warmup)  # Continue seed sequence after warmup
+            result = func(inputs)
+        else:
+            result = func()
+
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
+    end = time.perf_counter()
+    avg_time = (end - start) / iters
+    return result, avg_time
+
+def memory_usage() -> Dict[str, float]:
+    """Get current memory usage in GB."""
+    if not torch.cuda.is_available():
+        return {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}
+
+    return {
+        "allocated": torch.cuda.memory_allocated() / 1024**3,
+        "cached": torch.cuda.memory_reserved() / 1024**3,
+        "max_allocated": torch.cuda.max_memory_allocated() / 1024**3
+    }
+
+@contextmanager
+def bench_context(warmup: int = 10, iters: int = 50, device=None, dtype=None,
+                  tokens: int = None, save_json: Optional[str] = None,
+                  input_shape: Optional[Tuple] = None, input_seed_base: int = 42):
+    """Context manager for benchmarking with comprehensive metrics and optional input generation."""
+
+    def run_benchmark(model_func, *args, **kwargs):
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+        mem_before = memory_usage()
+
+        # Create input generator if input_shape is provided
+        input_generator = None
+        if input_shape is not None:
+            def create_input(iteration: int):
+                # Use deterministic but different seed for each iteration
+                iteration_seed = input_seed_base + iteration * 123  # Spread out seeds
+                torch.manual_seed(iteration_seed)
+                if torch.cuda.is_available():
+                    torch.cuda.manual_seed(iteration_seed)
+                return torch.randn(*input_shape, device=device, dtype=dtype) * 0.1
+            input_generator = create_input
+
+        if input_generator:
+            result, avg_time = precise_timing(lambda x: model_func(x), warmup, iters, input_generator)
+        else:
+            result, avg_time = precise_timing(lambda: model_func(*args, **kwargs), warmup, iters)
+
+        mem_after = memory_usage()
+
+        # Calculate metrics
+        metrics = {
+            "avg_time_ms": avg_time * 1000,
+            "throughput_tokens_per_sec": tokens / avg_time if tokens else None,
+            "memory_allocated_gb": mem_after["allocated"],
+            "memory_cached_gb": mem_after["cached"],
+            "memory_increase_gb": mem_after["allocated"] - mem_before["allocated"],
+            "device": str(device) if device else "cpu",
+            "dtype": str(dtype) if dtype else "float32",
+            "tokens": tokens,
+            "warmup_iters": warmup,
+            "timing_iters": iters
+        }
+
+        # Print results
+        print(f"Average time: {metrics['avg_time_ms']:.3f} ms")
+        if tokens:
+            print(f"Throughput: {metrics['throughput_tokens_per_sec']:.0f} tokens/sec")
+        print(f"Memory allocated: {metrics['memory_allocated_gb']:.3f} GB")
+        print(f"Memory increase: {metrics['memory_increase_gb']:.3f} GB")
+
+        # Save to JSON if requested
+        if save_json:
+            with open(save_json, 'w') as f:
+                json.dump(metrics, f, indent=2)
+
+        return result
+
+    yield run_benchmark
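
A minimal usage sketch for bench_context (not part of the commit), assuming the definitions above are importable as in the run scripts; it times a trivial CPU model and writes a JSON with the same keys as the results files in artifacts/:

import torch
from utils import bench_context

model = torch.nn.Linear(16, 16)
with bench_context(warmup=2, iters=5, device=torch.device("cpu"), dtype=torch.float32,
                   tokens=4 * 8, save_json="bench.json",
                   input_shape=(4, 8, 16), input_seed_base=123) as bench:
    out = bench(model)   # runs warmup + timed iterations, prints and saves metrics
print(out.shape)         # torch.Size([4, 8, 16])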
cells/visualization.py
ADDED
@@ -0,0 +1,168 @@
+# /// script
+# dependencies = [
+#     "matplotlib",
+# ]
+# ///
+
+import json
+import matplotlib.pyplot as plt
+import numpy as np
+from pathlib import Path
+import os
+
+# Get result directories from environment variables
+gptoss_dir = os.environ.get('UVNOTE_INPUT_GPTOSS_RUN', '.')
+megablocks_dir = os.environ.get('UVNOTE_INPUT_MEGABLOCKS_RUN', '.')
+
+print(f"Loading benchmark results from:")
+print(f" GPT-OSS dir: {gptoss_dir}")
+print(f" MegaBlocks dir: {megablocks_dir}")
+
+# Load benchmark results
+gptoss_file = Path(gptoss_dir) / 'gptoss_results.json'
+megablocks_file = Path(megablocks_dir) / 'megablocks_results.json'
+
+print(f"Loading results from:")
+print(f" GPT-OSS: {gptoss_file}")
+print(f" MegaBlocks: {megablocks_file}")
+
+if not gptoss_file.exists():
+    print(f"Warning: {gptoss_file} not found")
+if not megablocks_file.exists():
+    print(f"Warning: {megablocks_file} not found")
+
+with open(gptoss_file, 'r') as f:
+    gptoss_results = json.load(f)
+
+with open(megablocks_file, 'r') as f:
+    megablocks_results = json.load(f)
+
+print(f"GPT-OSS results keys: {list(gptoss_results.keys())}")
+print(f"MegaBlocks results keys: {list(megablocks_results.keys())}")
+
+# Helper function to extract metrics from either old or new JSON format
+def get_metric(results, metric_name, default=0):
+    """Extract metric from results, handling both old and new JSON formats."""
+    # New format (with stats dict)
+    if 'stats' in results:
+        return results['stats'].get(metric_name, default)
+    # Old format (direct keys)
+    elif metric_name in results:
+        return results[metric_name]
+    else:
+        return default
+
+# Create comparison plots
+fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
+
+# Performance comparison
+implementations = ['GPT-OSS', 'MegaBlocks']
+
+# Extract timing metrics (handle both avg_ms and avg_time_ms)
+gpt_time = get_metric(gptoss_results, 'avg_ms', get_metric(gptoss_results, 'avg_time_ms', 0))
+mega_time = get_metric(megablocks_results, 'avg_ms', get_metric(megablocks_results, 'avg_time_ms', 0))
+times = [gpt_time, mega_time]
+
+# Extract throughput metrics
+gpt_throughput = get_metric(gptoss_results, 'tokens_per_s', get_metric(gptoss_results, 'throughput_tokens_per_sec', 0))
+mega_throughput = get_metric(megablocks_results, 'tokens_per_s', get_metric(megablocks_results, 'throughput_tokens_per_sec', 0))
+throughputs = [gpt_throughput, mega_throughput]
+
+# Extract memory metrics
+gpt_memory = get_metric(gptoss_results, 'memory_allocated_gb', 0)
+mega_memory = get_metric(megablocks_results, 'memory_allocated_gb', 0)
+memory_usage = [gpt_memory, mega_memory]
+
+gpt_mem_inc = get_metric(gptoss_results, 'memory_increase_gb', 0)
+mega_mem_inc = get_metric(megablocks_results, 'memory_increase_gb', 0)
+memory_increase = [gpt_mem_inc, mega_mem_inc]
+
+print(f"Extracted metrics:")
+print(f" Times (ms): {times}")
+print(f" Throughputs: {throughputs}")
+print(f" Memory usage (GB): {memory_usage}")
+print(f" Memory increase (GB): {memory_increase}")
+
+colors = ['#2E8B57', '#4169E1']
+
+# Latency comparison
+bars1 = ax1.bar(implementations, times, color=colors)
+ax1.set_ylabel('Average Time (ms)')
+ax1.set_title('Latency Comparison')
+ax1.grid(True, alpha=0.3)
+
+# Add values on bars
+for bar, time in zip(bars1, times):
+    height = bar.get_height()
+    ax1.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+             f'{time:.2f}ms', ha='center', va='bottom')
+
+# Throughput comparison
+bars2 = ax2.bar(implementations, throughputs, color=colors)
+ax2.set_ylabel('Tokens per Second')
+ax2.set_title('Throughput Comparison')
+ax2.grid(True, alpha=0.3)
+
+# Add values on bars
+for bar, throughput in zip(bars2, throughputs):
+    height = bar.get_height()
+    ax2.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+             f'{throughput:.0f}', ha='center', va='bottom')
+
+# Memory usage comparison
+bars3 = ax3.bar(implementations, memory_usage, color=colors)
+ax3.set_ylabel('Memory Allocated (GB)')
+ax3.set_title('Memory Usage Comparison')
+ax3.grid(True, alpha=0.3)
+
+# Add values on bars
+for bar, mem in zip(bars3, memory_usage):
+    height = bar.get_height()
+    ax3.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+             f'{mem:.2f}GB', ha='center', va='bottom')
+
+# Memory increase comparison
+bars4 = ax4.bar(implementations, memory_increase, color=colors)
+ax4.set_ylabel('Memory Increase (GB)')
+ax4.set_title('Memory Increase Comparison')
+ax4.grid(True, alpha=0.3)
+
+# Add values on bars
+for bar, mem_inc in zip(bars4, memory_increase):
+    height = bar.get_height()
+    ax4.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+             f'{mem_inc:.3f}GB', ha='center', va='bottom')
+
+plt.tight_layout()
+plt.savefig('small_moe_comparison.png', dpi=150, bbox_inches='tight')
+plt.show()
+
+# Print summary table
+print("\n" + "="*60)
+print("PERFORMANCE COMPARISON SUMMARY")
+print("="*60)
+print(f"{'Metric':<25} {'GPT-OSS':<15} {'MegaBlocks':<15} {'Winner':<10}")
+print("-" * 60)
+
+# Determine winners
+latency_winner = "GPT-OSS" if times[0] < times[1] else "MegaBlocks"
+throughput_winner = "GPT-OSS" if throughputs[0] > throughputs[1] else "MegaBlocks"
+memory_winner = "GPT-OSS" if memory_usage[0] < memory_usage[1] else "MegaBlocks"
+mem_inc_winner = "GPT-OSS" if memory_increase[0] < memory_increase[1] else "MegaBlocks"
+
+print(f"{'Latency (ms)':<25} {times[0]:<15.2f} {times[1]:<15.2f} {latency_winner:<10}")
+print(f"{'Throughput (tok/s)':<25} {throughputs[0]:<15.0f} {throughputs[1]:<15.0f} {throughput_winner:<10}")
+print(f"{'Memory Usage (GB)':<25} {memory_usage[0]:<15.3f} {memory_usage[1]:<15.3f} {memory_winner:<10}")
+print(f"{'Memory Increase (GB)':<25} {memory_increase[0]:<15.3f} {memory_increase[1]:<15.3f} {mem_inc_winner:<10}")
+
+# Speed ratio
+speed_ratio = times[1] / times[0] if times[0] < times[1] else times[0] / times[1]
+faster_impl = latency_winner
+print(f"\n{faster_impl} is {speed_ratio:.2f}x faster")
+
+# Throughput ratio
+throughput_ratio = max(throughputs) / min(throughputs)
+higher_throughput = throughput_winner
+print(f"{higher_throughput} has {throughput_ratio:.2f}x higher throughput")
+
+print("="*60)
index.html
CHANGED
The diff for this file is too large to render.
small_compare.html
ADDED
The diff for this file is too large to render.