drbh HF Staff committed on
Commit bad4ddc · verified · 1 Parent(s): 2ccb26f

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ artifacts/visualization/small_moe_comparison.png filter=lfs diff=lfs merge=lfs -text
artifacts/gptoss_run/gptoss_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "avg_time_ms": 62.308485079556704,
+   "throughput_tokens_per_sec": 65737.43519474348,
+   "memory_allocated_gb": 1.329831600189209,
+   "memory_cached_gb": 1.8359375,
+   "memory_increase_gb": 0.3795137405395508,
+   "device": "cuda",
+   "dtype": "torch.bfloat16",
+   "tokens": 4096,
+   "warmup_iters": 10,
+   "timing_iters": 50
+ }
artifacts/megablocks_run/megablocks_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "avg_time_ms": 26.93254135781899,
+   "throughput_tokens_per_sec": 152083.67994618745,
+   "memory_allocated_gb": 2.2425241470336914,
+   "memory_cached_gb": 4.14453125,
+   "memory_increase_gb": 1.2922062873840332,
+   "device": "cuda",
+   "dtype": "torch.bfloat16",
+   "tokens": 4096,
+   "warmup_iters": 10,
+   "timing_iters": 50
+ }
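The two result files above can be compared directly. A minimal sketch (plain Python, assuming it is run from the repository root where the artifacts/ directories from this commit exist):

    import json
    from pathlib import Path

    # Load the benchmark results committed above
    gptoss = json.loads(Path("artifacts/gptoss_run/gptoss_results.json").read_text())
    megablocks = json.loads(Path("artifacts/megablocks_run/megablocks_results.json").read_text())

    # Headline ratios: roughly 2.31x faster / higher throughput for MegaBlocks with these numbers
    speedup = gptoss["avg_time_ms"] / megablocks["avg_time_ms"]
    gain = megablocks["throughput_tokens_per_sec"] / gptoss["throughput_tokens_per_sec"]
    print(f"MegaBlocks speedup: {speedup:.2f}x, throughput gain: {gain:.2f}x")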
artifacts/visualization/small_moe_comparison.png ADDED

Git LFS Details

  • SHA256: b6a3d5739c3a803dd8a6be8ec66918e915e8d7fa1d5b876dc77f30e871d6b0d0
  • Pointer size: 131 Bytes
  • Size of remote file: 122 kB
cells/__pycache__/config.cpython-312.pyc ADDED
Binary file (1.03 kB).
 
cells/__pycache__/utils.cpython-312.pyc ADDED
Binary file (7.31 kB).
 
cells/config.py ADDED
@@ -0,0 +1,35 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # "numpy",
+ # ]
+ # ///
+
+ """Configuration for MoE benchmarks."""
+ import torch
+
+ # Model configuration
+ NUM_EXPERTS = 128
+ HIDDEN_SIZE = 1152
+ TOP_K = 4
+
+ # Benchmark configuration
+ BATCH_SIZE = 8
+ SEQ_LEN = 512
+ DTYPE = "bfloat16"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Seeds for reproducibility
+ WEIGHT_SEED = 999
+ EXPERT_SEED = 777
+ INPUT_SEED = 123
+ GENERAL_SEED = 42
+
+ print(f"Configuration:")
+ print(f" Experts: {NUM_EXPERTS}")
+ print(f" Hidden size: {HIDDEN_SIZE}")
+ print(f" Top-k: {TOP_K}")
+ print(f" Batch size: {BATCH_SIZE}")
+ print(f" Sequence length: {SEQ_LEN}")
+ print(f" Device: {DEVICE}")
+ print(f" Dtype: {DTYPE}")
cells/gptoss_run.py ADDED
@@ -0,0 +1,149 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # "numpy",
+ # ]
+ # ///
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from utils import to_dtype, tensor_stats, set_seed, bench_context
+ from config import (
+     NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+     BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+     WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+ )
+ from pathlib import Path
+ import os
+
+ # Discover the upstream artifact directory from env
+ data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+ # list all the files in the directory
+ print(f"Loading weights from: {data_dir}")
+ print(f"Files in directory: {list(Path(data_dir).glob('*'))}")
+
+ router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+ router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+ gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+ gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+ down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+ down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+ print("Loaded shared weights from artifacts")
+ print(f"Router weight sum: {router_weight.sum().item():.6f}")
+ print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+ print(f"Down sum: {down_proj.sum().item():.6f}")
+
+ class GptOssRouter(nn.Module):
+     def __init__(self, router_weight, router_bias):
+         super().__init__()
+         self.top_k = TOP_K
+         self.num_experts = NUM_EXPERTS
+         self.hidden_dim = HIDDEN_SIZE
+         self.weight = nn.Parameter(router_weight.clone())
+         self.bias = nn.Parameter(router_bias.clone())
+
+     def forward(self, hidden_states):
+         hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+         router_logits = F.linear(hidden_states, self.weight, self.bias)
+         router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+         router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+         router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+         return router_scores, router_indices
+
+ class GptOssExperts(nn.Module):
+     def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+         super().__init__()
+         self.num_experts = NUM_EXPERTS
+         self.hidden_size = HIDDEN_SIZE
+         self.expert_dim = self.hidden_size
+         self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+         self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+         self.down_proj = nn.Parameter(down_proj.clone())
+         self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+         self.alpha = 1.702
+         self.limit = 7.0
+
+     def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
+         batch_size = hidden_states.shape[0]
+         hidden_states = hidden_states.reshape(-1, self.hidden_size)
+         num_experts = routing_weights.shape[1]
+
+         if hidden_states.device.type == "cpu" or self.training:
+             next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+             with torch.no_grad():
+                 expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
+                 expert_mask = expert_mask.permute(2, 1, 0)
+                 expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+
+             for expert_idx in expert_hit[:]:
+                 expert_idx = expert_idx[0]
+                 with torch.no_grad():
+                     _, token_idx = torch.where(expert_mask[expert_idx])
+                 current_state = hidden_states[token_idx]
+                 gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
+                 gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+                 gate = gate.clamp(min=None, max=self.limit)
+                 up = up.clamp(min=-self.limit, max=self.limit)
+                 glu = gate * torch.sigmoid(gate * self.alpha)
+                 gated_output = (up + 1) * glu
+                 out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
+                 weighted_output = out * routing_weights[token_idx, expert_idx, None]
+                 next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+             next_states = next_states.view(batch_size, -1, self.hidden_size)
+         else:
+             hidden_states = hidden_states.repeat(num_experts, 1)
+             hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
+             gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
+             gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+             gate = gate.clamp(min=None, max=self.limit)
+             up = up.clamp(min=-self.limit, max=self.limit)
+             glu = gate * torch.sigmoid(gate * self.alpha)
+             next_states = torch.bmm(((up + 1) * glu), self.down_proj)
+             next_states = next_states + self.down_proj_bias[..., None, :]
+             next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
+             next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
+             next_states = next_states.sum(dim=0)
+         return next_states
+
+ class GptOssMoEMLP(nn.Module):
+     def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+         super().__init__()
+         self.router = GptOssRouter(router_weight, router_bias)
+         self.experts = GptOssExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)
+
+     def forward(self, hidden_states):
+         router_scores, router_indices = self.router(hidden_states)
+         routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
+         return routed_out, router_scores
+
+ # Run the model
+ set_seed(GENERAL_SEED)
+
+ device = torch.device(DEVICE)
+ dtype = to_dtype(DTYPE)
+
+ print("\n=== GPT-OSS Implementation ===")
+ # Initialize model with loaded weights
+ model = GptOssMoEMLP(
+     router_weight.to(device, dtype=dtype),
+     router_bias.to(device, dtype=dtype),
+     gate_up_proj.to(device, dtype=dtype),
+     gate_up_proj_bias.to(device, dtype=dtype),
+     down_proj.to(device, dtype=dtype),
+     down_proj_bias.to(device, dtype=dtype)
+ ).to(device=device, dtype=dtype)
+
+ print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+ print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
+ print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")
+
+ # Benchmark the model using different input tensors on each iteration
+ tokens = BATCH_SIZE * SEQ_LEN
+ input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
+ with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
+                    save_json="gptoss_results.json", input_shape=input_shape, input_seed_base=INPUT_SEED) as bench:
+     output, stats = bench(model)
+     print(f"\nOutput sum: {output[0].sum().item():.6f}")
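The expert computation above applies a clamped, interleaved gate/up projection followed by a scaled-sigmoid gate (alpha = 1.702, limit = 7.0). An isolated, illustrative sketch of just that activation, using random data rather than the benchmark weights:

    import torch

    alpha, limit = 1.702, 7.0                         # same constants as GptOssExperts
    gate_up = torch.randn(4, 8)                       # stand-in for x @ gate_up_proj + bias
    gate, up = gate_up[..., ::2], gate_up[..., 1::2]  # de-interleave the gate and up halves
    gate = gate.clamp(max=limit)                      # gate clamped from above only
    up = up.clamp(min=-limit, max=limit)              # up clamped on both sides
    glu = gate * torch.sigmoid(gate * alpha)          # SiLU-style gating with alpha scaling
    out = (up + 1) * glu                              # value that feeds down_proj
    print(out.shape)                                  # torch.Size([4, 4])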
cells/megablocks_run.py ADDED
@@ -0,0 +1,112 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # "numpy",
+ # "kernels",
+ # ]
+ # ///
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from kernels import get_kernel, get_local_kernel
+ from utils import to_dtype, tensor_stats, set_seed, bench_context
+ from config import (
+     NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+     BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+     WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+ )
+ from pathlib import Path
+ from collections import namedtuple
+ import os
+
+ # Discover the upstream artifact directory from env
+ data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+ print(f"Loading weights from: {data_dir}")
+
+ router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+ router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+ gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+ gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+ down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+ down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+ print("Loaded shared weights from artifacts")
+ print(f"Router weight sum: {router_weight.sum().item():.6f}")
+ print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+ print(f"Down sum: {down_proj.sum().item():.6f}")
+
+ def build_megablocks_model(device: torch.device, dtype: torch.dtype):
+     # Download optimized kernels from the Hugging Face hub
+     megablocks = get_kernel("kernels-community/megablocks")
+
+     # megablocks = get_local_kernel(
+     #     Path("/home/ubuntu/Projects/megablocks-moe/build"), "megablocks")
+
+     model = megablocks.layers.MegaBlocksMoeMLP()
+
+     # Create attribute container for expert weights
+     model.experts = namedtuple(
+         "Experts", ["gate_up_proj", "gate_up_proj_bias", "down_proj", "down_proj_bias", "hidden_size"]
+     )
+
+     # Use loaded router weights for consistency
+     model.router = torch.nn.Linear(HIDDEN_SIZE, NUM_EXPERTS, device=device, dtype=dtype)
+     with torch.no_grad():
+         model.router.weight.copy_(router_weight.to(dtype))
+         model.router.bias.copy_(router_bias.to(dtype))
+
+     # Attach loaded expert weights to the experts container
+     e = model.experts
+     e.alpha = 1.702
+     e.capacity_factor = 4
+     e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device, dtype=dtype))
+     e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device, dtype=dtype))
+     e.down_proj = torch.nn.Parameter(down_proj.clone().to(device, dtype=dtype))
+     e.down_proj_bias = torch.nn.Parameter(down_proj_bias.clone().to(device, dtype=dtype))
+     e.hidden_size = HIDDEN_SIZE
+
+     # Log weight statistics for comparison
+     print(f"[MegaBlocks] Router weight sum: {model.router.weight.sum().item():.6f}")
+     print(f"[MegaBlocks] Gate/up projection shape: {tuple(e.gate_up_proj.shape)}, sum: {e.gate_up_proj.sum().item():.6f}")
+     print(f"[MegaBlocks] Down projection shape: {tuple(e.down_proj.shape)}, sum: {e.down_proj.sum().item():.6f}")
+
+     return model
+
+ # Create a wrapper to match the interface of other implementations
+ class MegaBlocksMoEWrapper(nn.Module):
+     def __init__(self, megablocks_model):
+         super().__init__()
+         self.model = megablocks_model
+
+     def forward(self, hidden_states):
+         # MegaBlocks expects input in the format (batch, seq_len, hidden_dim)
+         output, dummy_routing_weights = self.model(hidden_states)
+         # Return output and dummy routing weights for consistency with other implementations
+         # dummy_routing_weights = torch.zeros(
+         #     hidden_states.shape[0] * hidden_states.shape[1],
+         #     NUM_EXPERTS,
+         #     device=hidden_states.device,
+         #     dtype=hidden_states.dtype
+         # )
+         return output, dummy_routing_weights
+
+ # Run the model
+ set_seed(GENERAL_SEED)
+
+ device = torch.device(DEVICE)
+ dtype = to_dtype(DTYPE)
+
+ print("\n=== MegaBlocks Implementation ===")
+ # Build MegaBlocks model with loaded weights
+ megablocks_model = build_megablocks_model(device, dtype)
+ model = MegaBlocksMoEWrapper(megablocks_model).to(device=device, dtype=dtype)
+
+ # Benchmark the model using different input tensors on each iteration
+ tokens = BATCH_SIZE * SEQ_LEN
+ input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
+ with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
+                    save_json="megablocks_results.json", input_shape=input_shape, input_seed_base=INPUT_SEED) as bench:
+     output, stats = bench(model)
+     print(f"\nOutput sum: {output[0].sum().item():.6f}")
cells/nvidia_dump.py ADDED
@@ -0,0 +1,21 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # ]
+ # ///
+
+ """Utility to dump NVIDIA GPU information."""
+ import subprocess
+
+ def nvidia_dump():
+     """Dump NVIDIA GPU information."""
+     try:
+         result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, check=True)
+         print("NVIDIA GPU Information:")
+         print(result.stdout)
+     except FileNotFoundError:
+         print("nvidia-smi not found. Are you running on a machine with NVIDIA GPUs?")
+     except subprocess.CalledProcessError as e:
+         print(f"Error running nvidia-smi: {e}")
+
+ nvidia_dump()
cells/save_data.py ADDED
@@ -0,0 +1,67 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # "numpy",
+ # ]
+ # ///
+
+ """Generate and save shared weights for consistent comparison."""
+ import torch
+ import numpy as np
+ from pathlib import Path
+
+ # Model configuration
+ NUM_EXPERTS = 128
+ HIDDEN_SIZE = 1152
+ INTERMEDIATE_SIZE = 3072
+ TOP_K = 4
+
+ # Input configuration
+ BATCH_SIZE = 1
+ SEQ_LEN = 100
+ DTYPE = "float32"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Seeds for reproducibility
+ WEIGHT_SEED = 999
+ EXPERT_SEED = 777
+ INPUT_SEED = 123
+ GENERAL_SEED = 42
+
+ def set_seed(seed: int):
+     """Set seeds for reproducibility."""
+     torch.manual_seed(seed)
+     np.random.seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+         torch.cuda.manual_seed_all(seed)
+
+ # Generate shared weights for all implementations
+ print("Generating shared weights...")
+
+ # Router weights
+ set_seed(WEIGHT_SEED)
+ router_weight = torch.empty(NUM_EXPERTS, HIDDEN_SIZE)
+ torch.nn.init.kaiming_uniform_(router_weight)
+ router_bias = torch.zeros(NUM_EXPERTS)
+
+ # Expert weights - using proper dimensions for gate/up combined projection
+ set_seed(EXPERT_SEED)
+ gate_up_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, 2 * HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+ gate_up_proj_bias = torch.zeros(NUM_EXPERTS, 2 * HIDDEN_SIZE)
+ down_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+ down_proj_bias = torch.zeros(NUM_EXPERTS, HIDDEN_SIZE)
+
+ # Save weights
+ torch.save(router_weight, 'router_weight.pt')
+ torch.save(router_bias, 'router_bias.pt')
+ torch.save(gate_up_proj, 'gate_up_proj.pt')
+ torch.save(gate_up_proj_bias, 'gate_up_proj_bias.pt')
+ torch.save(down_proj, 'down_proj.pt')
+ torch.save(down_proj_bias, 'down_proj_bias.pt')
+
+ print(f"Saved weights:")
+ print(f" Router: {tuple(router_weight.shape)}")
+ print(f" Gate/Up proj: {tuple(gate_up_proj.shape)}")
+ print(f" Down proj: {tuple(down_proj.shape)}")
+ print(f" Hidden size: {HIDDEN_SIZE}")
cells/utils.py ADDED
@@ -0,0 +1,143 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # "numpy",
+ # ]
+ # ///
+
+ """Simple utilities for running the models."""
+ import torch
+
+ def to_dtype(dtype_str: str):
+     """Convert string to torch dtype."""
+     if dtype_str == "float16":
+         return torch.float16
+     if dtype_str == "bfloat16":
+         return torch.bfloat16
+     return torch.float32
+
+ def tensor_stats(t: torch.Tensor) -> str:
+     """Generate stats string for a tensor."""
+     return (f"shape={tuple(t.shape)}, "
+             f"dtype={t.dtype}, "
+             f"device={t.device}, "
+             f"mean={t.mean().item():.6f}, "
+             f"std={t.std().item():.6f}")
+
+ def set_seed(seed: int):
+     """Set seeds for reproducibility."""
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+         torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ """Reusable benchmarking utilities for performance testing."""
+ import time
+ import numpy as np
+ from contextlib import contextmanager
+ from typing import Callable, Dict, Tuple, Any, Optional
+ import torch
+ import json
+
+ def precise_timing(func: Callable[[], Any], warmup: int = 5, iters: int = 20,
+                    input_generator: Optional[Callable[[int], Any]] = None) -> Tuple[Any, float]:
+     """High precision timing function with warmup and optional input generation per iteration."""
+     # Warmup
+     for i in range(warmup):
+         if input_generator:
+             inputs = input_generator(i)
+             func(inputs)
+         else:
+             func()
+
+     if torch.cuda.is_available():
+         torch.cuda.synchronize()
+
+     start = time.perf_counter()
+     result = None
+     for i in range(iters):
+         if input_generator:
+             inputs = input_generator(i + warmup)  # Continue seed sequence after warmup
+             result = func(inputs)
+         else:
+             result = func()
+
+     if torch.cuda.is_available():
+         torch.cuda.synchronize()
+
+     end = time.perf_counter()
+     avg_time = (end - start) / iters
+     return result, avg_time
+
+ def memory_usage() -> Dict[str, float]:
+     """Get current memory usage in GB."""
+     if not torch.cuda.is_available():
+         return {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}
+
+     return {
+         "allocated": torch.cuda.memory_allocated() / 1024**3,
+         "cached": torch.cuda.memory_reserved() / 1024**3,
+         "max_allocated": torch.cuda.max_memory_allocated() / 1024**3
+     }
+
+ @contextmanager
+ def bench_context(warmup: int = 10, iters: int = 50, device=None, dtype=None,
+                   tokens: int = None, save_json: Optional[str] = None,
+                   input_shape: Optional[Tuple] = None, input_seed_base: int = 42):
+     """Context manager for benchmarking with comprehensive metrics and optional input generation."""
+
+     def run_benchmark(model_func, *args, **kwargs):
+         torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+         mem_before = memory_usage()
+
+         # Create input generator if input_shape is provided
+         input_generator = None
+         if input_shape is not None:
+             def create_input(iteration: int):
+                 # Use deterministic but different seed for each iteration
+                 iteration_seed = input_seed_base + iteration * 123  # Spread out seeds
+                 torch.manual_seed(iteration_seed)
+                 if torch.cuda.is_available():
+                     torch.cuda.manual_seed(iteration_seed)
+                 return torch.randn(*input_shape, device=device, dtype=dtype) * 0.1
+             input_generator = create_input
+
+         if input_generator:
+             result, avg_time = precise_timing(lambda x: model_func(x), warmup, iters, input_generator)
+         else:
+             result, avg_time = precise_timing(lambda: model_func(*args, **kwargs), warmup, iters)
+
+         mem_after = memory_usage()
+
+         # Calculate metrics
+         metrics = {
+             "avg_time_ms": avg_time * 1000,
+             "throughput_tokens_per_sec": tokens / avg_time if tokens else None,
+             "memory_allocated_gb": mem_after["allocated"],
+             "memory_cached_gb": mem_after["cached"],
+             "memory_increase_gb": mem_after["allocated"] - mem_before["allocated"],
+             "device": str(device) if device else "cpu",
+             "dtype": str(dtype) if dtype else "float32",
+             "tokens": tokens,
+             "warmup_iters": warmup,
+             "timing_iters": iters
+         }
+
+         # Print results
+         print(f"Average time: {metrics['avg_time_ms']:.3f} ms")
+         if tokens:
+             print(f"Throughput: {metrics['throughput_tokens_per_sec']:.0f} tokens/sec")
+         print(f"Memory allocated: {metrics['memory_allocated_gb']:.3f} GB")
+         print(f"Memory increase: {metrics['memory_increase_gb']:.3f} GB")
+
+         # Save to JSON if requested
+         if save_json:
+             with open(save_json, 'w') as f:
+                 json.dump(metrics, f, indent=2)
+
+         return result
+
+     yield run_benchmark
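For reference, bench_context can also be exercised on its own with a stand-in module; a hypothetical minimal usage (the dimensions mirror config.py, and the Linear layer is only a placeholder, not one of the benchmarked implementations):

    import torch
    from utils import to_dtype, set_seed, bench_context

    set_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = to_dtype("bfloat16" if torch.cuda.is_available() else "float32")

    model = torch.nn.Linear(1152, 1152).to(device=device, dtype=dtype)  # placeholder model
    with bench_context(warmup=2, iters=5, device=device, dtype=dtype,
                       tokens=8 * 512, input_shape=(8, 512, 1152)) as bench:
        output = bench(model)  # a fresh seeded input tensor is generated for every iteration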
cells/visualization.py ADDED
@@ -0,0 +1,168 @@
+ # /// script
+ # dependencies = [
+ # "matplotlib",
+ # ]
+ # ///
+
+ import json
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from pathlib import Path
+ import os
+
+ # Get result directories from environment variables
+ gptoss_dir = os.environ.get('UVNOTE_INPUT_GPTOSS_RUN', '.')
+ megablocks_dir = os.environ.get('UVNOTE_INPUT_MEGABLOCKS_RUN', '.')
+
+ print(f"Loading benchmark results from:")
+ print(f" GPT-OSS dir: {gptoss_dir}")
+ print(f" MegaBlocks dir: {megablocks_dir}")
+
+ # Load benchmark results
+ gptoss_file = Path(gptoss_dir) / 'gptoss_results.json'
+ megablocks_file = Path(megablocks_dir) / 'megablocks_results.json'
+
+ print(f"Loading results from:")
+ print(f" GPT-OSS: {gptoss_file}")
+ print(f" MegaBlocks: {megablocks_file}")
+
+ if not gptoss_file.exists():
+     print(f"Warning: {gptoss_file} not found")
+ if not megablocks_file.exists():
+     print(f"Warning: {megablocks_file} not found")
+
+ with open(gptoss_file, 'r') as f:
+     gptoss_results = json.load(f)
+
+ with open(megablocks_file, 'r') as f:
+     megablocks_results = json.load(f)
+
+ print(f"GPT-OSS results keys: {list(gptoss_results.keys())}")
+ print(f"MegaBlocks results keys: {list(megablocks_results.keys())}")
+
+ # Helper function to extract metrics from either old or new JSON format
+ def get_metric(results, metric_name, default=0):
+     """Extract metric from results, handling both old and new JSON formats."""
+     # New format (with stats dict)
+     if 'stats' in results:
+         return results['stats'].get(metric_name, default)
+     # Old format (direct keys)
+     elif metric_name in results:
+         return results[metric_name]
+     else:
+         return default
+
+ # Create comparison plots
+ fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
+
+ # Performance comparison
+ implementations = ['GPT-OSS', 'MegaBlocks']
+
+ # Extract timing metrics (handle both avg_ms and avg_time_ms)
+ gpt_time = get_metric(gptoss_results, 'avg_ms', get_metric(gptoss_results, 'avg_time_ms', 0))
+ mega_time = get_metric(megablocks_results, 'avg_ms', get_metric(megablocks_results, 'avg_time_ms', 0))
+ times = [gpt_time, mega_time]
+
+ # Extract throughput metrics
+ gpt_throughput = get_metric(gptoss_results, 'tokens_per_s', get_metric(gptoss_results, 'throughput_tokens_per_sec', 0))
+ mega_throughput = get_metric(megablocks_results, 'tokens_per_s', get_metric(megablocks_results, 'throughput_tokens_per_sec', 0))
+ throughputs = [gpt_throughput, mega_throughput]
+
+ # Extract memory metrics
+ gpt_memory = get_metric(gptoss_results, 'memory_allocated_gb', 0)
+ mega_memory = get_metric(megablocks_results, 'memory_allocated_gb', 0)
+ memory_usage = [gpt_memory, mega_memory]
+
+ gpt_mem_inc = get_metric(gptoss_results, 'memory_increase_gb', 0)
+ mega_mem_inc = get_metric(megablocks_results, 'memory_increase_gb', 0)
+ memory_increase = [gpt_mem_inc, mega_mem_inc]
+
+ print(f"Extracted metrics:")
+ print(f" Times (ms): {times}")
+ print(f" Throughputs: {throughputs}")
+ print(f" Memory usage (GB): {memory_usage}")
+ print(f" Memory increase (GB): {memory_increase}")
+
+ colors = ['#2E8B57', '#4169E1']
+
+ # Latency comparison
+ bars1 = ax1.bar(implementations, times, color=colors)
+ ax1.set_ylabel('Average Time (ms)')
+ ax1.set_title('Latency Comparison')
+ ax1.grid(True, alpha=0.3)
+
+ # Add values on bars
+ for bar, time in zip(bars1, times):
+     height = bar.get_height()
+     ax1.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+              f'{time:.2f}ms', ha='center', va='bottom')
+
+ # Throughput comparison
+ bars2 = ax2.bar(implementations, throughputs, color=colors)
+ ax2.set_ylabel('Tokens per Second')
+ ax2.set_title('Throughput Comparison')
+ ax2.grid(True, alpha=0.3)
+
+ # Add values on bars
+ for bar, throughput in zip(bars2, throughputs):
+     height = bar.get_height()
+     ax2.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+              f'{throughput:.0f}', ha='center', va='bottom')
+
+ # Memory usage comparison
+ bars3 = ax3.bar(implementations, memory_usage, color=colors)
+ ax3.set_ylabel('Memory Allocated (GB)')
+ ax3.set_title('Memory Usage Comparison')
+ ax3.grid(True, alpha=0.3)
+
+ # Add values on bars
+ for bar, mem in zip(bars3, memory_usage):
+     height = bar.get_height()
+     ax3.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+              f'{mem:.2f}GB', ha='center', va='bottom')
+
+ # Memory increase comparison
+ bars4 = ax4.bar(implementations, memory_increase, color=colors)
+ ax4.set_ylabel('Memory Increase (GB)')
+ ax4.set_title('Memory Increase Comparison')
+ ax4.grid(True, alpha=0.3)
+
+ # Add values on bars
+ for bar, mem_inc in zip(bars4, memory_increase):
+     height = bar.get_height()
+     ax4.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+              f'{mem_inc:.3f}GB', ha='center', va='bottom')
+
+ plt.tight_layout()
+ plt.savefig('small_moe_comparison.png', dpi=150, bbox_inches='tight')
+ plt.show()
+
+ # Print summary table
+ print("\n" + "="*60)
+ print("PERFORMANCE COMPARISON SUMMARY")
+ print("="*60)
+ print(f"{'Metric':<25} {'GPT-OSS':<15} {'MegaBlocks':<15} {'Winner':<10}")
+ print("-" * 60)
+
+ # Determine winners
+ latency_winner = "GPT-OSS" if times[0] < times[1] else "MegaBlocks"
+ throughput_winner = "GPT-OSS" if throughputs[0] > throughputs[1] else "MegaBlocks"
+ memory_winner = "GPT-OSS" if memory_usage[0] < memory_usage[1] else "MegaBlocks"
+ mem_inc_winner = "GPT-OSS" if memory_increase[0] < memory_increase[1] else "MegaBlocks"
+
+ print(f"{'Latency (ms)':<25} {times[0]:<15.2f} {times[1]:<15.2f} {latency_winner:<10}")
+ print(f"{'Throughput (tok/s)':<25} {throughputs[0]:<15.0f} {throughputs[1]:<15.0f} {throughput_winner:<10}")
+ print(f"{'Memory Usage (GB)':<25} {memory_usage[0]:<15.3f} {memory_usage[1]:<15.3f} {memory_winner:<10}")
+ print(f"{'Memory Increase (GB)':<25} {memory_increase[0]:<15.3f} {memory_increase[1]:<15.3f} {mem_inc_winner:<10}")
+
+ # Speed ratio
+ speed_ratio = times[1] / times[0] if times[0] < times[1] else times[0] / times[1]
+ faster_impl = latency_winner
+ print(f"\n{faster_impl} is {speed_ratio:.2f}x faster")
+
+ # Throughput ratio
+ throughput_ratio = max(throughputs) / min(throughputs)
+ higher_throughput = throughput_winner
+ print(f"{higher_throughput} has {throughput_ratio:.2f}x higher throughput")
+
+ print("="*60)
index.html CHANGED
The diff for this file is too large to render. See raw diff
 
small_compare.html ADDED
The diff for this file is too large to render. See raw diff