drbh HF Staff committed on
Commit bad4ddc · verified · 1 Parent(s): 2ccb26f

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ artifacts/visualization/small_moe_comparison.png filter=lfs diff=lfs merge=lfs -text
artifacts/gptoss_run/gptoss_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "avg_time_ms": 62.308485079556704,
+   "throughput_tokens_per_sec": 65737.43519474348,
+   "memory_allocated_gb": 1.329831600189209,
+   "memory_cached_gb": 1.8359375,
+   "memory_increase_gb": 0.3795137405395508,
+   "device": "cuda",
+   "dtype": "torch.bfloat16",
+   "tokens": 4096,
+   "warmup_iters": 10,
+   "timing_iters": 50
+ }
artifacts/megablocks_run/megablocks_results.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "avg_time_ms": 26.93254135781899,
+   "throughput_tokens_per_sec": 152083.67994618745,
+   "memory_allocated_gb": 2.2425241470336914,
+   "memory_cached_gb": 4.14453125,
+   "memory_increase_gb": 1.2922062873840332,
+   "device": "cuda",
+   "dtype": "torch.bfloat16",
+   "tokens": 4096,
+   "warmup_iters": 10,
+   "timing_iters": 50
+ }
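The two result files above can be compared directly. A minimal sketch (plain Python, assuming it is run from the repository root where the artifacts/ directories from this commit exist):

    import json
    from pathlib import Path

    # Load the benchmark results committed above
    gptoss = json.loads(Path("artifacts/gptoss_run/gptoss_results.json").read_text())
    megablocks = json.loads(Path("artifacts/megablocks_run/megablocks_results.json").read_text())

    # Headline ratios: roughly 2.31x faster / higher throughput for MegaBlocks with these numbers
    speedup = gptoss["avg_time_ms"] / megablocks["avg_time_ms"]
    gain = megablocks["throughput_tokens_per_sec"] / gptoss["throughput_tokens_per_sec"]
    print(f"MegaBlocks speedup: {speedup:.2f}x, throughput gain: {gain:.2f}x")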
artifacts/visualization/small_moe_comparison.png ADDED

Git LFS Details

  • SHA256: b6a3d5739c3a803dd8a6be8ec66918e915e8d7fa1d5b876dc77f30e871d6b0d0
  • Pointer size: 131 Bytes
  • Size of remote file: 122 kB
cells/__pycache__/config.cpython-312.pyc ADDED
Binary file (1.03 kB).
 
cells/__pycache__/utils.cpython-312.pyc ADDED
Binary file (7.31 kB).
 
cells/config.py ADDED
@@ -0,0 +1,35 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # "numpy",
+ # ]
+ # ///
+
+ """Configuration for MoE benchmarks."""
+ import torch
+
+ # Model configuration
+ NUM_EXPERTS = 128
+ HIDDEN_SIZE = 1152
+ TOP_K = 4
+
+ # Benchmark configuration
+ BATCH_SIZE = 8
+ SEQ_LEN = 512
+ DTYPE = "bfloat16"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Seeds for reproducibility
+ WEIGHT_SEED = 999
+ EXPERT_SEED = 777
+ INPUT_SEED = 123
+ GENERAL_SEED = 42
+
+ print(f"Configuration:")
+ print(f" Experts: {NUM_EXPERTS}")
+ print(f" Hidden size: {HIDDEN_SIZE}")
+ print(f" Top-k: {TOP_K}")
+ print(f" Batch size: {BATCH_SIZE}")
+ print(f" Sequence length: {SEQ_LEN}")
+ print(f" Device: {DEVICE}")
+ print(f" Dtype: {DTYPE}")
cells/gptoss_run.py ADDED
@@ -0,0 +1,149 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # "numpy",
+ # ]
+ # ///
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from utils import to_dtype, tensor_stats, set_seed, bench_context
+ from config import (
+     NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+     BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+     WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+ )
+ from pathlib import Path
+ import os
+
+ # Discover the upstream artifact directory from env
+ data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+ # list all the files in the directory
+ print(f"Loading weights from: {data_dir}")
+ print(f"Files in directory: {list(Path(data_dir).glob('*'))}")
+
+ router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+ router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+ gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+ gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+ down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+ down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+ print("Loaded shared weights from artifacts")
+ print(f"Router weight sum: {router_weight.sum().item():.6f}")
+ print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+ print(f"Down sum: {down_proj.sum().item():.6f}")
+
+ class GptOssRouter(nn.Module):
+     def __init__(self, router_weight, router_bias):
+         super().__init__()
+         self.top_k = TOP_K
+         self.num_experts = NUM_EXPERTS
+         self.hidden_dim = HIDDEN_SIZE
+         self.weight = nn.Parameter(router_weight.clone())
+         self.bias = nn.Parameter(router_bias.clone())
+
+     def forward(self, hidden_states):
+         hidden_states = hidden_states.reshape(-1, self.hidden_dim)
+         router_logits = F.linear(hidden_states, self.weight, self.bias)
+         router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
+         router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
+         router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
+         return router_scores, router_indices
+
+ class GptOssExperts(nn.Module):
+     def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+         super().__init__()
+         self.num_experts = NUM_EXPERTS
+         self.hidden_size = HIDDEN_SIZE
+         self.expert_dim = self.hidden_size
+         self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
+         self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
+         self.down_proj = nn.Parameter(down_proj.clone())
+         self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
+         self.alpha = 1.702
+         self.limit = 7.0
+
+     def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
+         batch_size = hidden_states.shape[0]
+         hidden_states = hidden_states.reshape(-1, self.hidden_size)
+         num_experts = routing_weights.shape[1]
+
+         if hidden_states.device.type == "cpu" or self.training:
+             next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
+             with torch.no_grad():
+                 expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
+                 expert_mask = expert_mask.permute(2, 1, 0)
+                 expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+
+             for expert_idx in expert_hit[:]:
+                 expert_idx = expert_idx[0]
+                 with torch.no_grad():
+                     _, token_idx = torch.where(expert_mask[expert_idx])
+                 current_state = hidden_states[token_idx]
+                 gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
+                 gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+                 gate = gate.clamp(min=None, max=self.limit)
+                 up = up.clamp(min=-self.limit, max=self.limit)
+                 glu = gate * torch.sigmoid(gate * self.alpha)
+                 gated_output = (up + 1) * glu
+                 out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
+                 weighted_output = out * routing_weights[token_idx, expert_idx, None]
+                 next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
+             next_states = next_states.view(batch_size, -1, self.hidden_size)
+         else:
+             hidden_states = hidden_states.repeat(num_experts, 1)
+             hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
+             gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
+             gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+             gate = gate.clamp(min=None, max=self.limit)
+             up = up.clamp(min=-self.limit, max=self.limit)
+             glu = gate * torch.sigmoid(gate * self.alpha)
+             next_states = torch.bmm(((up + 1) * glu), self.down_proj)
+             next_states = next_states + self.down_proj_bias[..., None, :]
+             next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
+             next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
+             next_states = next_states.sum(dim=0)
+         return next_states
+
+ class GptOssMoEMLP(nn.Module):
+     def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
+         super().__init__()
+         self.router = GptOssRouter(router_weight, router_bias)
+         self.experts = GptOssExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)
+
+     def forward(self, hidden_states):
+         router_scores, router_indices = self.router(hidden_states)
+         routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
+         return routed_out, router_scores
+
+ # Run the model
+ set_seed(GENERAL_SEED)
+
+ device = torch.device(DEVICE)
+ dtype = to_dtype(DTYPE)
+
+ print("\n=== GPT-OSS Implementation ===")
+ # Initialize model with loaded weights
+ model = GptOssMoEMLP(
+     router_weight.to(device, dtype=dtype),
+     router_bias.to(device, dtype=dtype),
+     gate_up_proj.to(device, dtype=dtype),
+     gate_up_proj_bias.to(device, dtype=dtype),
+     down_proj.to(device, dtype=dtype),
+     down_proj_bias.to(device, dtype=dtype)
+ ).to(device=device, dtype=dtype)
+
+ print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
+ print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
+ print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")
+
+ # Benchmark the model using different input tensors on each iteration
+ tokens = BATCH_SIZE * SEQ_LEN
+ input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
+ with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
+                    save_json="gptoss_results.json", input_shape=input_shape, input_seed_base=INPUT_SEED) as bench:
+     output, stats = bench(model)
+     print(f"\nOutput sum: {output[0].sum().item():.6f}")
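The expert computation above applies a clamped, interleaved gate/up projection followed by a scaled-sigmoid gate (alpha = 1.702, limit = 7.0). An isolated, illustrative sketch of just that activation, using random data rather than the benchmark weights:

    import torch

    alpha, limit = 1.702, 7.0                         # same constants as GptOssExperts
    gate_up = torch.randn(4, 8)                       # stand-in for x @ gate_up_proj + bias
    gate, up = gate_up[..., ::2], gate_up[..., 1::2]  # de-interleave the gate and up halves
    gate = gate.clamp(max=limit)                      # gate clamped from above only
    up = up.clamp(min=-limit, max=limit)              # up clamped on both sides
    glu = gate * torch.sigmoid(gate * alpha)          # SiLU-style gating with alpha scaling
    out = (up + 1) * glu                              # value that feeds down_proj
    print(out.shape)                                  # torch.Size([4, 4])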
cells/megablocks_run.py ADDED
@@ -0,0 +1,112 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # "numpy",
+ # "kernels",
+ # ]
+ # ///
+
+ import torch
+ from torch import nn
+ from torch.nn import functional as F
+ from kernels import get_kernel, get_local_kernel
+ from utils import to_dtype, tensor_stats, set_seed, bench_context
+ from config import (
+     NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
+     BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
+     WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED
+ )
+ from pathlib import Path
+ from collections import namedtuple
+ import os
+
+ # Discover the upstream artifact directory from env
+ data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
+
+ print(f"Loading weights from: {data_dir}")
+
+ router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
+ router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
+ gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
+ gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
+ down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
+ down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')
+
+ print("Loaded shared weights from artifacts")
+ print(f"Router weight sum: {router_weight.sum().item():.6f}")
+ print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
+ print(f"Down sum: {down_proj.sum().item():.6f}")
+
+ def build_megablocks_model(device: torch.device, dtype: torch.dtype):
+     # Download optimized kernels from the Hugging Face hub
+     megablocks = get_kernel("kernels-community/megablocks")
+
+     # megablocks = get_local_kernel(
+     #     Path("/home/ubuntu/Projects/megablocks-moe/build"), "megablocks")
+
+     model = megablocks.layers.MegaBlocksMoeMLP()
+
+     # Create attribute container for expert weights
+     model.experts = namedtuple(
+         "Experts", ["gate_up_proj", "gate_up_proj_bias", "down_proj", "down_proj_bias", "hidden_size"]
+     )
+
+     # Use loaded router weights for consistency
+     model.router = torch.nn.Linear(HIDDEN_SIZE, NUM_EXPERTS, device=device, dtype=dtype)
+     with torch.no_grad():
+         model.router.weight.copy_(router_weight.to(dtype))
+         model.router.bias.copy_(router_bias.to(dtype))
+
+     # Attach loaded expert weights to the experts container
+     e = model.experts
+     e.alpha = 1.702
+     e.capacity_factor = 4
+     e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device, dtype=dtype))
+     e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device, dtype=dtype))
+     e.down_proj = torch.nn.Parameter(down_proj.clone().to(device, dtype=dtype))
+     e.down_proj_bias = torch.nn.Parameter(down_proj_bias.clone().to(device, dtype=dtype))
+     e.hidden_size = HIDDEN_SIZE
+
+     # Log weight statistics for comparison
+     print(f"[MegaBlocks] Router weight sum: {model.router.weight.sum().item():.6f}")
+     print(f"[MegaBlocks] Gate/up projection shape: {tuple(e.gate_up_proj.shape)}, sum: {e.gate_up_proj.sum().item():.6f}")
+     print(f"[MegaBlocks] Down projection shape: {tuple(e.down_proj.shape)}, sum: {e.down_proj.sum().item():.6f}")
+
+     return model
+
+ # Create a wrapper to match the interface of other implementations
+ class MegaBlocksMoEWrapper(nn.Module):
+     def __init__(self, megablocks_model):
+         super().__init__()
+         self.model = megablocks_model
+
+     def forward(self, hidden_states):
+         # MegaBlocks expects input in the format (batch, seq_len, hidden_dim)
+         output, dummy_routing_weights = self.model(hidden_states)
+         # Return output and dummy routing weights for consistency with other implementations
+         # dummy_routing_weights = torch.zeros(
+         #     hidden_states.shape[0] * hidden_states.shape[1],
+         #     NUM_EXPERTS,
+         #     device=hidden_states.device,
+         #     dtype=hidden_states.dtype
+         # )
+         return output, dummy_routing_weights
+
+ # Run the model
+ set_seed(GENERAL_SEED)
+
+ device = torch.device(DEVICE)
+ dtype = to_dtype(DTYPE)
+
+ print("\n=== MegaBlocks Implementation ===")
+ # Build MegaBlocks model with loaded weights
+ megablocks_model = build_megablocks_model(device, dtype)
+ model = MegaBlocksMoEWrapper(megablocks_model).to(device=device, dtype=dtype)
+
+ # Benchmark the model using different input tensors on each iteration
+ tokens = BATCH_SIZE * SEQ_LEN
+ input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)
+ with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
+                    save_json="megablocks_results.json", input_shape=input_shape, input_seed_base=INPUT_SEED) as bench:
+     output, stats = bench(model)
+     print(f"\nOutput sum: {output[0].sum().item():.6f}")
cells/nvidia_dump.py ADDED
@@ -0,0 +1,21 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # ]
+ # ///
+
+ """Utility to dump NVIDIA GPU information."""
+ import subprocess
+
+ def nvidia_dump():
+     """Dump NVIDIA GPU information."""
+     try:
+         result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, check=True)
+         print("NVIDIA GPU Information:")
+         print(result.stdout)
+     except FileNotFoundError:
+         print("nvidia-smi not found. Are you running on a machine with NVIDIA GPUs?")
+     except subprocess.CalledProcessError as e:
+         print(f"Error running nvidia-smi: {e}")
+
+ nvidia_dump()
cells/save_data.py ADDED
@@ -0,0 +1,67 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # "numpy",
+ # ]
+ # ///
+
+ """Generate and save shared weights for consistent comparison."""
+ import torch
+ import numpy as np
+ from pathlib import Path
+
+ # Model configuration
+ NUM_EXPERTS = 128
+ HIDDEN_SIZE = 1152
+ INTERMEDIATE_SIZE = 3072
+ TOP_K = 4
+
+ # Input configuration
+ BATCH_SIZE = 1
+ SEQ_LEN = 100
+ DTYPE = "float32"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Seeds for reproducibility
+ WEIGHT_SEED = 999
+ EXPERT_SEED = 777
+ INPUT_SEED = 123
+ GENERAL_SEED = 42
+
+ def set_seed(seed: int):
+     """Set seeds for reproducibility."""
+     torch.manual_seed(seed)
+     np.random.seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+         torch.cuda.manual_seed_all(seed)
+
+ # Generate shared weights for all implementations
+ print("Generating shared weights...")
+
+ # Router weights
+ set_seed(WEIGHT_SEED)
+ router_weight = torch.empty(NUM_EXPERTS, HIDDEN_SIZE)
+ torch.nn.init.kaiming_uniform_(router_weight)
+ router_bias = torch.zeros(NUM_EXPERTS)
+
+ # Expert weights - using proper dimensions for gate/up combined projection
+ set_seed(EXPERT_SEED)
+ gate_up_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, 2 * HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+ gate_up_proj_bias = torch.zeros(NUM_EXPERTS, 2 * HIDDEN_SIZE)
+ down_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
+ down_proj_bias = torch.zeros(NUM_EXPERTS, HIDDEN_SIZE)
+
+ # Save weights
+ torch.save(router_weight, 'router_weight.pt')
+ torch.save(router_bias, 'router_bias.pt')
+ torch.save(gate_up_proj, 'gate_up_proj.pt')
+ torch.save(gate_up_proj_bias, 'gate_up_proj_bias.pt')
+ torch.save(down_proj, 'down_proj.pt')
+ torch.save(down_proj_bias, 'down_proj_bias.pt')
+
+ print(f"Saved weights:")
+ print(f" Router: {tuple(router_weight.shape)}")
+ print(f" Gate/Up proj: {tuple(gate_up_proj.shape)}")
+ print(f" Down proj: {tuple(down_proj.shape)}")
+ print(f" Hidden size: {HIDDEN_SIZE}")
cells/utils.py ADDED
@@ -0,0 +1,143 @@
+ # /// script
+ # dependencies = [
+ # "torch",
+ # "numpy",
+ # ]
+ # ///
+
+ """Simple utilities for running the models."""
+ import torch
+
+ def to_dtype(dtype_str: str):
+     """Convert string to torch dtype."""
+     if dtype_str == "float16":
+         return torch.float16
+     if dtype_str == "bfloat16":
+         return torch.bfloat16
+     return torch.float32
+
+ def tensor_stats(t: torch.Tensor) -> str:
+     """Generate stats string for a tensor."""
+     return (f"shape={tuple(t.shape)}, "
+             f"dtype={t.dtype}, "
+             f"device={t.device}, "
+             f"mean={t.mean().item():.6f}, "
+             f"std={t.std().item():.6f}")
+
+ def set_seed(seed: int):
+     """Set seeds for reproducibility."""
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+         torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ """Reusable benchmarking utilities for performance testing."""
+ import time
+ import numpy as np
+ from contextlib import contextmanager
+ from typing import Callable, Dict, Tuple, Any, Optional
+ import torch
+ import json
+
+ def precise_timing(func: Callable[[], Any], warmup: int = 5, iters: int = 20,
+                    input_generator: Optional[Callable[[int], Any]] = None) -> Tuple[Any, float]:
+     """High precision timing function with warmup and optional input generation per iteration."""
+     # Warmup
+     for i in range(warmup):
+         if input_generator:
+             inputs = input_generator(i)
+             func(inputs)
+         else:
+             func()
+
+     if torch.cuda.is_available():
+         torch.cuda.synchronize()
+
+     start = time.perf_counter()
+     result = None
+     for i in range(iters):
+         if input_generator:
+             inputs = input_generator(i + warmup)  # Continue seed sequence after warmup
+             result = func(inputs)
+         else:
+             result = func()
+
+     if torch.cuda.is_available():
+         torch.cuda.synchronize()
+
+     end = time.perf_counter()
+     avg_time = (end - start) / iters
+     return result, avg_time
+
+ def memory_usage() -> Dict[str, float]:
+     """Get current memory usage in GB."""
+     if not torch.cuda.is_available():
+         return {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}
+
+     return {
+         "allocated": torch.cuda.memory_allocated() / 1024**3,
+         "cached": torch.cuda.memory_reserved() / 1024**3,
+         "max_allocated": torch.cuda.max_memory_allocated() / 1024**3
+     }
+
+ @contextmanager
+ def bench_context(warmup: int = 10, iters: int = 50, device=None, dtype=None,
+                   tokens: int = None, save_json: Optional[str] = None,
+                   input_shape: Optional[Tuple] = None, input_seed_base: int = 42):
+     """Context manager for benchmarking with comprehensive metrics and optional input generation."""
+
+     def run_benchmark(model_func, *args, **kwargs):
+         torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+         mem_before = memory_usage()
+
+         # Create input generator if input_shape is provided
+         input_generator = None
+         if input_shape is not None:
+             def create_input(iteration: int):
+                 # Use deterministic but different seed for each iteration
+                 iteration_seed = input_seed_base + iteration * 123  # Spread out seeds
+                 torch.manual_seed(iteration_seed)
+                 if torch.cuda.is_available():
+                     torch.cuda.manual_seed(iteration_seed)
+                 return torch.randn(*input_shape, device=device, dtype=dtype) * 0.1
+             input_generator = create_input
+
+         if input_generator:
+             result, avg_time = precise_timing(lambda x: model_func(x), warmup, iters, input_generator)
+         else:
+             result, avg_time = precise_timing(lambda: model_func(*args, **kwargs), warmup, iters)
+
+         mem_after = memory_usage()
+
+         # Calculate metrics
+         metrics = {
+             "avg_time_ms": avg_time * 1000,
+             "throughput_tokens_per_sec": tokens / avg_time if tokens else None,
+             "memory_allocated_gb": mem_after["allocated"],
+             "memory_cached_gb": mem_after["cached"],
+             "memory_increase_gb": mem_after["allocated"] - mem_before["allocated"],
+             "device": str(device) if device else "cpu",
+             "dtype": str(dtype) if dtype else "float32",
+             "tokens": tokens,
+             "warmup_iters": warmup,
+             "timing_iters": iters
+         }
+
+         # Print results
+         print(f"Average time: {metrics['avg_time_ms']:.3f} ms")
+         if tokens:
+             print(f"Throughput: {metrics['throughput_tokens_per_sec']:.0f} tokens/sec")
+         print(f"Memory allocated: {metrics['memory_allocated_gb']:.3f} GB")
+         print(f"Memory increase: {metrics['memory_increase_gb']:.3f} GB")
+
+         # Save to JSON if requested
+         if save_json:
+             with open(save_json, 'w') as f:
+                 json.dump(metrics, f, indent=2)
+
+         return result
+
+     yield run_benchmark
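For reference, bench_context can also be exercised on its own with a stand-in module; a hypothetical minimal usage (the dimensions mirror config.py, and the Linear layer is only a placeholder, not one of the benchmarked implementations):

    import torch
    from utils import to_dtype, set_seed, bench_context

    set_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dtype = to_dtype("bfloat16" if torch.cuda.is_available() else "float32")

    model = torch.nn.Linear(1152, 1152).to(device=device, dtype=dtype)  # placeholder model
    with bench_context(warmup=2, iters=5, device=device, dtype=dtype,
                       tokens=8 * 512, input_shape=(8, 512, 1152)) as bench:
        output = bench(model)  # a fresh seeded input tensor is generated for every iteration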
cells/visualization.py ADDED
@@ -0,0 +1,168 @@
+ # /// script
+ # dependencies = [
+ # "matplotlib",
+ # ]
+ # ///
+
+ import json
+ import matplotlib.pyplot as plt
+ import numpy as np
+ from pathlib import Path
+ import os
+
+ # Get result directories from environment variables
+ gptoss_dir = os.environ.get('UVNOTE_INPUT_GPTOSS_RUN', '.')
+ megablocks_dir = os.environ.get('UVNOTE_INPUT_MEGABLOCKS_RUN', '.')
+
+ print(f"Loading benchmark results from:")
+ print(f" GPT-OSS dir: {gptoss_dir}")
+ print(f" MegaBlocks dir: {megablocks_dir}")
+
+ # Load benchmark results
+ gptoss_file = Path(gptoss_dir) / 'gptoss_results.json'
+ megablocks_file = Path(megablocks_dir) / 'megablocks_results.json'
+
+ print(f"Loading results from:")
+ print(f" GPT-OSS: {gptoss_file}")
+ print(f" MegaBlocks: {megablocks_file}")
+
+ if not gptoss_file.exists():
+     print(f"Warning: {gptoss_file} not found")
+ if not megablocks_file.exists():
+     print(f"Warning: {megablocks_file} not found")
+
+ with open(gptoss_file, 'r') as f:
+     gptoss_results = json.load(f)
+
+ with open(megablocks_file, 'r') as f:
+     megablocks_results = json.load(f)
+
+ print(f"GPT-OSS results keys: {list(gptoss_results.keys())}")
+ print(f"MegaBlocks results keys: {list(megablocks_results.keys())}")
+
+ # Helper function to extract metrics from either old or new JSON format
+ def get_metric(results, metric_name, default=0):
+     """Extract metric from results, handling both old and new JSON formats."""
+     # New format (with stats dict)
+     if 'stats' in results:
+         return results['stats'].get(metric_name, default)
+     # Old format (direct keys)
+     elif metric_name in results:
+         return results[metric_name]
+     else:
+         return default
+
+ # Create comparison plots
+ fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
+
+ # Performance comparison
+ implementations = ['GPT-OSS', 'MegaBlocks']
+
+ # Extract timing metrics (handle both avg_ms and avg_time_ms)
+ gpt_time = get_metric(gptoss_results, 'avg_ms', get_metric(gptoss_results, 'avg_time_ms', 0))
+ mega_time = get_metric(megablocks_results, 'avg_ms', get_metric(megablocks_results, 'avg_time_ms', 0))
+ times = [gpt_time, mega_time]
+
+ # Extract throughput metrics
+ gpt_throughput = get_metric(gptoss_results, 'tokens_per_s', get_metric(gptoss_results, 'throughput_tokens_per_sec', 0))
+ mega_throughput = get_metric(megablocks_results, 'tokens_per_s', get_metric(megablocks_results, 'throughput_tokens_per_sec', 0))
+ throughputs = [gpt_throughput, mega_throughput]
+
+ # Extract memory metrics
+ gpt_memory = get_metric(gptoss_results, 'memory_allocated_gb', 0)
+ mega_memory = get_metric(megablocks_results, 'memory_allocated_gb', 0)
+ memory_usage = [gpt_memory, mega_memory]
+
+ gpt_mem_inc = get_metric(gptoss_results, 'memory_increase_gb', 0)
+ mega_mem_inc = get_metric(megablocks_results, 'memory_increase_gb', 0)
+ memory_increase = [gpt_mem_inc, mega_mem_inc]
+
+ print(f"Extracted metrics:")
+ print(f" Times (ms): {times}")
+ print(f" Throughputs: {throughputs}")
+ print(f" Memory usage (GB): {memory_usage}")
+ print(f" Memory increase (GB): {memory_increase}")
+
+ colors = ['#2E8B57', '#4169E1']
+
+ # Latency comparison
+ bars1 = ax1.bar(implementations, times, color=colors)
+ ax1.set_ylabel('Average Time (ms)')
+ ax1.set_title('Latency Comparison')
+ ax1.grid(True, alpha=0.3)
+
+ # Add values on bars
+ for bar, time in zip(bars1, times):
+     height = bar.get_height()
+     ax1.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+              f'{time:.2f}ms', ha='center', va='bottom')
+
+ # Throughput comparison
+ bars2 = ax2.bar(implementations, throughputs, color=colors)
+ ax2.set_ylabel('Tokens per Second')
+ ax2.set_title('Throughput Comparison')
+ ax2.grid(True, alpha=0.3)
+
+ # Add values on bars
+ for bar, throughput in zip(bars2, throughputs):
+     height = bar.get_height()
+     ax2.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+              f'{throughput:.0f}', ha='center', va='bottom')
+
+ # Memory usage comparison
+ bars3 = ax3.bar(implementations, memory_usage, color=colors)
+ ax3.set_ylabel('Memory Allocated (GB)')
+ ax3.set_title('Memory Usage Comparison')
+ ax3.grid(True, alpha=0.3)
+
+ # Add values on bars
+ for bar, mem in zip(bars3, memory_usage):
+     height = bar.get_height()
+     ax3.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+              f'{mem:.2f}GB', ha='center', va='bottom')
+
+ # Memory increase comparison
+ bars4 = ax4.bar(implementations, memory_increase, color=colors)
+ ax4.set_ylabel('Memory Increase (GB)')
+ ax4.set_title('Memory Increase Comparison')
+ ax4.grid(True, alpha=0.3)
+
+ # Add values on bars
+ for bar, mem_inc in zip(bars4, memory_increase):
+     height = bar.get_height()
+     ax4.text(bar.get_x() + bar.get_width()/2., height + height*0.01,
+              f'{mem_inc:.3f}GB', ha='center', va='bottom')
+
+ plt.tight_layout()
+ plt.savefig('small_moe_comparison.png', dpi=150, bbox_inches='tight')
+ plt.show()
+
+ # Print summary table
+ print("\n" + "="*60)
+ print("PERFORMANCE COMPARISON SUMMARY")
+ print("="*60)
+ print(f"{'Metric':<25} {'GPT-OSS':<15} {'MegaBlocks':<15} {'Winner':<10}")
+ print("-" * 60)
+
+ # Determine winners
+ latency_winner = "GPT-OSS" if times[0] < times[1] else "MegaBlocks"
+ throughput_winner = "GPT-OSS" if throughputs[0] > throughputs[1] else "MegaBlocks"
+ memory_winner = "GPT-OSS" if memory_usage[0] < memory_usage[1] else "MegaBlocks"
+ mem_inc_winner = "GPT-OSS" if memory_increase[0] < memory_increase[1] else "MegaBlocks"
+
+ print(f"{'Latency (ms)':<25} {times[0]:<15.2f} {times[1]:<15.2f} {latency_winner:<10}")
+ print(f"{'Throughput (tok/s)':<25} {throughputs[0]:<15.0f} {throughputs[1]:<15.0f} {throughput_winner:<10}")
+ print(f"{'Memory Usage (GB)':<25} {memory_usage[0]:<15.3f} {memory_usage[1]:<15.3f} {memory_winner:<10}")
+ print(f"{'Memory Increase (GB)':<25} {memory_increase[0]:<15.3f} {memory_increase[1]:<15.3f} {mem_inc_winner:<10}")
+
+ # Speed ratio
+ speed_ratio = times[1] / times[0] if times[0] < times[1] else times[0] / times[1]
+ faster_impl = latency_winner
+ print(f"\n{faster_impl} is {speed_ratio:.2f}x faster")
+
+ # Throughput ratio
+ throughput_ratio = max(throughputs) / min(throughputs)
+ higher_throughput = throughput_winner
+ print(f"{higher_throughput} has {throughput_ratio:.2f}x higher throughput")
+
+ print("="*60)
index.html CHANGED
The diff for this file is too large to render. See raw diff
 
small_compare.html ADDED
The diff for this file is too large to render. See raw diff