```python
"""Utility to dump NVIDIA GPU information."""

import subprocess


def nvidia_dump():
    """Dump NVIDIA GPU information."""
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, check=True)
        print("NVIDIA GPU Information:")
        print(result.stdout)
    except FileNotFoundError:
        print("nvidia-smi not found. Are you running on a machine with NVIDIA GPUs?")
    except subprocess.CalledProcessError as e:
        print(f"Error running nvidia-smi: {e}")


nvidia_dump()
```
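If only a few fields are needed, nvidia-smi's query mode gives a more compact dump. A minimal sketch (the query flags are standard nvidia-smi options, but field availability can vary by driver version):

```python
import subprocess


def nvidia_summary():
    """Print a one-line-per-GPU summary using nvidia-smi's query mode."""
    try:
        result = subprocess.run(
            ["nvidia-smi",
             "--query-gpu=index,name,memory.total,memory.used,utilization.gpu",
             "--format=csv,noheader"],
            capture_output=True, text=True, check=True)
        print(result.stdout.strip())
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        print(f"Could not query GPUs: {e}")


nvidia_summary()
```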
```python
"""Simple utilities for running the models."""

import torch


def to_dtype(dtype_str: str):
    """Convert string to torch dtype."""
    if dtype_str == "float16":
        return torch.float16
    if dtype_str == "bfloat16":
        return torch.bfloat16
    return torch.float32


def tensor_stats(t: torch.Tensor) -> str:
    """Generate stats string for a tensor."""
    return (f"shape={tuple(t.shape)}, "
            f"dtype={t.dtype}, "
            f"device={t.device}, "
            f"mean={t.mean().item():.6f}, "
            f"std={t.std().item():.6f}")


def set_seed(seed: int):
    """Set seeds for reproducibility."""
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


"""Reusable benchmarking utilities for performance testing."""

import time
import json
from contextlib import contextmanager
from typing import Callable, Dict, Tuple, Any, Optional

import numpy as np


def precise_timing(func: Callable[[], Any],
                   warmup: int = 5,
                   iters: int = 20,
                   input_generator: Optional[Callable[[int], Any]] = None) -> Tuple[Any, float]:
    """High-precision timing with warmup and optional per-iteration input generation."""
    # Warmup
    for i in range(warmup):
        if input_generator:
            inputs = input_generator(i)
            func(inputs)
        else:
            func()

    if torch.cuda.is_available():
        torch.cuda.synchronize()

    start = time.perf_counter()
    result = None
    for i in range(iters):
        if input_generator:
            inputs = input_generator(i + warmup)  # Continue the seed sequence after warmup
            result = func(inputs)
        else:
            result = func()

    if torch.cuda.is_available():
        torch.cuda.synchronize()
    end = time.perf_counter()

    avg_time = (end - start) / iters
    return result, avg_time


def memory_usage() -> Dict[str, float]:
    """Get current memory usage in GB."""
    if not torch.cuda.is_available():
        return {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}
    return {
        "allocated": torch.cuda.memory_allocated() / 1024**3,
        "cached": torch.cuda.memory_reserved() / 1024**3,
        "max_allocated": torch.cuda.max_memory_allocated() / 1024**3,
    }


@contextmanager
def bench_context(warmup: int = 10, iters: int = 50, device=None, dtype=None,
                  tokens: int = None, save_json: Optional[str] = None,
                  input_shape: Optional[Tuple] = None, input_seed_base: int = 42):
    """Context manager for benchmarking with comprehensive metrics and optional input generation."""

    def run_benchmark(model_func, *args, **kwargs):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        mem_before = memory_usage()

        # Create an input generator if input_shape is provided
        input_generator = None
        if input_shape is not None:
            def create_input(iteration: int):
                # Use a deterministic but different seed for each iteration
                iteration_seed = input_seed_base + iteration * 123  # Spread out the seeds
                torch.manual_seed(iteration_seed)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed(iteration_seed)
                return torch.randn(*input_shape, device=device, dtype=dtype) * 0.1
            input_generator = create_input

        if input_generator:
            result, avg_time = precise_timing(lambda x: model_func(x), warmup, iters, input_generator)
        else:
            result, avg_time = precise_timing(lambda: model_func(*args, **kwargs), warmup, iters)

        mem_after = memory_usage()

        # Calculate metrics
        metrics = {
            "avg_time_ms": avg_time * 1000,
            "throughput_tokens_per_sec": tokens / avg_time if tokens else None,
            "memory_allocated_gb": mem_after["allocated"],
            "memory_cached_gb": mem_after["cached"],
            "memory_increase_gb": mem_after["allocated"] - mem_before["allocated"],
            "device": str(device) if device else "cpu",
            "dtype": str(dtype) if dtype else "float32",
            "tokens": tokens,
            "warmup_iters": warmup,
            "timing_iters": iters,
        }

        # Print results
        print(f"Average time: {metrics['avg_time_ms']:.3f} ms")
        if tokens:
            print(f"Throughput: {metrics['throughput_tokens_per_sec']:.0f} tokens/sec")
        print(f"Memory allocated: {metrics['memory_allocated_gb']:.3f} GB")
        print(f"Memory increase: {metrics['memory_increase_gb']:.3f} GB")

        # Save to JSON if requested
        if save_json:
            with open(save_json, 'w') as f:
                json.dump(metrics, f, indent=2)

        return result

    yield run_benchmark
```
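To make the harness's interface concrete, here is a minimal usage sketch with a toy linear layer standing in for a real MoE block (the layer and sizes here are illustrative, not part of the benchmark):

```python
import torch
from utils import bench_context

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
layer = torch.nn.Linear(1152, 1152).to(device)  # toy stand-in for an MoE MLP

# Generates a fresh (8, 512, 1152) input each iteration and prints timing/memory metrics.
with bench_context(warmup=2, iters=5, device=device, dtype=torch.float32,
                   tokens=8 * 512, input_shape=(8, 512, 1152)) as bench:
    out = bench(layer)  # returns the last forward output; pass save_json=... to persist the metrics
```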
```python
"""Configuration for MoE benchmarks."""

import torch

# Model configuration
NUM_EXPERTS = 128
HIDDEN_SIZE = 1152
TOP_K = 4

# Benchmark configuration
BATCH_SIZE = 8
SEQ_LEN = 512
DTYPE = "bfloat16"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seeds for reproducibility
WEIGHT_SEED = 999
EXPERT_SEED = 777
INPUT_SEED = 123
GENERAL_SEED = 42

print("Configuration:")
print(f"  Experts: {NUM_EXPERTS}")
print(f"  Hidden size: {HIDDEN_SIZE}")
print(f"  Top-k: {TOP_K}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Sequence length: {SEQ_LEN}")
print(f"  Device: {DEVICE}")
print(f"  Dtype: {DTYPE}")
```
```python
"""Generate and save shared weights for consistent comparison."""

import torch
import numpy as np
from pathlib import Path

# Model configuration
NUM_EXPERTS = 128
HIDDEN_SIZE = 1152
INTERMEDIATE_SIZE = 3072
TOP_K = 4

# Input configuration
BATCH_SIZE = 1
SEQ_LEN = 100
DTYPE = "float32"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seeds for reproducibility
WEIGHT_SEED = 999
EXPERT_SEED = 777
INPUT_SEED = 123
GENERAL_SEED = 42


def set_seed(seed: int):
    """Set seeds for reproducibility."""
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)


# Generate shared weights for all implementations
print("Generating shared weights...")

# Router weights
set_seed(WEIGHT_SEED)
router_weight = torch.empty(NUM_EXPERTS, HIDDEN_SIZE)
torch.nn.init.kaiming_uniform_(router_weight)
router_bias = torch.zeros(NUM_EXPERTS)

# Expert weights - using proper dimensions for the combined gate/up projection
set_seed(EXPERT_SEED)
gate_up_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, 2 * HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
gate_up_proj_bias = torch.zeros(NUM_EXPERTS, 2 * HIDDEN_SIZE)
down_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
down_proj_bias = torch.zeros(NUM_EXPERTS, HIDDEN_SIZE)

# Save weights
torch.save(router_weight, 'router_weight.pt')
torch.save(router_bias, 'router_bias.pt')
torch.save(gate_up_proj, 'gate_up_proj.pt')
torch.save(gate_up_proj_bias, 'gate_up_proj_bias.pt')
torch.save(down_proj, 'down_proj.pt')
torch.save(down_proj_bias, 'down_proj_bias.pt')

print("Saved weights:")
print(f"  Router: {tuple(router_weight.shape)}")
print(f"  Gate/Up proj: {tuple(gate_up_proj.shape)}")
print(f"  Down proj: {tuple(down_proj.shape)}")
print(f"  Hidden size: {HIDDEN_SIZE}")
```
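One convention worth flagging for later: the combined gate/up projection has last dimension 2 * HIDDEN_SIZE, and the GPT-OSS expert code splits it with strided slices (even columns are the gate, odd columns are the up projection) rather than chunking it in half. A minimal sketch of that split:

```python
import torch

HIDDEN_SIZE = 1152
gate_up = torch.randn(4, 2 * HIDDEN_SIZE)             # a few rows of gate/up activations
gate, up = gate_up[..., ::2], gate_up[..., 1::2]       # even columns -> gate, odd columns -> up
assert gate.shape == up.shape == (4, HIDDEN_SIZE)
```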
GPT-OSS Implementation
This section benchmarks the GPT-OSS MoE implementation in non-training mode.
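For reference while reading the expert code below: each expert applies a clamped, sigmoid-gated GLU (with alpha = 1.702 and clamp limit 7.0) to an interleaved gate/up projection, and each expert's output is then scaled by its softmaxed top-k router score before being summed over experts:

$$
\begin{aligned}
g &= (x W_{\mathrm{gu}} + b_{\mathrm{gu}})_{\text{even cols}}, \qquad
u = (x W_{\mathrm{gu}} + b_{\mathrm{gu}})_{\text{odd cols}}, \\
g &\leftarrow \min(g,\ 7.0), \qquad u \leftarrow \mathrm{clip}(u,\ -7.0,\ 7.0), \\
y &= \big((u + 1) \odot g\,\sigma(1.702\,g)\big)\, W_{\mathrm{down}} + b_{\mathrm{down}}.
\end{aligned}
$$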
```python
import torch
from torch import nn
from torch.nn import functional as F

from utils import to_dtype, tensor_stats, set_seed, bench_context
from config import (
    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED,
)
from pathlib import Path
import os

# Discover the upstream artifact directory from the environment
data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')

# List all the files in the directory
print(f"Loading weights from: {data_dir}")
print(f"Files in directory: {list(Path(data_dir).glob('*'))}")

router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')

print("Loaded shared weights from artifacts")
print(f"Router weight sum: {router_weight.sum().item():.6f}")
print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
print(f"Down sum: {down_proj.sum().item():.6f}")


class GptOssRouter(nn.Module):
    def __init__(self, router_weight, router_bias):
        super().__init__()
        self.top_k = TOP_K
        self.num_experts = NUM_EXPERTS
        self.hidden_dim = HIDDEN_SIZE
        self.weight = nn.Parameter(router_weight.clone())
        self.bias = nn.Parameter(router_bias.clone())

    def forward(self, hidden_states):
        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
        router_logits = F.linear(hidden_states, self.weight, self.bias)
        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
        return router_scores, router_indices


class GptOssExperts(nn.Module):
    def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
        super().__init__()
        self.num_experts = NUM_EXPERTS
        self.hidden_size = HIDDEN_SIZE
        self.expert_dim = self.hidden_size
        self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
        self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
        self.down_proj = nn.Parameter(down_proj.clone())
        self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
        self.alpha = 1.702
        self.limit = 7.0

    def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
        batch_size = hidden_states.shape[0]
        hidden_states = hidden_states.reshape(-1, self.hidden_size)
        num_experts = routing_weights.shape[1]

        if hidden_states.device.type == "cpu" or self.training:
            # Sparse path: loop over the experts that were actually routed to
            next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
            with torch.no_grad():
                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
                expert_mask = expert_mask.permute(2, 1, 0)
                expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
            for expert_idx in expert_hit[:]:
                expert_idx = expert_idx[0]
                with torch.no_grad():
                    _, token_idx = torch.where(expert_mask[expert_idx])
                current_state = hidden_states[token_idx]
                gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
                gate, up = gate_up[..., ::2], gate_up[..., 1::2]
                gate = gate.clamp(min=None, max=self.limit)
                up = up.clamp(min=-self.limit, max=self.limit)
                glu = gate * torch.sigmoid(gate * self.alpha)
                gated_output = (up + 1) * glu
                out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
                weighted_output = out * routing_weights[token_idx, expert_idx, None]
                next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
            next_states = next_states.view(batch_size, -1, self.hidden_size)
        else:
            # Dense path: run every expert via batched matmuls, then weight by router scores
            hidden_states = hidden_states.repeat(num_experts, 1)
            hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
            gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
            gate, up = gate_up[..., ::2], gate_up[..., 1::2]
            gate = gate.clamp(min=None, max=self.limit)
            up = up.clamp(min=-self.limit, max=self.limit)
            glu = gate * torch.sigmoid(gate * self.alpha)
            next_states = torch.bmm(((up + 1) * glu), self.down_proj)
            next_states = next_states + self.down_proj_bias[..., None, :]
            next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
            next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
            next_states = next_states.sum(dim=0)
        return next_states


class GptOssMoEMLP(nn.Module):
    def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
        super().__init__()
        self.router = GptOssRouter(router_weight, router_bias)
        self.experts = GptOssExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)

    def forward(self, hidden_states):
        router_scores, router_indices = self.router(hidden_states)
        routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
        return routed_out, router_scores


# Run the model
set_seed(GENERAL_SEED)

device = torch.device(DEVICE)
dtype = to_dtype(DTYPE)

print("\n=== GPT-OSS Implementation ===")

# Initialize the model with the loaded weights
model = GptOssMoEMLP(
    router_weight.to(device, dtype=dtype),
    router_bias.to(device, dtype=dtype),
    gate_up_proj.to(device, dtype=dtype),
    gate_up_proj_bias.to(device, dtype=dtype),
    down_proj.to(device, dtype=dtype),
    down_proj_bias.to(device, dtype=dtype),
).to(device=device, dtype=dtype)

print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")

# Benchmark the model using a different input tensor on each iteration
tokens = BATCH_SIZE * SEQ_LEN
input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)

with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
                   save_json="gptoss_results.json", input_shape=input_shape,
                   input_seed_base=INPUT_SEED) as bench:
    output, stats = bench(model)
    print(f"\nOutput sum: {output[0].sum().item():.6f}")
```
Artifacts:
gptoss_results.json

MegaBlocks Implementation
This section benchmarks the MegaBlocks MoE implementation.
```python
import torch
from torch import nn
from torch.nn import functional as F

from kernels import get_kernel, get_local_kernel
from utils import to_dtype, tensor_stats, set_seed, bench_context
from config import (
    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED,
)
from pathlib import Path
from collections import namedtuple
import os

# Discover the upstream artifact directory from the environment
data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
print(f"Loading weights from: {data_dir}")

router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')

print("Loaded shared weights from artifacts")
print(f"Router weight sum: {router_weight.sum().item():.6f}")
print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
print(f"Down sum: {down_proj.sum().item():.6f}")


def build_megablocks_model(device: torch.device, dtype: torch.dtype):
    # Download optimized kernels from the Hugging Face hub
    megablocks = get_kernel("kernels-community/megablocks")
    # megablocks = get_local_kernel(
    #     Path("/home/ubuntu/Projects/megablocks-moe/build"), "megablocks")
    model = megablocks.layers.MegaBlocksMoeMLP()

    # Create an attribute container for expert weights
    model.experts = namedtuple(
        "Experts",
        ["gate_up_proj", "gate_up_proj_bias", "down_proj", "down_proj_bias", "hidden_size"],
    )

    # Use the loaded router weights for consistency
    model.router = torch.nn.Linear(HIDDEN_SIZE, NUM_EXPERTS, device=device, dtype=dtype)
    with torch.no_grad():
        model.router.weight.copy_(router_weight.to(dtype))
        model.router.bias.copy_(router_bias.to(dtype))

    # Attach the loaded expert weights to the experts container
    e = model.experts
    e.alpha = 1.702
    e.capacity_factor = 4
    e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device, dtype=dtype))
    e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device, dtype=dtype))
    e.down_proj = torch.nn.Parameter(down_proj.clone().to(device, dtype=dtype))
    e.down_proj_bias = torch.nn.Parameter(down_proj_bias.clone().to(device, dtype=dtype))
    e.hidden_size = HIDDEN_SIZE

    # Log weight statistics for comparison
    print(f"[MegaBlocks] Router weight sum: {model.router.weight.sum().item():.6f}")
    print(f"[MegaBlocks] Gate/up projection shape: {tuple(e.gate_up_proj.shape)}, sum: {e.gate_up_proj.sum().item():.6f}")
    print(f"[MegaBlocks] Down projection shape: {tuple(e.down_proj.shape)}, sum: {e.down_proj.sum().item():.6f}")

    return model


# Create a wrapper to match the interface of the other implementations
class MegaBlocksMoEWrapper(nn.Module):
    def __init__(self, megablocks_model):
        super().__init__()
        self.model = megablocks_model

    def forward(self, hidden_states):
        # MegaBlocks expects input in the format (batch, seq_len, hidden_dim)
        output, dummy_routing_weights = self.model(hidden_states)
        # Return output and dummy routing weights for consistency with the other implementations
        # dummy_routing_weights = torch.zeros(
        #     hidden_states.shape[0] * hidden_states.shape[1],
        #     NUM_EXPERTS,
        #     device=hidden_states.device,
        #     dtype=hidden_states.dtype,
        # )
        return output, dummy_routing_weights


# Run the model
set_seed(GENERAL_SEED)

device = torch.device(DEVICE)
dtype = to_dtype(DTYPE)

print("\n=== MegaBlocks Implementation ===")

# Build the MegaBlocks model with the loaded weights
megablocks_model = build_megablocks_model(device, dtype)
model = MegaBlocksMoEWrapper(megablocks_model).to(device=device, dtype=dtype)

# Benchmark the model using a different input tensor on each iteration
tokens = BATCH_SIZE * SEQ_LEN
input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)

with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
                   save_json="megablocks_results.json", input_shape=input_shape,
                   input_seed_base=INPUT_SEED) as bench:
    output, stats = bench(model)
    print(f"\nOutput sum: {output[0].sum().item():.6f}")
```
Artifacts:
megablocks_results.json

Performance Comparison
This section loads the benchmark results and creates visualizations comparing the two implementations.
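For orientation, the metrics dictionary that `bench_context` writes to each results file has roughly this shape (the values below are illustrative placeholders, not measured results):

```python
# Keys mirror the metrics dict assembled in bench_context.
example_results = {
    "avg_time_ms": 12.3,                  # average forward latency
    "throughput_tokens_per_sec": 333000,  # tokens / avg_time, when tokens was given
    "memory_allocated_gb": 1.2,
    "memory_cached_gb": 1.5,
    "memory_increase_gb": 0.01,
    "device": "cuda",
    "dtype": "torch.bfloat16",
    "tokens": 4096,
    "warmup_iters": 10,
    "timing_iters": 50,
}
```

The `get_metric` helper in the code below also tolerates an older layout in which these values live under a nested `stats` key with shorter names (`avg_ms`, `tokens_per_s`).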
```python
import json
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import os

# Get the result directories from environment variables
gptoss_dir = os.environ.get('UVNOTE_INPUT_GPTOSS_RUN', '.')
megablocks_dir = os.environ.get('UVNOTE_INPUT_MEGABLOCKS_RUN', '.')

print("Loading benchmark results from:")
print(f"  GPT-OSS dir: {gptoss_dir}")
print(f"  MegaBlocks dir: {megablocks_dir}")

# Load benchmark results
gptoss_file = Path(gptoss_dir) / 'gptoss_results.json'
megablocks_file = Path(megablocks_dir) / 'megablocks_results.json'

print("Loading results from:")
print(f"  GPT-OSS: {gptoss_file}")
print(f"  MegaBlocks: {megablocks_file}")

if not gptoss_file.exists():
    print(f"Warning: {gptoss_file} not found")
if not megablocks_file.exists():
    print(f"Warning: {megablocks_file} not found")

with open(gptoss_file, 'r') as f:
    gptoss_results = json.load(f)
with open(megablocks_file, 'r') as f:
    megablocks_results = json.load(f)

print(f"GPT-OSS results keys: {list(gptoss_results.keys())}")
print(f"MegaBlocks results keys: {list(megablocks_results.keys())}")


# Helper function to extract metrics from either the old or new JSON format
def get_metric(results, metric_name, default=0):
    """Extract a metric from results, handling both old and new JSON formats."""
    # New format (with stats dict)
    if 'stats' in results:
        return results['stats'].get(metric_name, default)
    # Old format (direct keys)
    elif metric_name in results:
        return results[metric_name]
    else:
        return default


# Create comparison plots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Performance comparison
implementations = ['GPT-OSS', 'MegaBlocks']

# Extract timing metrics (handle both avg_ms and avg_time_ms)
gpt_time = get_metric(gptoss_results, 'avg_ms', get_metric(gptoss_results, 'avg_time_ms', 0))
mega_time = get_metric(megablocks_results, 'avg_ms', get_metric(megablocks_results, 'avg_time_ms', 0))
times = [gpt_time, mega_time]

# Extract throughput metrics
gpt_throughput = get_metric(gptoss_results, 'tokens_per_s', get_metric(gptoss_results, 'throughput_tokens_per_sec', 0))
mega_throughput = get_metric(megablocks_results, 'tokens_per_s', get_metric(megablocks_results, 'throughput_tokens_per_sec', 0))
throughputs = [gpt_throughput, mega_throughput]

# Extract memory metrics
gpt_memory = get_metric(gptoss_results, 'memory_allocated_gb', 0)
mega_memory = get_metric(megablocks_results, 'memory_allocated_gb', 0)
memory_usage = [gpt_memory, mega_memory]

gpt_mem_inc = get_metric(gptoss_results, 'memory_increase_gb', 0)
mega_mem_inc = get_metric(megablocks_results, 'memory_increase_gb', 0)
memory_increase = [gpt_mem_inc, mega_mem_inc]

print("Extracted metrics:")
print(f"  Times (ms): {times}")
print(f"  Throughputs: {throughputs}")
print(f"  Memory usage (GB): {memory_usage}")
print(f"  Memory increase (GB): {memory_increase}")

colors = ['#2E8B57', '#4169E1']

# Latency comparison
bars1 = ax1.bar(implementations, times, color=colors)
ax1.set_ylabel('Average Time (ms)')
ax1.set_title('Latency Comparison')
ax1.grid(True, alpha=0.3)
# Add values on bars
for bar, time in zip(bars1, times):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width() / 2., height + height * 0.01,
             f'{time:.2f}ms', ha='center', va='bottom')

# Throughput comparison
bars2 = ax2.bar(implementations, throughputs, color=colors)
ax2.set_ylabel('Tokens per Second')
ax2.set_title('Throughput Comparison')
ax2.grid(True, alpha=0.3)
# Add values on bars
for bar, throughput in zip(bars2, throughputs):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width() / 2., height + height * 0.01,
             f'{throughput:.0f}', ha='center', va='bottom')

# Memory usage comparison
bars3 = ax3.bar(implementations, memory_usage, color=colors)
ax3.set_ylabel('Memory Allocated (GB)')
ax3.set_title('Memory Usage Comparison')
ax3.grid(True, alpha=0.3)
# Add values on bars
for bar, mem in zip(bars3, memory_usage):
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width() / 2., height + height * 0.01,
             f'{mem:.2f}GB', ha='center', va='bottom')

# Memory increase comparison
bars4 = ax4.bar(implementations, memory_increase, color=colors)
ax4.set_ylabel('Memory Increase (GB)')
ax4.set_title('Memory Increase Comparison')
ax4.grid(True, alpha=0.3)
# Add values on bars
for bar, mem_inc in zip(bars4, memory_increase):
    height = bar.get_height()
    ax4.text(bar.get_x() + bar.get_width() / 2., height + height * 0.01,
             f'{mem_inc:.3f}GB', ha='center', va='bottom')

plt.tight_layout()
plt.savefig('small_moe_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

# Print summary table
print("\n" + "=" * 60)
print("PERFORMANCE COMPARISON SUMMARY")
print("=" * 60)
print(f"{'Metric':<25} {'GPT-OSS':<15} {'MegaBlocks':<15} {'Winner':<10}")
print("-" * 60)

# Determine winners
latency_winner = "GPT-OSS" if times[0] < times[1] else "MegaBlocks"
throughput_winner = "GPT-OSS" if throughputs[0] > throughputs[1] else "MegaBlocks"
memory_winner = "GPT-OSS" if memory_usage[0] < memory_usage[1] else "MegaBlocks"
mem_inc_winner = "GPT-OSS" if memory_increase[0] < memory_increase[1] else "MegaBlocks"

print(f"{'Latency (ms)':<25} {times[0]:<15.2f} {times[1]:<15.2f} {latency_winner:<10}")
print(f"{'Throughput (tok/s)':<25} {throughputs[0]:<15.0f} {throughputs[1]:<15.0f} {throughput_winner:<10}")
print(f"{'Memory Usage (GB)':<25} {memory_usage[0]:<15.3f} {memory_usage[1]:<15.3f} {memory_winner:<10}")
print(f"{'Memory Increase (GB)':<25} {memory_increase[0]:<15.3f} {memory_increase[1]:<15.3f} {mem_inc_winner:<10}")

# Speed ratio
speed_ratio = times[1] / times[0] if times[0] < times[1] else times[0] / times[1]
faster_impl = latency_winner
print(f"\n{faster_impl} is {speed_ratio:.2f}x faster")

# Throughput ratio
throughput_ratio = max(throughputs) / min(throughputs)
higher_throughput = throughput_winner
print(f"{higher_throughput} has {throughput_ratio:.2f}x higher throughput")

print("=" * 60)
```
Conclusion
This focused benchmark compares the GPT-OSS (non-training mode) and MegaBlocks MoE implementations on the same hardware with identical weights and inputs. The comparison focuses on the following metrics (a short worked example of the throughput arithmetic follows the list):
- Latency: Average forward pass time
- Throughput: Tokens processed per second
- Memory Usage: GPU memory consumption
- Memory Efficiency: Memory increase during execution
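As a quick check on how the throughput figure relates to latency, the harness derives it directly from the fixed token count per forward pass (the latency below is hypothetical):

```python
BATCH_SIZE, SEQ_LEN = 8, 512
tokens_per_pass = BATCH_SIZE * SEQ_LEN                    # 4096 tokens per forward pass
avg_time_ms = 10.0                                        # hypothetical measured latency
tokens_per_sec = tokens_per_pass / (avg_time_ms / 1000.0)
print(f"{tokens_per_sec:,.0f} tokens/sec")                # 409,600 tokens/sec
```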
Both implementations use:
- 128 experts with top-4 routing
- 1152 hidden dimensions
- Batch size of 8, sequence length of 512
- bfloat16 precision
- Identical pre-generated weights for fair comparison
The results show the performance characteristics of each approach, helping identify the optimal implementation for different use cases.