```python
"""Utility to dump NVIDIA GPU information."""

import subprocess


def nvidia_dump():
    """Dump NVIDIA GPU information."""
    try:
        result = subprocess.run(['nvidia-smi'], capture_output=True, text=True, check=True)
        print("NVIDIA GPU Information:")
        print(result.stdout)
    except FileNotFoundError:
        print("nvidia-smi not found. Are you running on a machine with NVIDIA GPUs?")
    except subprocess.CalledProcessError as e:
        print(f"Error running nvidia-smi: {e}")


nvidia_dump()
```
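If only a few fields are needed, nvidia-smi's query mode gives a more compact dump. A minimal sketch (the query flags are standard nvidia-smi options, but field availability can vary by driver version):

```python
import subprocess


def nvidia_summary():
    """Print a one-line-per-GPU summary using nvidia-smi's query mode."""
    try:
        result = subprocess.run(
            ["nvidia-smi",
             "--query-gpu=index,name,memory.total,memory.used,utilization.gpu",
             "--format=csv,noheader"],
            capture_output=True, text=True, check=True)
        print(result.stdout.strip())
    except (FileNotFoundError, subprocess.CalledProcessError) as e:
        print(f"Could not query GPUs: {e}")


nvidia_summary()
```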
```python
"""Simple utilities for running the models."""

import torch


def to_dtype(dtype_str: str):
    """Convert string to torch dtype."""
    if dtype_str == "float16":
        return torch.float16
    if dtype_str == "bfloat16":
        return torch.bfloat16
    return torch.float32


def tensor_stats(t: torch.Tensor) -> str:
    """Generate stats string for a tensor."""
    return (f"shape={tuple(t.shape)}, "
            f"dtype={t.dtype}, "
            f"device={t.device}, "
            f"mean={t.mean().item():.6f}, "
            f"std={t.std().item():.6f}")


def set_seed(seed: int):
    """Set seeds for reproducibility."""
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


"""Reusable benchmarking utilities for performance testing."""

import time
import json
from contextlib import contextmanager
from typing import Callable, Dict, Tuple, Any, Optional

import numpy as np


def precise_timing(func: Callable[[], Any],
                   warmup: int = 5,
                   iters: int = 20,
                   input_generator: Optional[Callable[[int], Any]] = None) -> Tuple[Any, float]:
    """High-precision timing with warmup and optional per-iteration input generation."""
    # Warmup
    for i in range(warmup):
        if input_generator:
            inputs = input_generator(i)
            func(inputs)
        else:
            func()

    if torch.cuda.is_available():
        torch.cuda.synchronize()

    start = time.perf_counter()
    result = None
    for i in range(iters):
        if input_generator:
            inputs = input_generator(i + warmup)  # Continue the seed sequence after warmup
            result = func(inputs)
        else:
            result = func()

    if torch.cuda.is_available():
        torch.cuda.synchronize()
    end = time.perf_counter()

    avg_time = (end - start) / iters
    return result, avg_time


def memory_usage() -> Dict[str, float]:
    """Get current memory usage in GB."""
    if not torch.cuda.is_available():
        return {"allocated": 0.0, "cached": 0.0, "max_allocated": 0.0}
    return {
        "allocated": torch.cuda.memory_allocated() / 1024**3,
        "cached": torch.cuda.memory_reserved() / 1024**3,
        "max_allocated": torch.cuda.max_memory_allocated() / 1024**3,
    }


@contextmanager
def bench_context(warmup: int = 10, iters: int = 50, device=None, dtype=None,
                  tokens: int = None, save_json: Optional[str] = None,
                  input_shape: Optional[Tuple] = None, input_seed_base: int = 42):
    """Context manager for benchmarking with comprehensive metrics and optional input generation."""

    def run_benchmark(model_func, *args, **kwargs):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        mem_before = memory_usage()

        # Create an input generator if input_shape is provided
        input_generator = None
        if input_shape is not None:
            def create_input(iteration: int):
                # Use a deterministic but different seed for each iteration
                iteration_seed = input_seed_base + iteration * 123  # Spread out the seeds
                torch.manual_seed(iteration_seed)
                if torch.cuda.is_available():
                    torch.cuda.manual_seed(iteration_seed)
                return torch.randn(*input_shape, device=device, dtype=dtype) * 0.1
            input_generator = create_input

        if input_generator:
            result, avg_time = precise_timing(lambda x: model_func(x), warmup, iters, input_generator)
        else:
            result, avg_time = precise_timing(lambda: model_func(*args, **kwargs), warmup, iters)

        mem_after = memory_usage()

        # Calculate metrics
        metrics = {
            "avg_time_ms": avg_time * 1000,
            "throughput_tokens_per_sec": tokens / avg_time if tokens else None,
            "memory_allocated_gb": mem_after["allocated"],
            "memory_cached_gb": mem_after["cached"],
            "memory_increase_gb": mem_after["allocated"] - mem_before["allocated"],
            "device": str(device) if device else "cpu",
            "dtype": str(dtype) if dtype else "float32",
            "tokens": tokens,
            "warmup_iters": warmup,
            "timing_iters": iters,
        }

        # Print results
        print(f"Average time: {metrics['avg_time_ms']:.3f} ms")
        if tokens:
            print(f"Throughput: {metrics['throughput_tokens_per_sec']:.0f} tokens/sec")
        print(f"Memory allocated: {metrics['memory_allocated_gb']:.3f} GB")
        print(f"Memory increase: {metrics['memory_increase_gb']:.3f} GB")

        # Save to JSON if requested
        if save_json:
            with open(save_json, 'w') as f:
                json.dump(metrics, f, indent=2)

        return result

    yield run_benchmark
```
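To make the harness's interface concrete, here is a minimal usage sketch with a toy linear layer standing in for a real MoE block (the layer and sizes here are illustrative, not part of the benchmark):

```python
import torch
from utils import bench_context

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
layer = torch.nn.Linear(1152, 1152).to(device)  # toy stand-in for an MoE MLP

# Generates a fresh (8, 512, 1152) input each iteration and prints timing/memory metrics.
with bench_context(warmup=2, iters=5, device=device, dtype=torch.float32,
                   tokens=8 * 512, input_shape=(8, 512, 1152)) as bench:
    out = bench(layer)  # returns the last forward output; pass save_json=... to persist the metrics
```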
```python
"""Configuration for MoE benchmarks."""

import torch

# Model configuration
NUM_EXPERTS = 128
HIDDEN_SIZE = 1152
TOP_K = 4

# Benchmark configuration
BATCH_SIZE = 8
SEQ_LEN = 512
DTYPE = "bfloat16"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seeds for reproducibility
WEIGHT_SEED = 999
EXPERT_SEED = 777
INPUT_SEED = 123
GENERAL_SEED = 42

print("Configuration:")
print(f"  Experts: {NUM_EXPERTS}")
print(f"  Hidden size: {HIDDEN_SIZE}")
print(f"  Top-k: {TOP_K}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Sequence length: {SEQ_LEN}")
print(f"  Device: {DEVICE}")
print(f"  Dtype: {DTYPE}")
```
```python
"""Generate and save shared weights for consistent comparison."""

import torch
import numpy as np
from pathlib import Path

# Model configuration
NUM_EXPERTS = 128
HIDDEN_SIZE = 1152
INTERMEDIATE_SIZE = 3072
TOP_K = 4

# Input configuration
BATCH_SIZE = 1
SEQ_LEN = 100
DTYPE = "float32"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seeds for reproducibility
WEIGHT_SEED = 999
EXPERT_SEED = 777
INPUT_SEED = 123
GENERAL_SEED = 42


def set_seed(seed: int):
    """Set seeds for reproducibility."""
    torch.manual_seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)


# Generate shared weights for all implementations
print("Generating shared weights...")

# Router weights
set_seed(WEIGHT_SEED)
router_weight = torch.empty(NUM_EXPERTS, HIDDEN_SIZE)
torch.nn.init.kaiming_uniform_(router_weight)
router_bias = torch.zeros(NUM_EXPERTS)

# Expert weights - using proper dimensions for the combined gate/up projection
set_seed(EXPERT_SEED)
gate_up_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, 2 * HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
gate_up_proj_bias = torch.zeros(NUM_EXPERTS, 2 * HIDDEN_SIZE)
down_proj = torch.empty(NUM_EXPERTS, HIDDEN_SIZE, HIDDEN_SIZE).normal_(mean=0.0, std=0.02)
down_proj_bias = torch.zeros(NUM_EXPERTS, HIDDEN_SIZE)

# Save weights
torch.save(router_weight, 'router_weight.pt')
torch.save(router_bias, 'router_bias.pt')
torch.save(gate_up_proj, 'gate_up_proj.pt')
torch.save(gate_up_proj_bias, 'gate_up_proj_bias.pt')
torch.save(down_proj, 'down_proj.pt')
torch.save(down_proj_bias, 'down_proj_bias.pt')

print("Saved weights:")
print(f"  Router: {tuple(router_weight.shape)}")
print(f"  Gate/Up proj: {tuple(gate_up_proj.shape)}")
print(f"  Down proj: {tuple(down_proj.shape)}")
print(f"  Hidden size: {HIDDEN_SIZE}")
```
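One convention worth flagging for later: the combined gate/up projection has last dimension 2 * HIDDEN_SIZE, and the GPT-OSS expert code splits it with strided slices (even columns are the gate, odd columns are the up projection) rather than chunking it in half. A minimal sketch of that split:

```python
import torch

HIDDEN_SIZE = 1152
gate_up = torch.randn(4, 2 * HIDDEN_SIZE)             # a few rows of gate/up activations
gate, up = gate_up[..., ::2], gate_up[..., 1::2]       # even columns -> gate, odd columns -> up
assert gate.shape == up.shape == (4, HIDDEN_SIZE)
```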
GPT-OSS Implementation
This section benchmarks the GPT-OSS MoE implementation in non-training mode.
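For reference while reading the expert code below: each expert applies a clamped, sigmoid-gated GLU (with alpha = 1.702 and clamp limit 7.0) to an interleaved gate/up projection, and each expert's output is then scaled by its softmaxed top-k router score before being summed over experts:

$$
\begin{aligned}
g &= (x W_{\mathrm{gu}} + b_{\mathrm{gu}})_{\text{even cols}}, \qquad
u = (x W_{\mathrm{gu}} + b_{\mathrm{gu}})_{\text{odd cols}}, \\
g &\leftarrow \min(g,\ 7.0), \qquad u \leftarrow \mathrm{clip}(u,\ -7.0,\ 7.0), \\
y &= \big((u + 1) \odot g\,\sigma(1.702\,g)\big)\, W_{\mathrm{down}} + b_{\mathrm{down}}.
\end{aligned}
$$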
```python
import torch
from torch import nn
from torch.nn import functional as F

from utils import to_dtype, tensor_stats, set_seed, bench_context
from config import (
    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED,
)
from pathlib import Path
import os

# Discover the upstream artifact directory from the environment
data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')

# List all the files in the directory
print(f"Loading weights from: {data_dir}")
print(f"Files in directory: {list(Path(data_dir).glob('*'))}")

router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')

print("Loaded shared weights from artifacts")
print(f"Router weight sum: {router_weight.sum().item():.6f}")
print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
print(f"Down sum: {down_proj.sum().item():.6f}")


class GptOssRouter(nn.Module):
    def __init__(self, router_weight, router_bias):
        super().__init__()
        self.top_k = TOP_K
        self.num_experts = NUM_EXPERTS
        self.hidden_dim = HIDDEN_SIZE
        self.weight = nn.Parameter(router_weight.clone())
        self.bias = nn.Parameter(router_bias.clone())

    def forward(self, hidden_states):
        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
        router_logits = F.linear(hidden_states, self.weight, self.bias)
        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=-1)
        router_top_value = torch.nn.functional.softmax(router_top_value, dim=1, dtype=router_top_value.dtype)
        router_scores = torch.zeros_like(router_logits).scatter_(1, router_indices, router_top_value)
        return router_scores, router_indices


class GptOssExperts(nn.Module):
    def __init__(self, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
        super().__init__()
        self.num_experts = NUM_EXPERTS
        self.hidden_size = HIDDEN_SIZE
        self.expert_dim = self.hidden_size
        self.gate_up_proj = nn.Parameter(gate_up_proj.clone())
        self.gate_up_proj_bias = nn.Parameter(gate_up_proj_bias.clone())
        self.down_proj = nn.Parameter(down_proj.clone())
        self.down_proj_bias = nn.Parameter(down_proj_bias.clone())
        self.alpha = 1.702
        self.limit = 7.0

    def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weights=None) -> torch.Tensor:
        batch_size = hidden_states.shape[0]
        hidden_states = hidden_states.reshape(-1, self.hidden_size)
        num_experts = routing_weights.shape[1]

        if hidden_states.device.type == "cpu" or self.training:
            # Sparse path: loop over the experts that were actually routed to
            next_states = torch.zeros_like(hidden_states, dtype=hidden_states.dtype, device=hidden_states.device)
            with torch.no_grad():
                expert_mask = torch.nn.functional.one_hot(router_indices, num_classes=num_experts)
                expert_mask = expert_mask.permute(2, 1, 0)
                expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
            for expert_idx in expert_hit[:]:
                expert_idx = expert_idx[0]
                with torch.no_grad():
                    _, token_idx = torch.where(expert_mask[expert_idx])
                current_state = hidden_states[token_idx]
                gate_up = current_state @ self.gate_up_proj[expert_idx] + self.gate_up_proj_bias[expert_idx]
                gate, up = gate_up[..., ::2], gate_up[..., 1::2]
                gate = gate.clamp(min=None, max=self.limit)
                up = up.clamp(min=-self.limit, max=self.limit)
                glu = gate * torch.sigmoid(gate * self.alpha)
                gated_output = (up + 1) * glu
                out = gated_output @ self.down_proj[expert_idx] + self.down_proj_bias[expert_idx]
                weighted_output = out * routing_weights[token_idx, expert_idx, None]
                next_states.index_add_(0, token_idx, weighted_output.to(hidden_states.dtype))
            next_states = next_states.view(batch_size, -1, self.hidden_size)
        else:
            # Dense path: run every expert via batched matmuls, then weight by router scores
            hidden_states = hidden_states.repeat(num_experts, 1)
            hidden_states = hidden_states.view(num_experts, -1, self.hidden_size)
            gate_up = torch.bmm(hidden_states, self.gate_up_proj) + self.gate_up_proj_bias[..., None, :]
            gate, up = gate_up[..., ::2], gate_up[..., 1::2]
            gate = gate.clamp(min=None, max=self.limit)
            up = up.clamp(min=-self.limit, max=self.limit)
            glu = gate * torch.sigmoid(gate * self.alpha)
            next_states = torch.bmm(((up + 1) * glu), self.down_proj)
            next_states = next_states + self.down_proj_bias[..., None, :]
            next_states = next_states.view(num_experts, batch_size, -1, self.hidden_size)
            next_states = next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
            next_states = next_states.sum(dim=0)
        return next_states


class GptOssMoEMLP(nn.Module):
    def __init__(self, router_weight, router_bias, gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias):
        super().__init__()
        self.router = GptOssRouter(router_weight, router_bias)
        self.experts = GptOssExperts(gate_up_proj, gate_up_proj_bias, down_proj, down_proj_bias)

    def forward(self, hidden_states):
        router_scores, router_indices = self.router(hidden_states)
        routed_out = self.experts(hidden_states, router_indices=router_indices, routing_weights=router_scores)
        return routed_out, router_scores


# Run the model
set_seed(GENERAL_SEED)

device = torch.device(DEVICE)
dtype = to_dtype(DTYPE)

print("\n=== GPT-OSS Implementation ===")

# Initialize the model with the loaded weights
model = GptOssMoEMLP(
    router_weight.to(device, dtype=dtype),
    router_bias.to(device, dtype=dtype),
    gate_up_proj.to(device, dtype=dtype),
    gate_up_proj_bias.to(device, dtype=dtype),
    down_proj.to(device, dtype=dtype),
    down_proj_bias.to(device, dtype=dtype),
).to(device=device, dtype=dtype)

print(f"Router weight sum: {model.router.weight.sum().item():.6f}")
print(f"Gate/up proj sum: {model.experts.gate_up_proj.sum().item():.6f}")
print(f"Down proj sum: {model.experts.down_proj.sum().item():.6f}")

# Benchmark the model using a different input tensor on each iteration
tokens = BATCH_SIZE * SEQ_LEN
input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)

with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
                   save_json="gptoss_results.json", input_shape=input_shape,
                   input_seed_base=INPUT_SEED) as bench:
    output, stats = bench(model)
    print(f"\nOutput sum: {output[0].sum().item():.6f}")
```
Artifacts:
gptoss_results.json

MegaBlocks Implementation
This section benchmarks the MegaBlocks MoE implementation.
```python
import torch
from torch import nn
from torch.nn import functional as F

from kernels import get_kernel, get_local_kernel
from utils import to_dtype, tensor_stats, set_seed, bench_context
from config import (
    NUM_EXPERTS, HIDDEN_SIZE, TOP_K,
    BATCH_SIZE, SEQ_LEN, DTYPE, DEVICE,
    WEIGHT_SEED, EXPERT_SEED, INPUT_SEED, GENERAL_SEED,
)
from pathlib import Path
from collections import namedtuple
import os

# Discover the upstream artifact directory from the environment
data_dir = os.environ.get('UVNOTE_INPUT_SAVE_DATA', '.')
print(f"Loading weights from: {data_dir}")

router_weight = torch.load(Path(data_dir) / 'router_weight.pt')
router_bias = torch.load(Path(data_dir) / 'router_bias.pt')
gate_up_proj = torch.load(Path(data_dir) / 'gate_up_proj.pt')
gate_up_proj_bias = torch.load(Path(data_dir) / 'gate_up_proj_bias.pt')
down_proj = torch.load(Path(data_dir) / 'down_proj.pt')
down_proj_bias = torch.load(Path(data_dir) / 'down_proj_bias.pt')

print("Loaded shared weights from artifacts")
print(f"Router weight sum: {router_weight.sum().item():.6f}")
print(f"Gate/up sum: {gate_up_proj.sum().item():.6f}")
print(f"Down sum: {down_proj.sum().item():.6f}")


def build_megablocks_model(device: torch.device, dtype: torch.dtype):
    # Download optimized kernels from the Hugging Face hub
    megablocks = get_kernel("kernels-community/megablocks")
    # megablocks = get_local_kernel(
    #     Path("/home/ubuntu/Projects/megablocks-moe/build"), "megablocks")
    model = megablocks.layers.MegaBlocksMoeMLP()

    # Create an attribute container for expert weights
    model.experts = namedtuple(
        "Experts",
        ["gate_up_proj", "gate_up_proj_bias", "down_proj", "down_proj_bias", "hidden_size"],
    )

    # Use the loaded router weights for consistency
    model.router = torch.nn.Linear(HIDDEN_SIZE, NUM_EXPERTS, device=device, dtype=dtype)
    with torch.no_grad():
        model.router.weight.copy_(router_weight.to(dtype))
        model.router.bias.copy_(router_bias.to(dtype))

    # Attach the loaded expert weights to the experts container
    e = model.experts
    e.alpha = 1.702
    e.capacity_factor = 4
    e.gate_up_proj = torch.nn.Parameter(gate_up_proj.clone().to(device, dtype=dtype))
    e.gate_up_proj_bias = torch.nn.Parameter(gate_up_proj_bias.clone().to(device, dtype=dtype))
    e.down_proj = torch.nn.Parameter(down_proj.clone().to(device, dtype=dtype))
    e.down_proj_bias = torch.nn.Parameter(down_proj_bias.clone().to(device, dtype=dtype))
    e.hidden_size = HIDDEN_SIZE

    # Log weight statistics for comparison
    print(f"[MegaBlocks] Router weight sum: {model.router.weight.sum().item():.6f}")
    print(f"[MegaBlocks] Gate/up projection shape: {tuple(e.gate_up_proj.shape)}, sum: {e.gate_up_proj.sum().item():.6f}")
    print(f"[MegaBlocks] Down projection shape: {tuple(e.down_proj.shape)}, sum: {e.down_proj.sum().item():.6f}")

    return model


# Create a wrapper to match the interface of the other implementations
class MegaBlocksMoEWrapper(nn.Module):
    def __init__(self, megablocks_model):
        super().__init__()
        self.model = megablocks_model

    def forward(self, hidden_states):
        # MegaBlocks expects input in the format (batch, seq_len, hidden_dim)
        output, dummy_routing_weights = self.model(hidden_states)
        # Return output and dummy routing weights for consistency with the other implementations
        # dummy_routing_weights = torch.zeros(
        #     hidden_states.shape[0] * hidden_states.shape[1],
        #     NUM_EXPERTS,
        #     device=hidden_states.device,
        #     dtype=hidden_states.dtype,
        # )
        return output, dummy_routing_weights


# Run the model
set_seed(GENERAL_SEED)

device = torch.device(DEVICE)
dtype = to_dtype(DTYPE)

print("\n=== MegaBlocks Implementation ===")

# Build the MegaBlocks model with the loaded weights
megablocks_model = build_megablocks_model(device, dtype)
model = MegaBlocksMoEWrapper(megablocks_model).to(device=device, dtype=dtype)

# Benchmark the model using a different input tensor on each iteration
tokens = BATCH_SIZE * SEQ_LEN
input_shape = (BATCH_SIZE, SEQ_LEN, HIDDEN_SIZE)

with bench_context(warmup=10, iters=50, device=device, dtype=dtype, tokens=tokens,
                   save_json="megablocks_results.json", input_shape=input_shape,
                   input_seed_base=INPUT_SEED) as bench:
    output, stats = bench(model)
    print(f"\nOutput sum: {output[0].sum().item():.6f}")
```
Artifacts:
megablocks_results.json

Performance Comparison
This section loads the benchmark results and creates visualizations comparing the two implementations.
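For orientation, the metrics dictionary that `bench_context` writes to each results file has roughly this shape (the values below are illustrative placeholders, not measured results):

```python
# Keys mirror the metrics dict assembled in bench_context.
example_results = {
    "avg_time_ms": 12.3,                  # average forward latency
    "throughput_tokens_per_sec": 333000,  # tokens / avg_time, when tokens was given
    "memory_allocated_gb": 1.2,
    "memory_cached_gb": 1.5,
    "memory_increase_gb": 0.01,
    "device": "cuda",
    "dtype": "torch.bfloat16",
    "tokens": 4096,
    "warmup_iters": 10,
    "timing_iters": 50,
}
```

The `get_metric` helper in the code below also tolerates an older layout in which these values live under a nested `stats` key with shorter names (`avg_ms`, `tokens_per_s`).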
```python
import json
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import os

# Get the result directories from environment variables
gptoss_dir = os.environ.get('UVNOTE_INPUT_GPTOSS_RUN', '.')
megablocks_dir = os.environ.get('UVNOTE_INPUT_MEGABLOCKS_RUN', '.')

print("Loading benchmark results from:")
print(f"  GPT-OSS dir: {gptoss_dir}")
print(f"  MegaBlocks dir: {megablocks_dir}")

# Load benchmark results
gptoss_file = Path(gptoss_dir) / 'gptoss_results.json'
megablocks_file = Path(megablocks_dir) / 'megablocks_results.json'

print("Loading results from:")
print(f"  GPT-OSS: {gptoss_file}")
print(f"  MegaBlocks: {megablocks_file}")

if not gptoss_file.exists():
    print(f"Warning: {gptoss_file} not found")
if not megablocks_file.exists():
    print(f"Warning: {megablocks_file} not found")

with open(gptoss_file, 'r') as f:
    gptoss_results = json.load(f)
with open(megablocks_file, 'r') as f:
    megablocks_results = json.load(f)

print(f"GPT-OSS results keys: {list(gptoss_results.keys())}")
print(f"MegaBlocks results keys: {list(megablocks_results.keys())}")


# Helper function to extract metrics from either the old or new JSON format
def get_metric(results, metric_name, default=0):
    """Extract a metric from results, handling both old and new JSON formats."""
    # New format (with stats dict)
    if 'stats' in results:
        return results['stats'].get(metric_name, default)
    # Old format (direct keys)
    elif metric_name in results:
        return results[metric_name]
    else:
        return default


# Create comparison plots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Performance comparison
implementations = ['GPT-OSS', 'MegaBlocks']

# Extract timing metrics (handle both avg_ms and avg_time_ms)
gpt_time = get_metric(gptoss_results, 'avg_ms', get_metric(gptoss_results, 'avg_time_ms', 0))
mega_time = get_metric(megablocks_results, 'avg_ms', get_metric(megablocks_results, 'avg_time_ms', 0))
times = [gpt_time, mega_time]

# Extract throughput metrics
gpt_throughput = get_metric(gptoss_results, 'tokens_per_s', get_metric(gptoss_results, 'throughput_tokens_per_sec', 0))
mega_throughput = get_metric(megablocks_results, 'tokens_per_s', get_metric(megablocks_results, 'throughput_tokens_per_sec', 0))
throughputs = [gpt_throughput, mega_throughput]

# Extract memory metrics
gpt_memory = get_metric(gptoss_results, 'memory_allocated_gb', 0)
mega_memory = get_metric(megablocks_results, 'memory_allocated_gb', 0)
memory_usage = [gpt_memory, mega_memory]

gpt_mem_inc = get_metric(gptoss_results, 'memory_increase_gb', 0)
mega_mem_inc = get_metric(megablocks_results, 'memory_increase_gb', 0)
memory_increase = [gpt_mem_inc, mega_mem_inc]

print("Extracted metrics:")
print(f"  Times (ms): {times}")
print(f"  Throughputs: {throughputs}")
print(f"  Memory usage (GB): {memory_usage}")
print(f"  Memory increase (GB): {memory_increase}")

colors = ['#2E8B57', '#4169E1']

# Latency comparison
bars1 = ax1.bar(implementations, times, color=colors)
ax1.set_ylabel('Average Time (ms)')
ax1.set_title('Latency Comparison')
ax1.grid(True, alpha=0.3)
# Add values on bars
for bar, time in zip(bars1, times):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width() / 2., height + height * 0.01,
             f'{time:.2f}ms', ha='center', va='bottom')

# Throughput comparison
bars2 = ax2.bar(implementations, throughputs, color=colors)
ax2.set_ylabel('Tokens per Second')
ax2.set_title('Throughput Comparison')
ax2.grid(True, alpha=0.3)
# Add values on bars
for bar, throughput in zip(bars2, throughputs):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width() / 2., height + height * 0.01,
             f'{throughput:.0f}', ha='center', va='bottom')

# Memory usage comparison
bars3 = ax3.bar(implementations, memory_usage, color=colors)
ax3.set_ylabel('Memory Allocated (GB)')
ax3.set_title('Memory Usage Comparison')
ax3.grid(True, alpha=0.3)
# Add values on bars
for bar, mem in zip(bars3, memory_usage):
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width() / 2., height + height * 0.01,
             f'{mem:.2f}GB', ha='center', va='bottom')

# Memory increase comparison
bars4 = ax4.bar(implementations, memory_increase, color=colors)
ax4.set_ylabel('Memory Increase (GB)')
ax4.set_title('Memory Increase Comparison')
ax4.grid(True, alpha=0.3)
# Add values on bars
for bar, mem_inc in zip(bars4, memory_increase):
    height = bar.get_height()
    ax4.text(bar.get_x() + bar.get_width() / 2., height + height * 0.01,
             f'{mem_inc:.3f}GB', ha='center', va='bottom')

plt.tight_layout()
plt.savefig('small_moe_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

# Print summary table
print("\n" + "=" * 60)
print("PERFORMANCE COMPARISON SUMMARY")
print("=" * 60)
print(f"{'Metric':<25} {'GPT-OSS':<15} {'MegaBlocks':<15} {'Winner':<10}")
print("-" * 60)

# Determine winners
latency_winner = "GPT-OSS" if times[0] < times[1] else "MegaBlocks"
throughput_winner = "GPT-OSS" if throughputs[0] > throughputs[1] else "MegaBlocks"
memory_winner = "GPT-OSS" if memory_usage[0] < memory_usage[1] else "MegaBlocks"
mem_inc_winner = "GPT-OSS" if memory_increase[0] < memory_increase[1] else "MegaBlocks"

print(f"{'Latency (ms)':<25} {times[0]:<15.2f} {times[1]:<15.2f} {latency_winner:<10}")
print(f"{'Throughput (tok/s)':<25} {throughputs[0]:<15.0f} {throughputs[1]:<15.0f} {throughput_winner:<10}")
print(f"{'Memory Usage (GB)':<25} {memory_usage[0]:<15.3f} {memory_usage[1]:<15.3f} {memory_winner:<10}")
print(f"{'Memory Increase (GB)':<25} {memory_increase[0]:<15.3f} {memory_increase[1]:<15.3f} {mem_inc_winner:<10}")

# Speed ratio
speed_ratio = times[1] / times[0] if times[0] < times[1] else times[0] / times[1]
faster_impl = latency_winner
print(f"\n{faster_impl} is {speed_ratio:.2f}x faster")

# Throughput ratio
throughput_ratio = max(throughputs) / min(throughputs)
higher_throughput = throughput_winner
print(f"{higher_throughput} has {throughput_ratio:.2f}x higher throughput")

print("=" * 60)
```
Conclusion
This focused benchmark compares the GPT-OSS (non-training mode) and MegaBlocks MoE implementations on the same hardware with identical weights and inputs. The comparison focuses on the following metrics (a short worked example of the throughput arithmetic follows the list):
- Latency: Average forward pass time
- Throughput: Tokens processed per second
- Memory Usage: GPU memory consumption
- Memory Efficiency: Memory increase during execution
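As a quick check on how the throughput figure relates to latency, the harness derives it directly from the fixed token count per forward pass (the latency below is hypothetical):

```python
BATCH_SIZE, SEQ_LEN = 8, 512
tokens_per_pass = BATCH_SIZE * SEQ_LEN                    # 4096 tokens per forward pass
avg_time_ms = 10.0                                        # hypothetical measured latency
tokens_per_sec = tokens_per_pass / (avg_time_ms / 1000.0)
print(f"{tokens_per_sec:,.0f} tokens/sec")                # 409,600 tokens/sec
```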
Both implementations use:
- 128 experts with top-4 routing
- 1152 hidden dimensions
- Batch size of 8, sequence length of 512
- bfloat16 precision
- Identical pre-generated weights for fair comparison
The results show the performance characteristics of each approach, helping identify the optimal implementation for different use cases.