diff --git a/.gitignore b/.gitignore index a1da56aa9..726bf6f9e 100644 --- a/.gitignore +++ b/.gitignore @@ -58,4 +58,5 @@ artifacts/ **/times.csv transformer_engine/build_info.txt transformer_engine/common/util/hip_nvml.* +.bench-results/ *.DS_Store diff --git a/benchmarks/microbench/README.md b/benchmarks/microbench/README.md new file mode 100644 index 000000000..fbf3bdf95 --- /dev/null +++ b/benchmarks/microbench/README.md @@ -0,0 +1,113 @@ +# Microbenchmarks for TransformerEngine + +GPU microbenchmarks driven by `driver.py`. Each `bench_*.py` file defines one +or more bench classes following an ASV-style API (`params`, `param_names`, +`time_*` methods, optional `work_*` companions). Timing uses +`torch.utils.benchmark.Timer` under the hood. The driver runs each suite +in-process and writes results as long-format CSV — one row per Timer block — +intended to be consumed by a separate analysis tool (statistical tests, +cross-run comparison). + +## Prerequisites + +- TransformerEngine must already be built and installed in the current Python environment. +- A ROCm or CUDA GPU must be available. + +## Running benchmarks + +Each `bench_*.py` file is directly executable, or you can drive them through +`driver.py`. Results are written by default to +`benchmarks/.bench-results/<machine>/<commit>.csv`. + +```bash +cd benchmarks/microbench +python driver.py --all # run every suite +python driver.py bench_gemm # run one suite via driver +python bench_gemm.py # run one suite directly +python bench_gemm.py time_forward # filter to method names containing this string +python bench_casting.py --no-csv # stdout only, don't write CSV +python bench_casting.py --csv out.csv # custom output path +``` + +## Output format + +Long-format CSV — one row per `torch.utils.benchmark` block. Default location +is `benchmarks/.bench-results/<machine>/<commit>.csv`; the +`.bench-results` tree is in `.gitignore`. Schema: + +| Column | Type | Description | +|---|---|---| +| `suite` | str | Module name (e.g.
`bench_gemm`) | +| `class` | str | Bench class name (e.g. `BenchGemm`) | +| `method` | str | Timed method (e.g. `time_forward`) | +| `params` | str | `k1=v1;k2=v2` canonical form for joining across runs | +| `sample_idx` | int | Block index within this Measurement | +| `time_s` | float | Per-call elapsed seconds (Timer normalizes by `number_per_run`) | +| `number_per_run` | int | Kernel invocations averaged into this row's `time_s` | +| `tflops` | float | Per-call throughput, empty if no `work_*` flops | +| `gbps` | float | Per-call bandwidth, empty if no `work_*` bytes | +| `commit` | str | Short git HEAD hash | +| `machine` | str | `platform.node()` | +| `started_at_ms` | int | Unix-ms timestamp when this method's run began | + +Per-PR comparison and statistical tests are handled by a separate analysis +tool (TBD) that reads two or more of these CSVs and joins on +`(suite, class, method, params)`. Note that `time_s` is a *block mean* — +the analysis tool should weight by `number_per_run` (or use blocks as +independent samples) when computing significance. + +## Writing new benchmarks + +Create a new file in `benchmarks/microbench/` following the naming convention `bench_<name>.py`. + +```python +#!/usr/bin/env python3 +import torch +import transformer_engine.pytorch as te + +from driver import time_func + + +class BenchSomething: + params = [[1024, 4096], ["config_a", "config_b"]] + param_names = ["M", "config"] + timeout = 300 # seconds, per parameter combination + + def setup(self, M, config): + # Allocate tensors, create modules. + # Runs once per (combo, method); same instance is reused for warmup + # and timed Timer blocks. + self.module = ... + self.x = ... + + def time_forward(self, M, config): + return time_func(lambda: self.module(self.x)) + + def time_forward_backward(self, M, config): + def fn(): + out = self.module(self.x) + out.backward(self.grad_out) + return time_func(fn) + + # Optional: define work_<name> to get throughput columns (TFLOPS / GB/s).
+ def work_forward(self, M, config): + return {"flops": 2 * M * self.N * self.K} # compute-bound + # return {"bytes": M * self.hidden * 4} # memory-bound + + +if __name__ == "__main__": + from driver import main + main(__file__) +``` + +Key rules: +- Method names starting with `time_` are automatically timed. +- `time_*` methods must return `time_func(fn)` — a `torch.utils.benchmark.Measurement`. +- Inside `fn`, do whatever per-call work you want measured. For backward, + let gradients accumulate in-place across iterations — Timer's repeated + invocations don't OOM (grads accumulate into the same tensor) and the + numerical correctness of accumulated grad doesn't affect timing. +- Optionally define `work_<name>` companions to get TFLOPS or GB/s columns. + Return per-call work; the driver derives per-sample throughput. +- The `params` list defines a cross-product; keep the matrix size reasonable. + diff --git a/benchmarks/microbench/__init__.py b/benchmarks/microbench/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/microbench/bench_attention.py b/benchmarks/microbench/bench_attention.py new file mode 100644 index 000000000..cbdb70ef5 --- /dev/null +++ b/benchmarks/microbench/bench_attention.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +""" +Attention micro-benchmark using te.DotProductAttention. + +Benchmarks fused multi-head attention (with flash attention backend) for +model configurations with grouped-query attention (GQA).
class BenchAttention:
    """Time ``te.DotProductAttention`` forward and forward+backward.

    The sweep is the cross-product of ``M_SIZES`` (used as the sequence
    length) and the model names in ``CONFIGS``. Head counts are divided by the
    config's ``tp`` value before the module and tensors are built.
    """

    params = [M_SIZES, list(CONFIGS)]
    param_names = ["seq_len", "model"]
    timeout = 300

    @staticmethod
    def _forward_flops(seq_len, model):
        """Return ``4 * BATCH * local_q_heads * seq_len**2 * head_dim``."""
        num_q, _num_kv, head_dim, tp = CONFIGS[model]
        local_q = num_q // tp
        return 4 * BATCH * local_q * seq_len * seq_len * head_dim

    def setup(self, seq_len, model):
        """Build the attention module, q/k/v inputs and the output gradient."""
        num_q, num_kv, head_dim, tp = CONFIGS[model]
        local_q = num_q // tp
        local_kv = num_kv // tp
        tensor_opts = {
            "dtype": torch.bfloat16,
            "device": "cuda",
            "requires_grad": True,
        }

        attn = te.DotProductAttention(
            num_attention_heads=local_q,
            kv_channels=head_dim,
            num_gqa_groups=local_kv,
            attn_mask_type="causal",
        )
        self.attn = attn.to(device="cuda", dtype=torch.bfloat16)

        # Tensors are laid out as (seq_len, BATCH, heads, head_dim).
        self.q = torch.randn(seq_len, BATCH, local_q, head_dim, **tensor_opts)
        self.k = torch.randn(seq_len, BATCH, local_kv, head_dim, **tensor_opts)
        self.v = torch.randn(seq_len, BATCH, local_kv, head_dim, **tensor_opts)

        # One untimed forward pass provides the output shape for grad_out.
        probe = self.attn(self.q, self.k, self.v)
        self.grad_out = torch.randn_like(probe)

    def work_forward(self, seq_len, model):
        """Per-call FLOPs for the forward pass."""
        return {"flops": self._forward_flops(seq_len, model)}

    def work_forward_backward(self, seq_len, model):
        """Per-call FLOPs for forward+backward, counted as 3x forward."""
        return {"flops": 3 * self._forward_flops(seq_len, model)}

    def time_forward(self, seq_len, model):
        """Measure one attention forward call."""
        def run_forward():
            return self.attn(self.q, self.k, self.v)

        return time_func(run_forward)

    def time_forward_backward(self, seq_len, model):
        """Measure a forward call followed by backward with ``grad_out``."""
        def run_step():
            result = self.attn(self.q, self.k, self.v)
            result.backward(self.grad_out)

        return time_func(run_step)
a/benchmarks/microbench/bench_casting.py b/benchmarks/microbench/bench_casting.py new file mode 100644 index 000000000..1b9e9f9ee --- /dev/null +++ b/benchmarks/microbench/bench_casting.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +""" +Benchmarks quantization (BF16 -> FP8) and dequantization (FP8 -> BF16) for +both E4M3 (activations/weights) and E5M2 (gradients) formats. + +Shapes are (M, hidden_size) matching activation tensors from the shared +dense-model configs. These casts are memory-bound; we report GB/s. +""" + +import torch +from transformer_engine.pytorch import Float8CurrentScalingQuantizer +from transformer_engine_torch import DType as TE_DType + +from driver import time_func +from shapes import M_SIZES, hidden_sizes + +# Default to the shared per-architecture hidden sizes; mutate to add custom +# entries (e.g. HIDDEN_SIZES["MyModel"] = 5120). 
class BenchCasting:
    """Time BF16 -> FP8 quantize and FP8 -> BF16 dequantize casts.

    The sweep is ``M_SIZES`` x model names in ``HIDDEN_SIZES`` x cast names in
    ``CAST_CONFIGS``; each cast name maps to a direction and an FP8 dtype.
    """

    params = [M_SIZES, list(HIDDEN_SIZES), list(CAST_CONFIGS)]
    param_names = ["M", "model", "cast"]
    timeout = 120

    def setup(self, M, model, cast):
        """Create the quantizer and the input tensor for the chosen cast."""
        width = HIDDEN_SIZES[model]
        self.direction, fp8_dtype = CAST_CONFIGS[cast]
        self.quantizer = Float8CurrentScalingQuantizer(
            fp8_dtype=fp8_dtype,
            device=torch.device("cuda"),
            rowwise=True,
            columnwise=False,
        )
        source = torch.randn(M, width, dtype=torch.bfloat16, device="cuda")
        # Dequantize benchmarks start from an already-quantized tensor, so the
        # quantize step is paid here rather than inside the timed call.
        if self.direction == "dequantize":
            self.x = self.quantizer.quantize(source)
        else:
            self.x = source

    def work_cast(self, M, model, cast):
        """Per-call bytes moved, approximated as 3 bytes per element.

        The estimate is the same for both directions: 2 bytes on the BF16
        side plus 1 byte on the FP8 side.
        """
        return {"bytes": M * HIDDEN_SIZES[model] * 3}

    def time_cast(self, M, model, cast):
        """Measure the cast selected in ``setup``."""
        if self.direction != "quantize":
            return time_func(lambda: self.x.dequantize(dtype=torch.bfloat16))
        return time_func(lambda: self.quantizer.quantize(self.x))
class BenchGemm:
    """Time BF16 ``te.Linear`` forward and forward+backward.

    The sweep is ``M_SIZES`` x the shape names in ``SHAPES``; each shape maps
    to ``(N, K)`` and the layer is built as ``te.Linear(K, N, bias=False)``.
    """

    params = [M_SIZES, list(SHAPES)]
    param_names = ["M", "shape"]
    timeout = 300

    @staticmethod
    def _forward_flops(M, shape):
        """Return ``2 * M * N * K`` for the named shape."""
        out_features, in_features = SHAPES[shape]
        return 2 * M * out_features * in_features

    def setup(self, M, shape):
        """Build the layer, an ``(M, K)`` input and a matching output grad."""
        out_features, in_features = SHAPES[shape]
        bf16 = torch.bfloat16
        layer = te.Linear(in_features, out_features, bias=False)
        self.linear = layer.to(device="cuda", dtype=bf16)
        self.x = torch.randn(
            M, in_features, dtype=bf16, device="cuda", requires_grad=True
        )
        # One untimed forward pass provides the output shape for grad_out.
        probe = self.linear(self.x)
        self.grad_out = torch.randn_like(probe)

    def work_forward(self, M, shape):
        """Per-call FLOPs for the forward GEMM."""
        return {"flops": self._forward_flops(M, shape)}

    def work_forward_backward(self, M, shape):
        """Per-call FLOPs for forward+backward, counted as 3x forward."""
        return {"flops": 3 * self._forward_flops(M, shape)}

    def time_forward(self, M, shape):
        """Measure one forward call of the linear layer."""
        def run_forward():
            return self.linear(self.x)

        return time_func(run_forward)

    def time_forward_backward(self, M, shape):
        """Measure a forward call followed by backward with ``grad_out``."""
        def run_step():
            result = self.linear(self.x)
            result.backward(self.grad_out)

        return time_func(run_step)
class BenchGemmFP8:
    """Time ``te.Linear`` under ``te.fp8_autocast`` with ``FP8_RECIPE``.

    The sweep is ``M_SIZES`` x the shape names in ``SHAPES``; each shape maps
    to ``(N, K)`` and the layer is built as ``te.Linear(K, N, bias=False)``.
    """

    params = [M_SIZES, list(SHAPES)]
    param_names = ["M", "shape"]
    timeout = 300

    @staticmethod
    def _forward_flops(M, shape):
        """Return ``2 * M * N * K`` for the named shape."""
        out_features, in_features = SHAPES[shape]
        return 2 * M * out_features * in_features

    def setup(self, M, shape):
        """Build the layer, an ``(M, K)`` input and an ``(M, N)`` output grad."""
        out_features, in_features = SHAPES[shape]
        bf16 = torch.bfloat16
        layer = te.Linear(in_features, out_features, bias=False)
        self.linear = layer.to(device="cuda", dtype=bf16)
        self.x = torch.randn(
            M, in_features, dtype=bf16, device="cuda", requires_grad=True
        )
        # grad_out is sized directly from (M, N); no forward pass runs here.
        self.grad_out = torch.randn(M, out_features, dtype=bf16, device="cuda")

    def work_forward(self, M, shape):
        """Per-call FLOPs for the forward GEMM."""
        return {"flops": self._forward_flops(M, shape)}

    def work_forward_backward(self, M, shape):
        """Per-call FLOPs for forward+backward, counted as 3x forward."""
        return {"flops": 3 * self._forward_flops(M, shape)}

    def time_forward(self, M, shape):
        """Measure one forward call inside the FP8 autocast context."""
        def run_forward():
            with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
                self.linear(self.x)

        return time_func(run_forward)

    def time_forward_backward(self, M, shape):
        """Measure an FP8-autocast forward followed by backward."""
        def run_step():
            with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
                result = self.linear(self.x)
            # Backward is issued after the autocast block has exited.
            result.backward(self.grad_out)

        return time_func(run_step)
class BenchGroupedGemm:
    """Time ``te.GroupedLinear`` forward and forward+backward.

    The sweep is ``M_SIZES`` x the config names in ``CONFIGS``; each config
    maps to ``(B, N, K)`` and the module is built with ``num_gemms=B``,
    ``in_features=K`` and ``out_features=N``. Every group receives ``M`` rows.
    """

    params = [M_SIZES, list(CONFIGS)]
    param_names = ["M", "config"]
    timeout = 300

    def setup(self, M, config):
        """Build the module, the packed input and the output gradient.

        ``GroupedLinear.forward`` takes one tensor holding the rows of all
        groups back to back, plus ``m_splits`` giving the row count of each
        group. The input is therefore a single ``(B * M, K)`` tensor with
        ``m_splits = [M] * B`` rather than a list of ``B`` tensors.
        """
        B, N, K = CONFIGS[config]
        dtype = torch.bfloat16

        self.module = te.GroupedLinear(
            num_gemms=B, in_features=K, out_features=N, bias=False,
        ).to(device="cuda", dtype=dtype)

        self.m_splits = [M] * B
        self.x = torch.randn(
            B * M, K, dtype=dtype, device="cuda", requires_grad=True
        )
        # One untimed forward pass provides the output shape for grad_out.
        probe = self.module(self.x, self.m_splits)
        self.grad_out = torch.randn_like(probe)

    def work_forward(self, M, config):
        """Per-call forward FLOPs: ``B`` GEMMs of ``2 * M * N * K`` each."""
        B, N, K = CONFIGS[config]
        return {"flops": B * 2 * M * N * K}

    def work_forward_backward(self, M, config):
        """Per-call FLOPs for forward+backward, counted as 3x forward."""
        B, N, K = CONFIGS[config]
        return {"flops": B * 3 * 2 * M * N * K}

    def time_forward(self, M, config):
        """Measure one grouped forward call."""
        return time_func(lambda: self.module(self.x, self.m_splits))

    def time_forward_backward(self, M, config):
        """Measure a grouped forward call followed by backward."""
        def fn():
            out = self.module(self.x, self.m_splits)
            out.backward(self.grad_out)

        return time_func(fn)
a/benchmarks/microbench/bench_normalization.py b/benchmarks/microbench/bench_normalization.py new file mode 100644 index 000000000..ec8f9e9ef --- /dev/null +++ b/benchmarks/microbench/bench_normalization.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 +############################################################################### +# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved. +# +# See LICENSE for license information. +############################################################################### +""" +RMSNorm and LayerNorm benchmarks on activation-sized tensors. + +Modern models predominantly use RMSNorm, but we benchmark both since TE +supports both and they share the same kernel infrastructure. + +The M dimension (batch * seq_len) is swept across typical training sizes; +hidden sizes are derived from the shared dense-model configs. +""" + +import torch +import transformer_engine.pytorch as te + +from driver import time_func +from shapes import M_SIZES, hidden_sizes + +NORMS = {"RMSNorm": te.RMSNorm, "LayerNorm": te.LayerNorm} + +# Sweep unique hidden sizes from the shared dense-model configs; replace or +# extend (e.g. HIDDEN_SIZES.append(2048)) to add custom shapes. 
class BenchNormalization:
    """Time the TE normalization modules listed in ``NORMS``.

    The sweep is ``M_SIZES`` x ``HIDDEN_SIZES`` x the names in ``NORMS``; the
    input is an ``(M, hidden)`` bfloat16 tensor.
    """

    params = [M_SIZES, HIDDEN_SIZES, list(NORMS)]
    param_names = ["M", "hidden", "norm_type"]
    timeout = 120

    def setup(self, M, hidden, norm_type):
        """Build the norm module, its input and a matching output gradient."""
        bf16 = torch.bfloat16
        norm_cls = NORMS[norm_type]
        self.norm = norm_cls(hidden).to(device="cuda", dtype=bf16)
        self.x = torch.randn(
            M, hidden, dtype=bf16, device="cuda", requires_grad=True
        )
        # One untimed forward pass provides the output shape for grad_out.
        probe = self.norm(self.x)
        self.grad_out = torch.randn_like(probe)

    def work_forward(self, M, hidden, norm_type):
        """Per-call bytes for forward: 2 B read + 2 B written per element."""
        bytes_per_element = 4
        return {"bytes": M * hidden * bytes_per_element}

    def work_forward_backward(self, M, hidden, norm_type):
        """Per-call bytes for forward+backward: 4 B fwd + 6 B bwd per element."""
        bytes_per_element = 10
        return {"bytes": M * hidden * bytes_per_element}

    def time_forward(self, M, hidden, norm_type):
        """Measure one forward call of the norm module."""
        def run_forward():
            return self.norm(self.x)

        return time_func(run_forward)

    def time_forward_backward(self, M, hidden, norm_type):
        """Measure a forward call followed by backward with ``grad_out``."""
        def run_step():
            result = self.norm(self.x)
            result.backward(self.grad_out)

        return time_func(run_step)
KEY_COLS = ("suite", "class", "method", "params")


def load(path, metric):
    """Group the *metric* column of a driver CSV by ``KEY_COLS``.

    Args:
        path: CSV file to read.
        metric: Name of the column to collect (e.g. ``"time_s"``).

    Returns:
        A mapping ``{key: [float, ...]}`` where ``key`` is the tuple of the
        row's ``KEY_COLS`` values. Only keys with at least one usable value
        appear in the result.

    Rows are skipped silently when the metric cell is empty or absent (e.g.
    tflops/gbps when no ``work_*`` is defined), when it is not numeric, or
    when a key column is missing.
    """
    groups = defaultdict(list)
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            cell = row.get(metric)
            if cell is None or cell == "":
                continue
            try:
                # Parse before touching ``groups``: indexing a defaultdict
                # creates the entry, which would leave an empty phantom group
                # behind whenever float() rejects the cell.
                key = tuple(row[k] for k in KEY_COLS)
                value = float(cell)
            except (ValueError, KeyError):
                continue
            groups[key].append(value)
    return groups


def summarize(samples):
    """Return ``(median, mean, n)`` for *samples*, or ``None`` if empty."""
    if not samples:
        return None
    return statistics.median(samples), statistics.fmean(samples), len(samples)


def main(argv=None):
    """Compare two driver CSVs and print a per-group speedup table.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the command-line behaviour.

    Returns:
        ``0``; the caller passes it to ``sys.exit``.

    Groups present in only one file are listed after the table. Matched
    groups below ``--min-samples`` or with a non-positive median are left
    out of the table.
    """
    p = argparse.ArgumentParser(description=__doc__,
                                formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("baseline_csv")
    p.add_argument("candidate_csv")
    p.add_argument("--metric", default="time_s", choices=["time_s", "tflops", "gbps"],
                   help="column to compare (default: time_s)")
    p.add_argument("--sort", default="suite", choices=["suite", "speedup", "abs_change"],
                   help="row order (default: suite)")
    p.add_argument("--top", type=int, default=None,
                   help="show only the top N rows after sorting")
    p.add_argument("--min-samples", type=int, default=1,
                   help="skip groups with fewer than this many samples in either CSV")
    args = p.parse_args(argv)

    base = load(args.baseline_csv, args.metric)
    cand = load(args.candidate_csv, args.metric)

    # For time: lower is better, so speedup = base / cand.
    # For tflops/gbps: higher is better, so speedup = cand / base.
    lower_is_better = (args.metric == "time_s")

    rows = []
    for key in sorted(base.keys() | cand.keys()):
        b = summarize(base.get(key, []))
        c = summarize(cand.get(key, []))
        if b is None or c is None:
            rows.append({"key": key,
                         "status": "baseline-only" if c is None else "candidate-only",
                         "b": b, "c": c, "speedup": None})
            continue
        if b[2] < args.min_samples or c[2] < args.min_samples:
            continue
        b_med, c_med = b[0], c[0]
        if b_med <= 0 or c_med <= 0:
            # A ratio is meaningless for non-positive medians.
            speedup = None
        else:
            speedup = (b_med / c_med) if lower_is_better else (c_med / b_med)
        rows.append({"key": key, "status": "matched", "b": b, "c": c, "speedup": speedup})

    matched = [r for r in rows if r["status"] == "matched" and r["speedup"] is not None]
    only_b = [r for r in rows if r["status"] == "baseline-only"]
    only_c = [r for r in rows if r["status"] == "candidate-only"]

    # ``rows`` is already in key order, which is the "suite" ordering.
    if args.sort == "speedup":
        matched.sort(key=lambda r: r["speedup"])
    elif args.sort == "abs_change":
        matched.sort(key=lambda r: -abs(r["speedup"] - 1.0))
    if args.top is not None:
        matched = matched[:args.top]

    unit = "ms" if args.metric == "time_s" else args.metric
    scale = 1e3 if args.metric == "time_s" else 1.0
    print(f"{'suite':<22} {'class':<22} {'method':<22} {'params':<48} "
          f"{'base ' + unit:>12} {'cand ' + unit:>12} {'speedup':>9} n_b/n_c")
    print("-" * 160)
    for r in matched:
        s, cls, m, params = r["key"]
        b_med, c_med = r["b"][0] * scale, r["c"][0] * scale
        n_b, n_c = r["b"][2], r["c"][2]
        print(f"{s:<22} {cls:<22} {m:<22} {params:<48} "
              f"{b_med:>12.4f} {c_med:>12.4f} {r['speedup']:>8.3f}x {n_b}/{n_c}")

    if matched:
        speedups = [r["speedup"] for r in matched]
        print()
        print(f"{len(matched)} matched groups: "
              f"median {statistics.median(speedups):.3f}x "
              f"min {min(speedups):.3f}x "
              f"max {max(speedups):.3f}x")

    for label, rows_list in [("baseline only", only_b), ("candidate only", only_c)]:
        if not rows_list:
            continue
        print(f"\n{len(rows_list)} groups {label}:")
        for r in rows_list:
            print(" " + " | ".join(r["key"]))

    return 0


if __name__ == "__main__":
    sys.exit(main())
def time_func(fn, min_run_time=1.0):
    """Time *fn* with ``torch.utils.benchmark.Timer.blocked_autorange``.

    Args:
        fn: Zero-argument callable to measure.
        min_run_time: Forwarded to ``blocked_autorange``.

    Returns:
        The measurement object returned by ``blocked_autorange``.
    """
    timer = benchmark.Timer(stmt="fn()", globals={"fn": fn})
    return timer.blocked_autorange(min_run_time=min_run_time)


def _commit():
    """Return the short git HEAD hash, or ``"unknown"`` if git fails.

    git runs in the current working directory.
    """
    try:
        return subprocess.check_output(
            ["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.DEVNULL
        ).decode().strip()
    except (OSError, subprocess.SubprocessError):
        # Best effort: git missing, not a repository, or the command failed.
        return "unknown"


CSV_COLUMNS = [
    "suite", "class", "method", "params", "sample_idx", "time_s",
    "number_per_run", "tflops", "gbps", "commit", "machine", "started_at_ms",
]


def _default_csv_path(script_dir):
    """Return ``<repo>/benchmarks/.bench-results/<machine>/<commit>.csv``.

    ``<repo>`` is taken to be two directories above *script_dir*.
    """
    repo_root = os.path.abspath(os.path.join(script_dir, "..", ".."))
    return os.path.join(repo_root, "benchmarks", ".bench-results",
                        platform.node() or "unknown", f"{_commit()}.csv")


def run_class(suite, cls, class_name, method_filter, commit, machine):
    """Run all ``time_*`` methods of *cls* over the param cross-product.

    Args:
        suite: Module name recorded in the ``suite`` column.
        cls: Bench class. ``params`` is a list of value lists and
            ``param_names`` the matching names; a class without ``params``
            runs each method once with no arguments.
        class_name: Name recorded in the ``class`` column.
        method_filter: If truthy, only methods whose name contains this
            string run.
        commit: Value for the ``commit`` column.
        machine: Value for the ``machine`` column.

    Returns:
        A list of row dicts keyed by ``CSV_COLUMNS``, one per entry of the
        measurement's ``times``.

    A fresh instance is created for every (method, combo) pair. ``setup`` is
    called when the class defines it. Any exception from ``setup`` or the
    timed method prints a SKIP line and moves on to the next combination.
    The class ``timeout`` attribute is not consulted here.
    """
    methods = sorted(m for m in dir(cls) if m.startswith("time_")
                     and (not method_filter or method_filter in m))
    if not methods:
        return []

    param_names = getattr(cls, "param_names", [])
    # Default to no parameter lists: product() of nothing yields one empty
    # combo, so unparameterised classes still run once. A default of [[]]
    # would yield zero combos and the class would silently never run.
    combos = list(itertools.product(*getattr(cls, "params", [])))

    print(f"\n{class_name} ({len(combos)} combos x {len(methods)} methods)")
    hdr = (f" {'median':>10} {'mean':>10} {'iqr':>10} "
           f"{'TFLOPS':>8} {'GB/s':>8} {'method':<28} params")
    rule = "-" * len(hdr)
    print(rule)
    print(hdr)
    print(rule)

    def fmt(val):
        return f"{val:>8.1f}" if val else f"{'':>8}"

    rows = []
    for method_name in methods:
        started_at_ms = int(time.time() * 1000)
        for combo in combos:
            label = ", ".join(f"{n}={v}" for n, v in zip(param_names, combo))
            params_str = ";".join(f"{n}={v}" for n, v in zip(param_names, combo))
            inst = cls()
            try:
                setup = getattr(inst, "setup", None)
                if setup is not None:
                    setup(*combo)
                m = getattr(inst, method_name)(*combo)
            except Exception as e:
                # One failing combination must not abort the whole sweep.
                print(f" SKIP {label} {method_name}: {e}")
                continue

            # "time_forward" pairs with "work_forward" (strip the "time_").
            wfn = getattr(inst, "work_" + method_name[5:], None)
            work = (wfn(*combo) if wfn else None) or {}
            flops, byts = work.get("flops"), work.get("bytes")

            for i, t in enumerate(m.times):
                rows.append({
                    "suite": suite, "class": class_name, "method": method_name,
                    "params": params_str, "sample_idx": i, "time_s": t,
                    "number_per_run": m.number_per_run,
                    "tflops": flops / t / 1e12 if flops and t > 0 else "",
                    "gbps": byts / t / 1e9 if byts and t > 0 else "",
                    "commit": commit, "machine": machine,
                    "started_at_ms": started_at_ms,
                })

            tflops = flops / m.median / 1e12 if flops and m.median > 0 else None
            gbps = byts / m.median / 1e9 if byts and m.median > 0 else None
            print(f" {m.median*1000:>8.3f}ms {m.mean*1000:>8.3f}ms "
                  f"{m.iqr*1000:>8.3f}ms {fmt(tflops)} {fmt(gbps)} "
                  f"{method_name:<28} {label}")
    return rows


def main(caller_file=None, argv=None):
    """Parse arguments, run the selected suites and write the CSV.

    Args:
        caller_file: ``__file__`` of a bench module when that module is run
            directly; the suite is then that file. ``None`` means the driver
            itself was invoked and the suite comes from the command line.
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Side effects: changes the working directory to the script directory,
    prepends it to ``sys.path``, imports each suite module, and writes the
    CSV unless ``--no-csv`` is given or no rows were produced.
    """
    parser = argparse.ArgumentParser(
        description="Run microbenchmarks via torch.utils.benchmark.")
    if caller_file is None:
        parser.add_argument("suite", nargs="?",
                            help="bench module (e.g. bench_gemm)")
        parser.add_argument("--all", action="store_true",
                            help="run all bench_*.py in this directory")
    parser.add_argument("method_filter", nargs="?", default=None,
                        help="only run time_* methods containing this string")
    parser.add_argument("--csv", default=None, metavar="FILE",
                        help="output CSV path "
                             "(default: benchmarks/.bench-results/<machine>/<commit>.csv)")
    parser.add_argument("--no-csv", action="store_true",
                        help="don't write CSV (stdout summary only)")
    args = parser.parse_args(argv)

    # Resolve a user-supplied path now, before the chdir below, so a relative
    # --csv is interpreted against the directory the user invoked us from.
    csv_path = os.path.abspath(args.csv) if args.csv else None

    method_filter = args.method_filter
    if caller_file is not None:
        script_dir = os.path.dirname(os.path.abspath(caller_file))
        suites = [os.path.splitext(os.path.basename(caller_file))[0]]
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        if args.all:
            suites = sorted(os.path.splitext(os.path.basename(f))[0]
                            for f in glob.glob(os.path.join(script_dir, "bench_*.py")))
            # With --all, argparse binds a lone positional to ``suite``;
            # treat it as the method filter instead of dropping it.
            if method_filter is None and args.suite is not None:
                method_filter = args.suite
        elif args.suite:
            suites = [args.suite]
        else:
            parser.error("provide a suite name or use --all")

    os.chdir(script_dir)
    if script_dir not in sys.path:
        sys.path.insert(0, script_dir)

    commit, machine = _commit(), platform.node() or "unknown"
    rows = []
    for suite in suites:
        mod = importlib.import_module(suite)
        for name in sorted(dir(mod)):
            obj = getattr(mod, name)
            if isinstance(obj, type) and name.startswith("Bench"):
                rows.extend(run_class(suite, obj, name, method_filter,
                                      commit, machine))

    if rows and not args.no_csv:
        path = csv_path or _default_csv_path(script_dir)
        os.makedirs(os.path.dirname(os.path.abspath(path)) or ".", exist_ok=True)
        with open(path, "w", newline="") as f:
            w = csv.DictWriter(f, fieldnames=CSV_COLUMNS, extrasaction="ignore")
            w.writeheader()
            w.writerows(rows)
        print(f"\nWrote {len(rows)} rows to {path}")


if __name__ == "__main__":
    main()
def gemm_shapes(models=None):
    """Per-projection ``(N, K)`` shapes derived from dense transformer configs.

    Args:
        models: Mapping ``{name: (hidden, intermediate, num_q_heads,
            num_kv_heads, head_dim, tp)}``. ``None`` selects ``DENSE_MODELS``;
            an explicitly empty mapping yields an empty result.

    Returns:
        A new dict ``{"<model>-QKV": (N, K), "<model>-AttnOut": (N, K),
        "<model>-GateUp": (N, K), "<model>-Down": (N, K)}`` with one group of
        four entries per model. Sizes split across ``tp`` use floor division.
    """
    if models is None:
        models = DENSE_MODELS
    shapes = {}
    for name, (h, inter, nq, nkv, hd, tp) in models.items():
        shapes[f"{name}-QKV"] = ((nq * hd + 2 * nkv * hd) // tp, h)
        shapes[f"{name}-AttnOut"] = (h, (nq * hd) // tp)
        shapes[f"{name}-GateUp"] = ((2 * inter) // tp, h)
        shapes[f"{name}-Down"] = (h, inter // tp)
    return shapes


def attention_configs(models=None):
    """Per-model attention configs as ``(num_q_heads, num_kv_heads, head_dim, tp)``.

    Args:
        models: Same mapping as for ``gemm_shapes``. ``None`` selects
            ``DENSE_MODELS``.

    Returns:
        A new dict keyed by model name.
    """
    if models is None:
        models = DENSE_MODELS
    return {
        name: (nq, nkv, hd, tp)
        for name, (_h, _i, nq, nkv, hd, tp) in models.items()
    }


def hidden_sizes(models=None):
    """Per-architecture ``{name: hidden}`` mapping, deduplicated across TP variants.

    The architecture name is the part of the model name before ``"_TP"``.
    When several variants share an architecture name, the first one seen
    supplies the hidden size.

    Args:
        models: Same mapping as for ``gemm_shapes``. ``None`` selects
            ``DENSE_MODELS``.
    """
    if models is None:
        models = DENSE_MODELS
    out = {}
    for name, (h, *_) in models.items():
        base = name.split("_TP")[0]
        out.setdefault(base, h)
    return out


def grouped_gemm_configs(models=None, ep_sizes=None):
    """Grouped GEMM ``(B, N, K)`` configs for MoE GateUp + Down projections.

    ``B = n_routed_experts // ep`` for each EP that divides ``n_routed_experts``;
    other EPs are silently skipped.

    Args:
        models: Mapping ``{name: (n_routed_experts, moe_intermediate_size,
            hidden_size)}``. ``None`` selects ``MOE_MODELS``.
        ep_sizes: Iterable of expert-parallel sizes. ``None`` selects
            ``EP_SIZES``; an explicitly empty sequence yields no configs.

    Returns:
        A new dict with ``"<model>_EP<ep>-GateUp"`` and
        ``"<model>_EP<ep>-Down"`` entries.
    """
    if models is None:
        models = MOE_MODELS
    if ep_sizes is None:
        ep_sizes = EP_SIZES
    configs = {}
    for name, (n_experts, inter, hidden) in models.items():
        for ep in ep_sizes:
            if n_experts % ep != 0:
                continue
            B = n_experts // ep
            configs[f"{name}_EP{ep}-GateUp"] = (B, 2 * inter, hidden)
            configs[f"{name}_EP{ep}-Down"] = (B, hidden, inter)
    return configs