ROCm · Micky774 · Mar 16, 2026 · Mar 17, 2026 · Mar 18, 2026 · Mar 19, 2026
@@ -58,4 +58,5 @@ artifacts/
 **/times.csv
 transformer_engine/build_info.txt
 transformer_engine/common/util/hip_nvml.*
+.bench-results/
 *.DS_Store
@@ -0,0 +1,113 @@
+# Microbenchmarks for TransformerEngine
+
+GPU microbenchmarks driven by `driver.py`. Each `bench_*.py` file defines one
+or more bench classes following an ASV-style API (`params`, `param_names`,
+`time_*` methods, optional `work_*` companions). Timing uses
+`torch.utils.benchmark.Timer` under the hood. The driver runs each suite
+in-process and writes results as long-format CSV — one row per Timer block —
+intended to be consumed by a separate analysis tool (statistical tests,
+cross-run comparison).
+
+## Prerequisites
+
+- TransformerEngine must already be built and installed in the current Python environment.
+- A ROCm or CUDA GPU must be available.
+
+## Running benchmarks
+
+Each `bench_*.py` file is directly executable, or you can drive them through
+`driver.py`. Results are written by default to
+`benchmarks/.bench-results/<machine>/<commit-short>.csv`.
+
+```bash
+cd benchmarks/microbench
+python driver.py --all                      # run every suite
+python driver.py bench_gemm                 # run one suite via driver
+python bench_gemm.py                        # run one suite directly
+python bench_gemm.py time_forward           # filter to method names containing this string
+python bench_casting.py --no-csv            # stdout only, don't write CSV
+python bench_casting.py --csv out.csv       # custom output path
+```
+
+## Output format
+
+Long-format CSV — one row per `torch.utils.benchmark` block. Default location
+is `benchmarks/.bench-results/<machine>/<commit-short>.csv`; the
+`.bench-results` tree is in `.gitignore`. Schema:
+
+| Column | Type | Description |
+|---|---|---|
+| `suite` | str | Module name (e.g. `bench_gemm`) |
+| `class` | str | Bench class name (e.g. `BenchGemm`) |
+| `method` | str | Timed method (e.g. `time_forward`) |
+| `params` | str | `k1=v1;k2=v2` canonical form for joining across runs |
+| `sample_idx` | int | Block index within this Measurement |
+| `time_s` | float | Per-call elapsed seconds (Timer normalizes by `number_per_run`) |
+| `number_per_run` | int | Kernel invocations averaged into this row's `time_s` |
+| `tflops` | float | Per-call throughput, empty if no `work_*` flops |
+| `gbps` | float | Per-call bandwidth, empty if no `work_*` bytes |
+| `commit` | str | Short git HEAD hash |
+| `machine` | str | `platform.node()` |
+| `started_at_ms` | int | Unix-ms timestamp when this method's run began |
+
+Per-PR comparison and statistical tests are handled by a separate analysis
+tool (TBD) that reads two or more of these CSVs and joins on
+`(suite, class, method, params)`. Note that `time_s` is a *block mean* —
+the analysis tool should weight by `number_per_run` (or use blocks as
+independent samples) when computing significance.
+
+## Writing new benchmarks
+
+Create a new file in `benchmarks/microbench/` following the naming convention `bench_<name>.py`.
+
+```python
+#!/usr/bin/env python3
+import torch
+import transformer_engine.pytorch as te
+
+from driver import time_func
+
+
+class BenchSomething:
+    params = [[1024, 4096], ["config_a", "config_b"]]
+    param_names = ["M", "config"]
+    timeout = 300  # seconds, per parameter combination
+
+    def setup(self, M, config):
+        # Allocate tensors, create modules.
+        # Runs once per (combo, method); same instance is reused for warmup
+        # and timed Timer blocks.
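+        self.module = ...
+        self.x = ...
+        self.grad_out = ...  # read by time_forward_backward below
+        # Also store anything the work_* methods read (e.g. self.N, self.K).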
+
+    def time_forward(self, M, config):
+        return time_func(lambda: self.module(self.x))
+
+    def time_forward_backward(self, M, config):
+        def fn():
+            out = self.module(self.x)
+            out.backward(self.grad_out)
+        return time_func(fn)
+
+    # Optional: define work_<name> to get throughput columns (TFLOPS / GB/s).
+    def work_forward(self, M, config):
+        return {"flops": 2 * M * self.N * self.K}   # compute-bound
+        # return {"bytes": M * self.hidden * 4}     # memory-bound
+
+
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)
+```
+
+Key rules:
+- Method names starting with `time_` are automatically timed.
+- `time_*` methods must return `time_func(fn)` — a `torch.utils.benchmark.Measurement`.
+- Inside `fn`, do whatever per-call work you want measured. For backward, it
+  is fine to let gradients accumulate across iterations: they are added into
+  the same tensor, so Timer's repeated invocations don't OOM, and the
+  numerical correctness of the accumulated grad doesn't affect timing.
+- Optionally define `work_<name>` companions to get TFLOPS or GB/s columns.
+  Return per-call work; the driver derives per-sample throughput.
+- The `params` list defines a cross-product; keep the matrix size reasonable.
+
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+Attention micro-benchmark using te.DotProductAttention.
+
+Benchmarks fused multi-head attention (with flash attention backend) for
+model configurations with grouped-query attention (GQA).
+
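+Forward FLOPs = 4 * batch * num_q_heads * seq_len^2 * head_dim
+  (two matmuls: Q@K^T and attn@V, each contributing 2*b*h*s^2*d; the full
+  seq_len x seq_len score matrix is counted, i.e. the causal mask used by
+  this benchmark is not discounted)
+Backward FLOPs = 2 * Forward FLOPs (approximately)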
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import time_func
+from shapes import M_SIZES, attention_configs
+
+# Batch size: second dimension of every q/k/v tensor built in setup() and a
+# factor in the reported FLOP counts.
+BATCH = 2
+
+# Maps a model name to a (q_heads, kv_heads, head_dim, tp) tuple.
+# Default to the shared dense-model configs; mutate this dict to add custom
+# attention shapes (e.g. CONFIGS["MyModel"] = (qh, kvh, head_dim, tp)).
+# NOTE: BenchAttention.params snapshots the keys with list(CONFIGS) when the
+# class body runs, so entries must be added before that point to be benchmarked.
+CONFIGS = attention_configs()
+
+
+class BenchAttention:
+    params = [M_SIZES, list(CONFIGS)]
+    param_names = ["seq_len", "model"]
+    timeout = 300
+
+    def setup(self, seq_len, model):
+        n_q, n_kv, hd, tp = CONFIGS[model]
+        qh, kvh = n_q // tp, n_kv // tp
+        dtype = torch.bfloat16
+
+        self.attn = te.DotProductAttention(
+            num_attention_heads=qh, kv_channels=hd,
+            num_gqa_groups=kvh, attn_mask_type="causal",
+        ).to(device="cuda", dtype=dtype)
+
+        self.q = torch.randn(seq_len, BATCH, qh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.k = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.v = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.attn(self.q, self.k, self.v))
+
+    def work_forward(self, seq_len, model):
+        n_q, _n_kv, hd, tp = CONFIGS[model]
+        qh = n_q // tp
+        return {"flops": 4 * BATCH * qh * seq_len * seq_len * hd}
+
+    def work_forward_backward(self, seq_len, model):
+        n_q, _n_kv, hd, tp = CONFIGS[model]
+        qh = n_q // tp
+        return {"flops": 3 * 4 * BATCH * qh * seq_len * seq_len * hd}
+
+    def time_forward(self, seq_len, model):
+        return time_func(lambda: self.attn(self.q, self.k, self.v))
+
+    def time_forward_backward(self, seq_len, model):
+        def fn():
+            out = self.attn(self.q, self.k, self.v)
+            out.backward(self.grad_out)
+        return time_func(fn)
+
+
+# Script entry point: when this file is executed directly, hand its path to
+# the driver's main(). The import is local to this branch.
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+Benchmarks quantization (BF16 -> FP8) and dequantization (FP8 -> BF16) for
+both E4M3 (activations/weights) and E5M2 (gradients) formats.
+
+Shapes are (M, hidden_size) matching activation tensors from the shared
+dense-model configs. These casts are memory-bound; we report GB/s.
+"""
+
+import torch
+from transformer_engine.pytorch import Float8CurrentScalingQuantizer
+from transformer_engine_torch import DType as TE_DType
+
+from driver import time_func
+from shapes import M_SIZES, hidden_sizes
+
+# Maps a model name to its hidden size (second dimension of the cast tensor).
+# Default to the shared per-architecture hidden sizes; mutate to add custom
+# entries (e.g. HIDDEN_SIZES["MyModel"] = 5120).
+# NOTE: BenchCasting.params snapshots the keys with list(HIDDEN_SIZES) when
+# the class body runs, so entries must be added before that point.
+HIDDEN_SIZES = hidden_sizes()
+
+# Maps a cast name to (direction, FP8 dtype). direction is "quantize" or
+# "dequantize" and selects which operation BenchCasting.time_cast measures.
+CAST_CONFIGS = {
+    "BF16_to_E4M3": ("quantize", TE_DType.kFloat8E4M3),
+    "E4M3_to_BF16": ("dequantize", TE_DType.kFloat8E4M3),
+    "BF16_to_E5M2": ("quantize", TE_DType.kFloat8E5M2),
+    "E5M2_to_BF16": ("dequantize", TE_DType.kFloat8E5M2),
+}
+
+
+class BenchCasting:
+    params = [M_SIZES, list(HIDDEN_SIZES), list(CAST_CONFIGS)]
+    param_names = ["M", "model", "cast"]
+    timeout = 120
+
+    def setup(self, M, model, cast):
+        hidden = HIDDEN_SIZES[model]
+        direction, fp8_dtype = CAST_CONFIGS[cast]
+        self.direction = direction
+        quantizer = Float8CurrentScalingQuantizer(
+            fp8_dtype=fp8_dtype,
+            device=torch.device("cuda"),
+            rowwise=True,
+            columnwise=False,
+        )
+        if direction == "dequantize":
+            bf16_tensor = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
+            self.x = quantizer.quantize(bf16_tensor)
+        else:
+            self.x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
+            self.quantizer = quantizer
+
+    def work_cast(self, M, model, cast):
+        hidden = HIDDEN_SIZES[model]
+        # Read input (1B FP8 or 2B BF16) + write output + scale (~hidden bytes total)
+        # Approximated as 3 bytes per element either direction.
+        return {"bytes": M * hidden * 3}
+
+    def time_cast(self, M, model, cast):
+        if self.direction == "quantize":
+            return time_func(lambda: self.quantizer.quantize(self.x))
+        return time_func(lambda: self.x.dequantize(dtype=torch.bfloat16))
+
+
+# Script entry point: when this file is executed directly, hand its path to
+# the driver's main(). The import is local to this branch.
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""BF16 GEMM benchmarks via te.Linear.
+
+GEMM shapes derived from transformer layer projections:
+  QKV, AttnOut, GateUp (SwiGLU), Down.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import time_func
+from shapes import M_SIZES, gemm_shapes
+
+# Maps a shape name to an (N, K) pair; BenchGemm builds te.Linear(K, N) and an
+# (M, K) input from it.
+# Default to the shared dense-model projection shapes; mutate this dict to
+# add custom shapes (e.g. SHAPES["MyModel-QKV"] = (N, K)).
+# NOTE: BenchGemm.params snapshots the keys with list(SHAPES) when the class
+# body runs, so entries must be added before that point.
+SHAPES = gemm_shapes()
+
+
+class BenchGemm:
+    params = [M_SIZES, list(SHAPES)]
+    param_names = ["M", "shape"]
+    timeout = 300
+
+    def setup(self, M, shape):
+        N, K = SHAPES[shape]
+        dtype = torch.bfloat16
+        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)
+        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.linear(self.x))
+
+    def work_forward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 2 * M * N * K}
+
+    def work_forward_backward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 3 * 2 * M * N * K}
+
+    def time_forward(self, M, shape):
+        return time_func(lambda: self.linear(self.x))
+
+    def time_forward_backward(self, M, shape):
+        def fn():
+            out = self.linear(self.x)
+            out.backward(self.grad_out)
+        return time_func(fn)
+
+
+# Script entry point: when this file is executed directly, hand its path to
+# the driver's main(). The import is local to this branch.
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+FP8 GEMM benchmarks via te.Linear under fp8_autocast.
+
+Same shapes as bench_gemm.py but with FP8 quantized compute. Each model
+contributes four GEMM shapes:
+  QKV projection     (column-parallel)  N = (Qheads + 2*KVheads)*head_dim / TP, K = hidden
+  Attention output   (row-parallel)     N = hidden, K = Qheads*head_dim / TP
+  MLP Gate+Up        (column-parallel)  N = 2*intermediate / TP, K = hidden  (SwiGLU)
+  MLP Down           (row-parallel)     N = hidden, K = intermediate / TP
+"""
+
+import torch
+import transformer_engine.pytorch as te
+from transformer_engine.common.recipe import DelayedScaling, Format
+
+from driver import time_func
+from shapes import M_SIZES, gemm_shapes
+
+# Maps a shape name to an (N, K) pair; BenchGemmFP8 builds te.Linear(K, N),
+# an (M, K) input and an (M, N) upstream gradient from it.
+# NOTE: BenchGemmFP8.params snapshots the keys with list(SHAPES) when the
+# class body runs, so entries must be added before that point.
+SHAPES = gemm_shapes()
+
+# Single recipe object shared by every fp8_autocast context in this suite.
+FP8_RECIPE = DelayedScaling(
+    fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max",
+)
+
+
+class BenchGemmFP8:
+    params = [M_SIZES, list(SHAPES)]
+    param_names = ["M", "shape"]
+    timeout = 300
+
+    def setup(self, M, shape):
+        N, K = SHAPES[shape]
+        dtype = torch.bfloat16
+        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)
+        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn(M, N, dtype=dtype, device="cuda")
+
+    def work_forward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 2 * M * N * K}
+
+    def work_forward_backward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 3 * 2 * M * N * K}
+
+    def time_forward(self, M, shape):
+        def fn():
+            with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+                self.linear(self.x)
+        return time_func(fn)
+
+    def time_forward_backward(self, M, shape):
+        def fn():
+            with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+                out = self.linear(self.x)
+            out.backward(self.grad_out)
+        return time_func(fn)
+
+
+# Script entry point: when this file is executed directly, hand its path to
+# the driver's main(). The import is local to this branch.
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)