diff --git a/.gitignore b/.gitignore
index a1da56aa9..726bf6f9e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,4 +58,5 @@ artifacts/
 **/times.csv
 transformer_engine/build_info.txt
 transformer_engine/common/util/hip_nvml.*
+.bench-results/
 *.DS_Store
diff --git a/benchmarks/microbench/README.md b/benchmarks/microbench/README.md
new file mode 100644
index 000000000..fbf3bdf95
--- /dev/null
+++ b/benchmarks/microbench/README.md
@@ -0,0 +1,113 @@
+# Microbenchmarks for TransformerEngine
+
+GPU microbenchmarks driven by `driver.py`. Each `bench_*.py` file defines one
+or more bench classes following an ASV-style API (`params`, `param_names`,
+`time_*` methods, optional `work_*` companions). Timing uses
+`torch.utils.benchmark.Timer` under the hood. The driver runs each suite
+in-process and writes results as long-format CSV — one row per Timer block —
+intended to be consumed by a separate analysis tool (statistical tests,
+cross-run comparison).
+
+## Prerequisites
+
+- TransformerEngine must already be built and installed in the current Python environment.
+- A ROCm or CUDA GPU must be available.
+
+## Running benchmarks
+
+Each `bench_*.py` file is directly executable, or you can drive them through
+`driver.py`. Results are written by default to
+`benchmarks/.bench-results/<machine>/<commit-short>.csv`.
+
+```bash
+cd benchmarks/microbench
+python driver.py --all                      # run every suite
+python driver.py bench_gemm                 # run one suite via driver
+python bench_gemm.py                        # run one suite directly
+python bench_gemm.py time_forward           # filter to method names containing this string
+python bench_casting.py --no-csv            # stdout only, don't write CSV
+python bench_casting.py --csv out.csv       # custom output path
+```
+
+## Output format
+
+Long-format CSV — one row per `torch.utils.benchmark` block. Default location
+is `benchmarks/.bench-results/<machine>/<commit-short>.csv`; the
+`.bench-results` tree is in `.gitignore`. Schema:
+
+| Column | Type | Description |
+|---|---|---|
+| `suite` | str | Module name (e.g. `bench_gemm`) |
+| `class` | str | Bench class name (e.g. `BenchGemm`) |
+| `method` | str | Timed method (e.g. `time_forward`) |
+| `params` | str | `k1=v1;k2=v2` canonical form for joining across runs |
+| `sample_idx` | int | Block index within this Measurement |
+| `time_s` | float | Per-call elapsed seconds (Timer normalizes by `number_per_run`) |
+| `number_per_run` | int | Kernel invocations averaged into this row's `time_s` |
+| `tflops` | float | Per-call throughput, empty if no `work_*` flops |
+| `gbps` | float | Per-call bandwidth, empty if no `work_*` bytes |
+| `commit` | str | Short git HEAD hash |
+| `machine` | str | `platform.node()` |
+| `started_at_ms` | int | Unix-ms timestamp when this method's run began |
+
+`compare.py` prints a quick median-based comparison of two of these CSVs,
+joined on `(suite, class, method, params)`. Statistical tests are left to a
+separate analysis tool (TBD). Note that `time_s` is a *block mean* —
+the analysis tool should weight by `number_per_run` (or use blocks as
+independent samples) when computing significance.
+
+## Writing new benchmarks
+
+Create a new file in `benchmarks/microbench/` following the naming convention `bench_<name>.py`.
+
+```python
+#!/usr/bin/env python3
+import torch
+import transformer_engine.pytorch as te
+
+from driver import time_func
+
+
+class BenchSomething:
+    params = [[1024, 4096], ["config_a", "config_b"]]
+    param_names = ["M", "config"]
+    timeout = 300  # ASV-style hint (seconds per combo); driver.py does not enforce it
+
+    def setup(self, M, config):
+        # Allocate tensors, create modules.
+        # Runs once per (combo, method); same instance is reused for warmup
+        # and timed Timer blocks.
+        self.module = ...
+        self.x = ...  # also set self.grad_out / self.N / self.K, which the methods below read
+
+    def time_forward(self, M, config):
+        return time_func(lambda: self.module(self.x))
+
+    def time_forward_backward(self, M, config):
+        def fn():
+            out = self.module(self.x)
+            out.backward(self.grad_out)
+        return time_func(fn)
+
+    # Optional: define work_<name> to get throughput columns (TFLOPS / GB/s).
+    def work_forward(self, M, config):
+        return {"flops": 2 * M * self.N * self.K}   # compute-bound
+        # return {"bytes": M * self.hidden * 4}     # memory-bound
+
+
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)
+```
+
+Key rules:
+- Method names starting with `time_` are automatically timed.
+- `time_*` methods must return `time_func(fn)` — a `torch.utils.benchmark.Measurement`.
+- Inside `fn`, do whatever per-call work you want measured. For backward,
+  let gradients accumulate in-place across iterations — Timer's repeated
+  invocations don't OOM (grads accumulate into the same tensor) and the
+  numerical correctness of accumulated grad doesn't affect timing.
+- Optionally define `work_<name>` companions to get TFLOPS or GB/s columns.
+  Return per-call work; the driver derives per-sample throughput.
+- The `params` list defines a cross-product; keep the matrix size reasonable.
+
diff --git a/benchmarks/microbench/__init__.py b/benchmarks/microbench/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmarks/microbench/bench_attention.py b/benchmarks/microbench/bench_attention.py
new file mode 100644
index 000000000..cbdb70ef5
--- /dev/null
+++ b/benchmarks/microbench/bench_attention.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+Attention micro-benchmark using te.DotProductAttention.
+
+Benchmarks fused multi-head attention (with flash attention backend) for
+model configurations with grouped-query attention (GQA).
+
+Forward FLOPs = 4 * batch * num_q_heads * seq_len^2 * head_dim
+  (two matmuls: Q@K^T and attn@V, each contributing 2*b*h*s^2*d)
+Backward FLOPs = 2 * Forward FLOPs (approximately)
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import time_func
+from shapes import M_SIZES, attention_configs
+
+BATCH = 2
+
+# Default to the shared dense-model configs; mutate this dict to add custom
+# attention shapes (e.g. CONFIGS["MyModel"] = (qh, kvh, head_dim, tp)).
+CONFIGS = attention_configs()
+
+
+class BenchAttention:  # ASV-style bench class: one run per (seq_len, model) combo
+    params = [M_SIZES, list(CONFIGS)]
+    param_names = ["seq_len", "model"]
+    timeout = 300  # ASV-style attribute; driver.py in this directory never reads it
+
+    def setup(self, seq_len, model):
+        n_q, n_kv, hd, tp = CONFIGS[model]
+        qh, kvh = n_q // tp, n_kv // tp  # head counts divided by the config's tp factor
+        dtype = torch.bfloat16
+
+        self.attn = te.DotProductAttention(
+            num_attention_heads=qh, kv_channels=hd,
+            num_gqa_groups=kvh, attn_mask_type="causal",
+        ).to(device="cuda", dtype=dtype)
+
+        self.q = torch.randn(seq_len, BATCH, qh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.k = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.v = torch.randn(seq_len, BATCH, kvh, hd, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.attn(self.q, self.k, self.v))  # one untimed forward for the shape
+
+    def work_forward(self, seq_len, model):
+        n_q, _n_kv, hd, tp = CONFIGS[model]
+        qh = n_q // tp
+        return {"flops": 4 * BATCH * qh * seq_len * seq_len * hd}  # full s^2 count; causal mask not discounted
+
+    def work_forward_backward(self, seq_len, model):
+        n_q, _n_kv, hd, tp = CONFIGS[model]
+        qh = n_q // tp
+        return {"flops": 3 * 4 * BATCH * qh * seq_len * seq_len * hd}  # forward + backward taken as 2x forward
+
+    def time_forward(self, seq_len, model):
+        return time_func(lambda: self.attn(self.q, self.k, self.v))
+
+    def time_forward_backward(self, seq_len, model):
+        def fn():
+            out = self.attn(self.q, self.k, self.v)
+            out.backward(self.grad_out)  # the same grad_out tensor is reused on every call
+        return time_func(fn)
+
+
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)
diff --git a/benchmarks/microbench/bench_casting.py b/benchmarks/microbench/bench_casting.py
new file mode 100644
index 000000000..1b9e9f9ee
--- /dev/null
+++ b/benchmarks/microbench/bench_casting.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+Benchmarks quantization (BF16 -> FP8) and dequantization (FP8 -> BF16) for
+both E4M3 (activations/weights) and E5M2 (gradients) formats.
+
+Shapes are (M, hidden_size) matching activation tensors from the shared
+dense-model configs. These casts are memory-bound; we report GB/s.
+"""
+
+import torch
+from transformer_engine.pytorch import Float8CurrentScalingQuantizer
+from transformer_engine_torch import DType as TE_DType
+
+from driver import time_func
+from shapes import M_SIZES, hidden_sizes
+
+# Default to the shared per-architecture hidden sizes; mutate to add custom
+# entries (e.g. HIDDEN_SIZES["MyModel"] = 5120).
+HIDDEN_SIZES = hidden_sizes()
+
+CAST_CONFIGS = {
+    "BF16_to_E4M3": ("quantize", TE_DType.kFloat8E4M3),
+    "E4M3_to_BF16": ("dequantize", TE_DType.kFloat8E4M3),
+    "BF16_to_E5M2": ("quantize", TE_DType.kFloat8E5M2),
+    "E5M2_to_BF16": ("dequantize", TE_DType.kFloat8E5M2),
+}
+
+
+class BenchCasting:  # ASV-style bench class: one run per (M, model, cast) combo
+    params = [M_SIZES, list(HIDDEN_SIZES), list(CAST_CONFIGS)]
+    param_names = ["M", "model", "cast"]
+    timeout = 120  # ASV-style attribute; driver.py in this directory never reads it
+
+    def setup(self, M, model, cast):
+        hidden = HIDDEN_SIZES[model]
+        direction, fp8_dtype = CAST_CONFIGS[cast]
+        self.direction = direction  # time_cast branches on this
+        quantizer = Float8CurrentScalingQuantizer(
+            fp8_dtype=fp8_dtype,
+            device=torch.device("cuda"),
+            rowwise=True,
+            columnwise=False,
+        )
+        if direction == "dequantize":
+            bf16_tensor = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
+            self.x = quantizer.quantize(bf16_tensor)  # pre-quantize so only dequantize is timed
+        else:
+            self.x = torch.randn(M, hidden, dtype=torch.bfloat16, device="cuda")
+            self.quantizer = quantizer  # only set (and only needed) for the quantize direction
+
+    def work_cast(self, M, model, cast):
+        hidden = HIDDEN_SIZES[model]
+        # One side of the cast is BF16 (2 B/elem) and the other FP8 (1 B/elem), so
+        # either direction is counted as 3 bytes per element; scale traffic is ignored.
+        return {"bytes": M * hidden * 3}
+
+    def time_cast(self, M, model, cast):
+        if self.direction == "quantize":
+            return time_func(lambda: self.quantizer.quantize(self.x))
+        return time_func(lambda: self.x.dequantize(dtype=torch.bfloat16))
+
+
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)
diff --git a/benchmarks/microbench/bench_gemm.py b/benchmarks/microbench/bench_gemm.py
new file mode 100644
index 000000000..9f6548a28
--- /dev/null
+++ b/benchmarks/microbench/bench_gemm.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""BF16 GEMM benchmarks via te.Linear.
+
+GEMM shapes derived from transformer layer projections:
+  QKV, AttnOut, GateUp (SwiGLU), Down.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import time_func
+from shapes import M_SIZES, gemm_shapes
+
+# Default to the shared dense-model projection shapes; mutate this dict to
+# add custom shapes (e.g. SHAPES["MyModel-QKV"] = (N, K)).
+SHAPES = gemm_shapes()
+
+
+class BenchGemm:  # ASV-style bench class: one run per (M, shape) combo
+    params = [M_SIZES, list(SHAPES)]
+    param_names = ["M", "shape"]
+    timeout = 300  # ASV-style attribute; driver.py in this directory never reads it
+
+    def setup(self, M, shape):
+        N, K = SHAPES[shape]
+        dtype = torch.bfloat16
+        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)  # K inputs -> N outputs
+        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.linear(self.x))  # one untimed forward for the shape
+
+    def work_forward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 2 * M * N * K}  # one multiply-add per (m, n, k) triple
+
+    def work_forward_backward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 3 * 2 * M * N * K}  # 3x forward; presumably forward + dgrad + wgrad GEMMs
+
+    def time_forward(self, M, shape):
+        return time_func(lambda: self.linear(self.x))
+
+    def time_forward_backward(self, M, shape):
+        def fn():
+            out = self.linear(self.x)
+            out.backward(self.grad_out)  # the same grad_out tensor is reused on every call
+        return time_func(fn)
+
+
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)
diff --git a/benchmarks/microbench/bench_gemm_fp8.py b/benchmarks/microbench/bench_gemm_fp8.py
new file mode 100644
index 000000000..211ebca28
--- /dev/null
+++ b/benchmarks/microbench/bench_gemm_fp8.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+FP8 GEMM benchmarks via te.Linear under fp8_autocast.
+
+Same shapes as bench_gemm.py but with FP8 quantized compute. Each model
+contributes four GEMM shapes:
+  QKV projection     (column-parallel)  N = (Qheads + 2*KVheads)*head_dim / TP, K = hidden
+  Attention output   (row-parallel)     N = hidden, K = Qheads*head_dim / TP
+  MLP Gate+Up        (column-parallel)  N = 2*intermediate / TP, K = hidden  (SwiGLU)
+  MLP Down           (row-parallel)     N = hidden, K = intermediate / TP
+"""
+
+import torch
+import transformer_engine.pytorch as te
+from transformer_engine.common.recipe import DelayedScaling, Format
+
+from driver import time_func
+from shapes import M_SIZES, gemm_shapes
+
+SHAPES = gemm_shapes()
+
+FP8_RECIPE = DelayedScaling(
+    fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max",
+)
+
+
+class BenchGemmFP8:  # ASV-style bench class: one run per (M, shape) combo
+    params = [M_SIZES, list(SHAPES)]
+    param_names = ["M", "shape"]
+    timeout = 300  # ASV-style attribute; driver.py in this directory never reads it
+
+    def setup(self, M, shape):
+        N, K = SHAPES[shape]
+        dtype = torch.bfloat16
+        self.linear = te.Linear(K, N, bias=False).to(device="cuda", dtype=dtype)  # K inputs -> N outputs
+        self.x = torch.randn(M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn(M, N, dtype=dtype, device="cuda")  # built directly; no forward in setup
+
+    def work_forward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 2 * M * N * K}  # same FLOP count as the BF16 bench
+
+    def work_forward_backward(self, M, shape):
+        N, K = SHAPES[shape]
+        return {"flops": 3 * 2 * M * N * K}  # 3x forward; presumably forward + dgrad + wgrad GEMMs
+
+    def time_forward(self, M, shape):
+        def fn():
+            with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):  # context entry/exit is inside the timed call
+                self.linear(self.x)
+        return time_func(fn)
+
+    def time_forward_backward(self, M, shape):
+        def fn():
+            with te.fp8_autocast(enabled=True, fp8_recipe=FP8_RECIPE):
+                out = self.linear(self.x)
+            out.backward(self.grad_out)  # backward is issued after leaving the autocast context
+        return time_func(fn)
+
+
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)
diff --git a/benchmarks/microbench/bench_grouped_gemm.py b/benchmarks/microbench/bench_grouped_gemm.py
new file mode 100644
index 000000000..2fdc1283e
--- /dev/null
+++ b/benchmarks/microbench/bench_grouped_gemm.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Grouped GEMM benchmarks via te.GroupedLinear.
+
+MoE model configurations with GateUp and Down projections, swept over a
+range of expert-parallel sizes.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import time_func
+from shapes import grouped_gemm_configs
+
+# Grouped GEMM scales with B, so we sweep smaller M than dense benchmarks
+# to keep the working set and runtime reasonable.
+M_SIZES = [512, 1024, 2048, 4096]
+
+# Default to the shared MoE configs; mutate to add custom shapes
+# (e.g. CONFIGS["MyMoE_EP4-GateUp"] = (B, N, K)).
+CONFIGS = grouped_gemm_configs()
+
+
+class BenchGroupedGemm:
+    """Time te.GroupedLinear over B GEMMs (experts) with M rows routed to each."""
+    params = [M_SIZES, list(CONFIGS)]
+    param_names = ["M", "config"]
+    timeout = 300
+
+    def setup(self, M, config):
+        B, N, K = CONFIGS[config]
+        dtype = torch.bfloat16
+
+        self.module = te.GroupedLinear(
+            num_gemms=B, in_features=K, out_features=N, bias=False,
+        ).to(device="cuda", dtype=dtype)
+
+        # GroupedLinear.forward takes one packed (sum(m_splits), K) tensor plus the
+        # per-GEMM row counts, not a list of per-expert tensors.
+        self.m_splits = [M] * B
+        self.x = torch.randn(B * M, K, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.module(self.x, self.m_splits))
+
+    def work_forward(self, M, config):
+        B, N, K = CONFIGS[config]
+        return {"flops": B * 2 * M * N * K}  # B independent (M, K) x (K, N) GEMMs
+
+    def work_forward_backward(self, M, config):
+        B, N, K = CONFIGS[config]
+        return {"flops": B * 3 * 2 * M * N * K}  # counted as 3x the forward FLOPs
+
+    def time_forward(self, M, config):
+        return time_func(lambda: self.module(self.x, self.m_splits))
+
+    def time_forward_backward(self, M, config):
+        def fn():
+            out = self.module(self.x, self.m_splits)
+            out.backward(self.grad_out)  # the same grad_out tensor is reused on every call
+        return time_func(fn)
+
+
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)
diff --git a/benchmarks/microbench/bench_normalization.py b/benchmarks/microbench/bench_normalization.py
new file mode 100644
index 000000000..ec8f9e9ef
--- /dev/null
+++ b/benchmarks/microbench/bench_normalization.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""
+RMSNorm and LayerNorm benchmarks on activation-sized tensors.
+
+Modern models predominantly use RMSNorm, but we benchmark both since TE
+supports both and they share the same kernel infrastructure.
+
+The M dimension (batch * seq_len) is swept across typical training sizes;
+hidden sizes are derived from the shared dense-model configs.
+"""
+
+import torch
+import transformer_engine.pytorch as te
+
+from driver import time_func
+from shapes import M_SIZES, hidden_sizes
+
+NORMS = {"RMSNorm": te.RMSNorm, "LayerNorm": te.LayerNorm}
+
+# Sweep unique hidden sizes from the shared dense-model configs; replace or
+# extend (e.g. HIDDEN_SIZES.append(2048)) to add custom shapes.
+HIDDEN_SIZES = sorted(set(hidden_sizes().values()))
+
+
+class BenchNormalization:  # ASV-style bench class: one run per (M, hidden, norm_type) combo
+    params = [M_SIZES, HIDDEN_SIZES, list(NORMS)]
+    param_names = ["M", "hidden", "norm_type"]
+    timeout = 120  # ASV-style attribute; driver.py in this directory never reads it
+
+    def setup(self, M, hidden, norm_type):
+        dtype = torch.bfloat16
+        self.norm = NORMS[norm_type](hidden).to(device="cuda", dtype=dtype)  # te.RMSNorm or te.LayerNorm
+        self.x = torch.randn(M, hidden, dtype=dtype, device="cuda", requires_grad=True)
+        self.grad_out = torch.randn_like(self.norm(self.x))  # one untimed forward for the shape
+
+    def work_forward(self, M, hidden, norm_type):
+        # Read input (2B) + write output (2B) = 4 bytes per element
+        return {"bytes": M * hidden * 4}  # norm parameters are not counted
+
+    def work_forward_backward(self, M, hidden, norm_type):
+        # Fwd: read+write (4B), Bwd: read input+grad_out+write grad_in (6B) = 10B
+        return {"bytes": M * hidden * 10}  # norm parameters and their grads are not counted
+
+    def time_forward(self, M, hidden, norm_type):
+        return time_func(lambda: self.norm(self.x))
+
+    def time_forward_backward(self, M, hidden, norm_type):
+        def fn():
+            out = self.norm(self.x)
+            out.backward(self.grad_out)  # the same grad_out tensor is reused on every call
+        return time_func(fn)
+
+
+if __name__ == "__main__":
+    from driver import main
+    main(__file__)
diff --git a/benchmarks/microbench/compare.py b/benchmarks/microbench/compare.py
new file mode 100644
index 000000000..8d562084a
--- /dev/null
+++ b/benchmarks/microbench/compare.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Compare two microbench CSVs produced by driver.py.
+
+Joins on (suite, class, method, params). For each group, reports baseline and
+candidate medians plus a speedup; >1 means candidate is faster (for time) or
+higher-throughput (for tflops/gbps).
+
+Usage:
+    python compare.py baseline.csv candidate.csv
+    python compare.py baseline.csv candidate.csv --metric tflops
+    python compare.py baseline.csv candidate.csv --sort speedup --top 20
+"""
+
+import argparse
+import csv
+import statistics
+import sys
+from collections import defaultdict
+
+KEY_COLS = ("suite", "class", "method", "params")
+
+
+def load(path, metric):
+    """Group the *metric* column of the CSV at *path* by KEY_COLS.
+
+    Returns {key_tuple: [float, ...]}. Rows whose metric cell is missing, empty
+    (tflops/gbps without a work_* companion) or non-numeric, or that lack a key
+    column, are skipped silently and never create a group.
+    """
+    groups = defaultdict(list)
+    with open(path, newline="") as f:
+        for row in csv.DictReader(f):
+            try:
+                # Build key and value *before* touching the defaultdict, so a bad
+                # row cannot leave an empty group behind.
+                key, num = tuple(row[k] for k in KEY_COLS), float(row.get(metric) or "")
+            except (ValueError, KeyError):
+                continue
+            groups[key].append(num)
+    return groups
+
+
+def summarize(samples):
+    """Return the tuple (median, mean, n) for *samples*, or None when *samples* is empty."""
+    if not samples:
+        return None
+    return statistics.median(samples), statistics.fmean(samples), len(samples)
+
+
+def main():  # CLI entry point; the returned 0 is passed to sys.exit by the __main__ guard
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("baseline_csv")
+    p.add_argument("candidate_csv")
+    p.add_argument("--metric", default="time_s", choices=["time_s", "tflops", "gbps"],
+                   help="column to compare (default: time_s)")
+    p.add_argument("--sort", default="suite", choices=["suite", "speedup", "abs_change"],
+                   help="row order (default: suite)")
+    p.add_argument("--top", type=int, default=None,
+                   help="show only the top N rows after sorting")
+    p.add_argument("--min-samples", type=int, default=1,
+                   help="skip groups with fewer than this many samples in either CSV")
+    args = p.parse_args()
+
+    base = load(args.baseline_csv, args.metric)
+    cand = load(args.candidate_csv, args.metric)
+
+    # For time: lower is better, so speedup = base / cand.
+    # For tflops/gbps: higher is better, so speedup = cand / base.
+    lower_is_better = (args.metric == "time_s")
+
+    rows = []
+    for key in sorted(base.keys() | cand.keys()):  # sorted keys give the default "suite" order
+        b = summarize(base.get(key, []))
+        c = summarize(cand.get(key, []))
+        if b is None or c is None:  # group has samples on one side only: listed, not compared
+            rows.append({"key": key, "status": "baseline-only" if c is None else "candidate-only",
+                         "b": b, "c": c, "speedup": None})
+            continue
+        if b[2] < args.min_samples or c[2] < args.min_samples:
+            continue  # under-sampled groups are dropped without a message
+        b_med, c_med = b[0], c[0]
+        if b_med <= 0 or c_med <= 0:
+            speedup = None  # no ratio is computed; the row is filtered out of `matched` below
+        else:
+            speedup = (b_med / c_med) if lower_is_better else (c_med / b_med)
+        rows.append({"key": key, "status": "matched", "b": b, "c": c, "speedup": speedup})
+
+    matched = [r for r in rows if r["status"] == "matched" and r["speedup"] is not None]
+    only_b = [r for r in rows if r["status"] == "baseline-only"]
+    only_c = [r for r in rows if r["status"] == "candidate-only"]
+
+    if args.sort == "speedup":
+        matched.sort(key=lambda r: r["speedup"])  # ascending: smallest ratio first
+    elif args.sort == "abs_change":
+        matched.sort(key=lambda r: -abs(r["speedup"] - 1.0))  # largest deviation from 1.0x first
+    if args.top is not None:
+        matched = matched[:args.top]  # the summary line below then covers only the rows kept here
+
+    unit = "ms" if args.metric == "time_s" else args.metric
+    scale = 1e3 if args.metric == "time_s" else 1.0  # time_s medians are printed in milliseconds
+    print(f"{'suite':<22} {'class':<22} {'method':<22} {'params':<48} "
+          f"{'base ' + unit:>12} {'cand ' + unit:>12} {'speedup':>9}  n_b/n_c")
+    print("-" * 160)
+    for r in matched:
+        s, cls, m, params = r["key"]
+        b_med, c_med = r["b"][0] * scale, r["c"][0] * scale
+        n_b, n_c = r["b"][2], r["c"][2]
+        print(f"{s:<22} {cls:<22} {m:<22} {params:<48} "
+              f"{b_med:>12.4f} {c_med:>12.4f} {r['speedup']:>8.3f}x  {n_b}/{n_c}")
+
+    if matched:
+        speedups = [r["speedup"] for r in matched]
+        print()
+        print(f"{len(matched)} matched groups: "
+              f"median {statistics.median(speedups):.3f}x  "
+              f"min {min(speedups):.3f}x  "
+              f"max {max(speedups):.3f}x")
+
+    for label, rows_list in [("baseline only", only_b), ("candidate only", only_c)]:
+        if not rows_list:
+            continue
+        print(f"\n{len(rows_list)} groups {label}:")
+        for r in rows_list:
+            print("  " + " | ".join(r["key"]))  # key is the (suite, class, method, params) tuple
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/benchmarks/microbench/driver.py b/benchmarks/microbench/driver.py
new file mode 100644
index 000000000..62d28324e
--- /dev/null
+++ b/benchmarks/microbench/driver.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Microbenchmark driver — runs Bench* classes via torch.utils.benchmark.
+
+Each bench file declares one or more ``Bench*`` classes with ``params``,
+``param_names``, and ``time_*`` methods (optionally paired with ``work_*``
+companions returning ``{"flops": ...}`` or ``{"bytes": ...}``). The driver
+runs them and writes long-format CSV (one row per Timer block).
+
+Usage:
+    python driver.py <suite> [method_filter] [--csv FILE | --no-csv]
+    python driver.py --all   [method_filter] [--csv FILE | --no-csv]
+    python bench_gemm.py     [method_filter] [--csv FILE | --no-csv]
+"""
+
+import argparse
+import csv
+import glob
+import importlib
+import itertools
+import os
+import platform
+import subprocess
+import sys
+import time
+
+import torch.utils.benchmark as benchmark
+
+
+def time_func(fn, min_run_time=1.0):
+    """Time zero-argument *fn* via torch.utils.benchmark.Timer.blocked_autorange; return its result."""
+    return benchmark.Timer(stmt="fn()", globals={"fn": fn}).blocked_autorange(  # fn reaches stmt via globals
+        min_run_time=min_run_time)
+
+
+def _commit():  # short HEAD hash as reported by git in the current working directory, else "unknown"
+    try:
+        return subprocess.check_output(
+            ["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.DEVNULL
+        ).decode().strip()
+    except Exception:  # best effort: any failure to run git falls back to a placeholder
+        return "unknown"
+
+
+CSV_COLUMNS = [
+    "suite", "class", "method", "params", "sample_idx", "time_s",
+    "number_per_run", "tflops", "gbps", "commit", "machine", "started_at_ms",
+]
+
+
+def _default_csv_path(script_dir):  # -> <root>/benchmarks/.bench-results/<machine>/<commit>.csv
+    repo_root = os.path.abspath(os.path.join(script_dir, "..", ".."))  # assumes script_dir is two levels below the repo root
+    return os.path.join(repo_root, "benchmarks", ".bench-results",
+                        platform.node() or "unknown", f"{_commit()}.csv")
+
+
+def run_class(suite, cls, class_name, method_filter, commit, machine):
+    """Run each (filtered) time_* method of *cls* per param combo; return the CSV row dicts."""
+    methods = sorted(m for m in dir(cls) if m.startswith("time_")
+                     and (not method_filter or method_filter in m))
+    if not methods:
+        return []
+
+    param_names = getattr(cls, "param_names", [])
+    combos = list(itertools.product(*getattr(cls, "params", [])))  # no params -> one empty combo
+
+    print(f"\n{class_name}  ({len(combos)} combos x {len(methods)} methods)")
+    hdr = (f"  {'median':>10}  {'mean':>10}  {'iqr':>10}  "
+           f"{'TFLOPS':>8}  {'GB/s':>8}  {'method':<28}  params")
+    print("-" * len(hdr)); print(hdr); print("-" * len(hdr))
+
+    def fmt(val):
+        return f"{val:>8.1f}" if val else f"{'':>8}"  # blank cell when no work_* value exists
+
+    rows = []
+    for method_name in methods:
+        started_at_ms = int(time.time() * 1000)  # stamped once per method, shared by all its combos
+        for combo in combos:
+            label = ", ".join(f"{n}={v}" for n, v in zip(param_names, combo))
+            params_str = ";".join(f"{n}={v}" for n, v in zip(param_names, combo))
+            inst = cls()  # fresh instance for every (method, combo) pair
+            try:
+                inst.setup(*combo)
+                m = getattr(inst, method_name)(*combo)
+            except Exception as e:  # a failing combo is reported and skipped, not fatal
+                print(f"  SKIP  {label}  {method_name}: {e}")
+                continue
+
+            wfn = getattr(inst, "work_" + method_name[5:], None)  # time_foo pairs with optional work_foo
+            work = wfn(*combo) if wfn else {}
+            flops, byts = work.get("flops"), work.get("bytes")
+
+            for i, t in enumerate(m.times):  # one CSV row per entry of the measurement's times
+                rows.append({
+                    "suite": suite, "class": class_name, "method": method_name,
+                    "params": params_str, "sample_idx": i, "time_s": t,
+                    "number_per_run": m.number_per_run,
+                    "tflops": flops / t / 1e12 if flops and t > 0 else "",
+                    "gbps": byts / t / 1e9 if byts and t > 0 else "",
+                    "commit": commit, "machine": machine,
+                    "started_at_ms": started_at_ms,
+                })
+
+            tflops = flops / m.median / 1e12 if flops and m.median > 0 else None
+            gbps = byts / m.median / 1e9 if byts and m.median > 0 else None
+            print(f"  {m.median*1000:>8.3f}ms  {m.mean*1000:>8.3f}ms  "
+                  f"{m.iqr*1000:>8.3f}ms  {fmt(tflops)}  {fmt(gbps)}  "
+                  f"{method_name:<28}  {label}")
+    return rows
+
+
+def main(caller_file=None):
+    """CLI entry point: run the suite named by *caller_file* (a bench_*.py) or chosen via argv."""
+    parser = argparse.ArgumentParser(description="Run microbenchmarks via torch.utils.benchmark.")
+    if caller_file is None:
+        parser.add_argument("suite", nargs="?", help="bench module (e.g. bench_gemm)")
+        parser.add_argument("--all", action="store_true",
+                            help="run all bench_*.py in this directory")
+    parser.add_argument("method_filter", nargs="?", default=None,
+                        help="only run time_* methods containing this string")
+    parser.add_argument("--csv", default=None, metavar="FILE",
+                        help="output CSV path "
+                             "(default: benchmarks/.bench-results/<machine>/<commit>.csv)")
+    parser.add_argument("--no-csv", action="store_true",
+                        help="don't write CSV (stdout summary only)")
+    args = parser.parse_args()
+
+    script_dir = os.path.dirname(os.path.abspath(__file__ if caller_file is None else caller_file))
+    if caller_file is not None:  # a bench_*.py ran itself: the suite is that file
+        suites = [os.path.splitext(os.path.basename(caller_file))[0]]
+    elif args.all:
+        # With --all, argparse binds a lone positional to `suite`; it is really the filter.
+        args.method_filter = args.suite if args.method_filter is None else args.method_filter
+        suites = sorted(os.path.splitext(os.path.basename(f))[0]
+                        for f in glob.glob(os.path.join(script_dir, "bench_*.py")))
+    elif args.suite:
+        suites = [args.suite]
+    else:
+        parser.error("provide a suite name or use --all")
+
+    csv_path = os.path.abspath(args.csv) if args.csv else None  # resolve before the chdir below
+    os.chdir(script_dir)
+    if script_dir not in sys.path:
+        sys.path.insert(0, script_dir)
+
+    commit, machine = _commit(), platform.node() or "unknown"
+    rows = []
+    for suite in suites:
+        mod = importlib.import_module(suite)
+        for name in sorted(dir(mod)):
+            obj = getattr(mod, name)
+            if isinstance(obj, type) and name.startswith("Bench"):
+                rows.extend(run_class(suite, obj, name, args.method_filter,
+                                      commit, machine))
+
+    if rows and not args.no_csv:
+        path = csv_path or _default_csv_path(script_dir)  # both branches yield an absolute path
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        with open(path, "w", newline="") as f:
+            w = csv.DictWriter(f, fieldnames=CSV_COLUMNS, extrasaction="ignore")
+            w.writeheader()
+            w.writerows(rows)
+        print(f"\nWrote {len(rows)} rows to {path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/microbench/shapes.py b/benchmarks/microbench/shapes.py
new file mode 100644
index 000000000..c4172aff9
--- /dev/null
+++ b/benchmarks/microbench/shapes.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+###############################################################################
+# Copyright (c) 2026, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+###############################################################################
+"""Shared model configs and shape helpers for microbenchmarks.
+
+Each helper returns a fresh dict — callers may mutate it (add custom entries,
+drop unwanted ones) without affecting other benchmarks. Callers that want a
+different set of base models can pass ``models=`` to override the default.
+
+Sources:
+  https://huggingface.co/meta-llama/Llama-3.1-8B/blob/main/config.json
+  https://huggingface.co/meta-llama/Llama-3.1-70B/blob/main/config.json
+  https://huggingface.co/meta-llama/Llama-3.1-405B/blob/main/config.json
+  https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/blob/main/config.json
+  https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/blob/main/config.json
+  MoE configs: https://github.com/AMD-AGI/Primus-Turbo/blob/main/benchmark/ops/config.py
+"""
+
+# Default token-count (batch * seq_len) sweep used by most benches; ascending powers of two.
+M_SIZES = [1024, 2048, 4096, 8192]
+
+# Dense transformer configs: (hidden, intermediate, num_q_heads, num_kv_heads, head_dim, tp)
+DENSE_MODELS = {  # dims are unsharded; gemm_shapes() floor-divides the split dim by tp
+    "Llama3-8B_TP1":   (4096, 14336, 32, 8, 128, 1),
+    "Llama3-8B_TP8":   (4096, 14336, 32, 8, 128, 8),
+    "Llama3-70B_TP8":  (8192, 28672, 64, 8, 128, 8),
+    "Llama3-405B_TP8": (16384, 53248, 128, 8, 128, 8),
+    "Qwen2.5-7B_TP1":  (3584, 18944, 28, 4, 128, 1),
+    "Qwen2.5-72B_TP8": (8192, 29568, 64, 8, 128, 8),
+}
+
+# MoE configs: (n_routed_experts, moe_intermediate_size, hidden_size)
+MOE_MODELS = {  # consumed by grouped_gemm_configs() when no models= override is given
+    "DSV2-Lite": (64, 1408, 2048),
+    "DSV2":      (160, 1536, 5120),
+    "DSV3":      (256, 2048, 7168),
+    "Grok-V2":   (8, 16384, 8192),
+}
+
+# Default expert-parallel sweep for grouped GEMM.
+EP_SIZES = [32, 16, 8]  # grouped_gemm_configs() skips sizes that don't divide the expert count
+
+
+def gemm_shapes(models=None):
+    """Per-projection ``(N, K)`` shapes derived from dense transformer configs.
+
+    Returns ``{"<model>-QKV": (N, K), "<model>-AttnOut": ..., "-GateUp", "-Down"}``.
+    """
+    shapes = {}
+    for name, (h, inter, nq, nkv, hd, tp) in (models or DENSE_MODELS).items():
+        shapes[f"{name}-QKV"]     = ((nq * hd + 2 * nkv * hd) // tp, h)
+        shapes[f"{name}-AttnOut"] = (h, (nq * hd) // tp)
+        shapes[f"{name}-GateUp"]  = ((2 * inter) // tp, h)
+        shapes[f"{name}-Down"]    = (h, inter // tp)
+    return shapes
+
+
+def attention_configs(models=None):
+    """Per-model attention configs as ``(num_q_heads, num_kv_heads, head_dim, tp)``."""
+    return {
+        name: (nq, nkv, hd, tp)
+        for name, (_h, _i, nq, nkv, hd, tp) in (models or DENSE_MODELS).items()
+    }
+
+
+def hidden_sizes(models=None):
+    """Per-architecture ``{name: hidden}`` mapping, deduplicated across TP variants."""
+    out = {}
+    for name, (h, *_) in (models or DENSE_MODELS).items():
+        base = name.split("_TP")[0]
+        out.setdefault(base, h)
+    return out
+
+
+def grouped_gemm_configs(models=None, ep_sizes=None):
+    """Grouped GEMM ``(B, N, K)`` configs for MoE GateUp + Down projections.
+
+    ``B = n_routed_experts // ep`` for each EP that divides ``n_routed_experts``;
+    other EPs are silently skipped.
+    """
+    configs = {}
+    for name, (n_experts, inter, hidden) in (models or MOE_MODELS).items():
+        for ep in (ep_sizes or EP_SIZES):
+            if n_experts % ep != 0:
+                continue
+            B = n_experts // ep
+            configs[f"{name}_EP{ep}-GateUp"] = (B, 2 * inter, hidden)
+            configs[f"{name}_EP{ep}-Down"]   = (B, hidden, inter)
+    return configs