From 5e6533d563ebc68df4d11ef617235c90620460f9 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Wed, 27 May 2026 12:20:58 -0700
Subject: [PATCH 01/14] feat(power): extend measured-power aggregation to
 multinode srt-slurm runs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Builds on PR #1558 (single-node measured-power) for multinode benchmarks
via srt-slurm. Pipeline:

  srt-slurm perfmon (per-node nvidia-smi sampling — PR #35 on
    NVIDIA/srt-slurm, layered on SemiAnalysisAI/srt-slurm:feat/inferencex-perfmon)
  → perf_samples_<host>.csv in outputs/<job>/logs/ on shared NFS
  → launch_gb300-cw.sh exports GPU_METRICS_CSV_GLOB to $GITHUB_ENV
  → process_result.py expands the glob and hands the list to
    aggregate_power.run()
  → aggregate_power.py namespaces local GPU indices per source CSV stem so
    each node's local indices 0..N-1 don't collide across nodes; emits
    cluster-wide avg_power_w + joules_per_*_token
  → InferenceX-app ETL auto-captures the numeric fields (no schema change)

Changes:

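- utils/aggregate_power.py: widen csv_path to Path | Iterable[Path] keeping
  the original param name. Per-source GPU-id namespacing only kicks in when
  there are 2+ sources so single-node num_gpus is unchanged. CLI adds
  --csv-glob (Python-side glob, mutually exclusive with --csv).
  _load_bench_window falls back to the bench JSON's `date` + duration, then
  to file mtime + duration, when the benchmark_*_time_unix fields are absent
  (srt-slurm sa-bench result JSONs do not write them).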
- utils/process_result.py: bridge GPU_METRICS_CSV_GLOB env var. Glob takes
  precedence over single GPU_METRICS_CSV when both are set.
- runners/launch_gb300-cw.sh: point dynamo-sglang at our srt-slurm fork,
  append `monitoring:` block to each recipe post-copy (idempotent), and
  write GPU_METRICS_CSV_GLOB to $GITHUB_ENV after the job for the
  downstream Process result step.
- 8 new multinode tests in test_aggregate_power.py (per-source namespacing,
  sub-second clock drift, asymmetric prefill/decode power, missing-CSV
  silent skip, backward-compat single-path-in-list, Iterable acceptance,
  E2E run with list, all-CSVs-missing skip), plus 4 for the
  _load_bench_window fallbacks (date field, unparseable date, no date field,
  missing duration). 3 new in test_process_result.py (glob aggregation,
  precedence over single CSV, empty-match falls through). 64/64 pass.

Verified data-format end-to-end on gb300 hardware: nvidia-smi
inside the sglang container emits the columns aggregate_power.py needs:
timestamp, gpu, power_w.
---
 runners/launch_gb300-cw.sh    |  41 +++++-
 utils/aggregate_power.py      | 225 ++++++++++++++++++++++----------
 utils/process_result.py       |  43 +++++--
 utils/test_aggregate_power.py | 236 +++++++++++++++++++++++++++++++++-
 utils/test_process_result.py  | 105 +++++++++++++++
 5 files changed, 565 insertions(+), 85 deletions(-)
diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 25e7f4db5..9f3222dad 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -12,8 +12,13 @@ if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then
     export MODEL_PATH="/mnt/vast/models/dsv4"
 
     if [[ $FRAMEWORK == "dynamo-sglang" ]]; then
-        SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git"
-        SRT_SLURM_RECIPES_REF="main"
+        # Pinned to our SemiAnalysisAI fork of NVIDIA/srt-slurm to pick up
+        # PR #35 (per-node nvidia-smi monitoring during the benchmark sweep)
+        # ahead of its upstream merge. The branch tracks PR #35's head SHA:
+        # to bump, re-fetch refs/pull/35/head from NVIDIA/srt-slurm and force-
+        # push to SemiAnalysisAI/srt-slurm:feat/inferencex-perfmon.
+        SRT_SLURM_RECIPES_REPO="https://github.com/SemiAnalysisAI/srt-slurm.git"
+        SRT_SLURM_RECIPES_REF="feat/inferencex-perfmon"
         SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4"
         SRT_RECIPE_DST="recipes/sglang/deepseek-v4"
     elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then
@@ -106,6 +111,19 @@ git checkout "$SRT_SLURM_RECIPES_REF"
 mkdir -p "$SRT_RECIPE_DST"
 cp -rT "$SRT_RECIPE_SRC" "$SRT_RECIPE_DST"
 
+# Enable per-node GPU perfmon (PR #35) on every overlaid recipe. `monitoring`
+# is a top-level SrtConfig field and defaults to None, so without this the
+# orchestrator's _start_perf_monitor short-circuits and no perf_samples_*.csv
+# are ever written — multinode measured-power aggregation would silently
+# skip. Idempotent: skips recipes that already declare `monitoring:`.
+for recipe in "$SRT_RECIPE_DST"/*.yaml; do
+    [ -f "$recipe" ] || continue
+    if ! grep -q '^monitoring:' "$recipe"; then
+        printf '\nmonitoring:\n  enabled: true\n  sample_interval: 1.0\n' >> "$recipe"
+        echo "[perfmon] enabled monitoring in recipe: $recipe"
+    fi
+done
+
 echo "Installing srtctl..."
 # CRITICAL — uv install location.
 # Runner pod is x86 but compute nodes are aarch64, and /mnt/home is
@@ -279,6 +297,25 @@ else
     echo "Warning: Logs directory not found at $LOGS_DIR"
 fi
 
+# Hand the per-node perfmon CSVs off to the downstream "Process result" step
+# in benchmark-multinode-tmpl.yml. srt-slurm's perfmon (PR #35) writes
+# perf_samples_{node}.csv straight into $LOGS_DIR on the host. process_result.py
+# already invokes aggregate_power.run() inline; teaching it to read
+# GPU_METRICS_CSV_GLOB lets utils/aggregate_power.py do the multi-CSV
+# aggregation (each agg JSON gets avg_power_w / joules_per_*_token patched in
+# place). Use an absolute glob because process_result.py runs from
+# $GITHUB_WORKSPACE, not from this srt-slurm checkout.
+if [ -d "$LOGS_DIR" ]; then
+    perf_glob_dir="$(pwd)/$LOGS_DIR"
+    perf_csv_count=$(ls "$perf_glob_dir"/perf_samples_*.csv 2>/dev/null | wc -l | tr -d ' ')
+    if [ "$perf_csv_count" -gt 0 ]; then
+        echo "[perfmon] Found $perf_csv_count per-node perf_samples_*.csv under $perf_glob_dir/"
+        echo "GPU_METRICS_CSV_GLOB=$perf_glob_dir/perf_samples_*.csv" >> "$GITHUB_ENV"
+    else
+        echo "[perfmon] WARNING: monitoring enabled but no perf_samples_*.csv found in $perf_glob_dir — measured power aggregation will be skipped"
+    fi
+fi
+
 if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
     if [ ! -d "$LOGS_DIR" ]; then
         exit 1
diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py
index 3c204085a..ab6fcef3e 100644
--- a/utils/aggregate_power.py
+++ b/utils/aggregate_power.py
@@ -1,12 +1,19 @@
 """Aggregate measured GPU power from a vendor SMI CSV into the agg result JSON.
 
-Reads a GPU-metrics CSV produced by `start_gpu_monitor` (nvidia-smi or amd-smi),
-filters samples to the benchmark load window using start/end Unix timestamps
-written by benchmark_serving.py, and patches two keys into the aggregated
-result JSON consumed by InferenceX-app's ETL:
+Reads a GPU-metrics CSV produced by `start_gpu_monitor` (nvidia-smi or amd-smi)
+or by srt-slurm's per-node perfmon (multinode), filters samples to the benchmark
+load window using start/end Unix timestamps written by benchmark_serving.py, and
+patches three keys into the aggregated result JSON consumed by InferenceX-app's
+ETL:
 
   - avg_power_w:               mean per-GPU power draw (W) during the load window
   - joules_per_output_token:   (avg_power_w * num_gpus * duration_s) / total_output_tokens
+  - joules_per_total_token:    same, divided by (input + output) tokens
+
+Multinode: accepts multiple CSV paths (one per worker node). GPU indices are
+namespaced by source CSV stem to avoid the same-index collision across nodes —
+e.g. 8 nodes each reporting indices 0..3 would otherwise be miscounted as 4
+total GPUs instead of 32.
 
 The ETL (`packages/db/src/etl/benchmark-mapper.ts`) auto-captures any numeric
 field in the agg JSON into the `metrics` JSONB column, so no schema migration
@@ -14,8 +21,8 @@
 
 Vendor schema detection is regex-based: any timestamp-like column + any column
 whose name contains "power" (excluding "limit"/"cap"/"max") is picked up.
-NVIDIA emits "power.draw [W]"; AMD's amd-smi varies by version. Both are
-handled.
+NVIDIA emits "power.draw [W]"; AMD's amd-smi varies by version; srt-slurm's
+perfmon emits "power_w". All are handled.
 
 This script is best-effort. Missing or malformed CSV exits 0 without patching
 so a monitoring hiccup never breaks the benchmark upload.
@@ -25,9 +32,11 @@
 
 import argparse
 import csv
+import glob as glob_module
 import json
 import re
 import sys
+from collections.abc import Iterable
 from datetime import datetime, timezone
 from pathlib import Path
 from statistics import mean
@@ -109,74 +118,84 @@ def _detect_columns(header: list[str]) -> tuple[str | None, str | None, str | No
 
 
 def aggregate_power(
-    csv_path: Path,
+    csv_path: Path | Iterable[Path],
     start_unix: float,
     end_unix: float,
 ) -> tuple[float, int] | None:
     """Return (per_gpu_avg_power_w, num_gpus) for samples in [start, end].
 
-    Returns None if the CSV is missing, empty, has no detectable power column,
-    or no rows fall in the window.
+    Accepts either a single Path (single-node case) or an iterable of Paths
+    (multinode case: one CSV per worker node, all written by srt-slurm's
+    perfmon). For multi-path inputs, GPU indices are namespaced by source
+    CSV stem so the distinct-id count reflects the true total — each node
+    independently reports indices 0..N, and without namespacing the union
+    would collapse to a single node's worth.
+
+    Returns None if no CSVs are usable, none have a detectable power column,
+    or no rows fall in the window across all paths.
     """
-    if not csv_path.is_file() or csv_path.stat().st_size == 0:
-        return None
-    if end_unix <= start_unix:
+    paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path)
+    if not paths or end_unix <= start_unix:
         return None
 
-    try:
-        with csv_path.open("r", newline="", encoding="utf-8", errors="replace") as f:
-            reader = csv.DictReader(f, skipinitialspace=True)
-            header = [c.strip() for c in (reader.fieldnames or [])]
-            reader.fieldnames = header
-            timestamp_col, power_col, gpu_col = _detect_columns(header)
-            if not timestamp_col or not power_col:
-                return None
-
-            # Group power readings by sample timestamp so per-sample total power
-            # (sum across GPUs) is computed correctly even if rows are interleaved.
-            #
-            # per_sample_row_count is the structural divisor: it's incremented for
-            # every contributing row regardless of whether a GPU-index column was
-            # detected. per_sample_gpus / gpu_keys are only populated when gpu_col
-            # is present and provide the canonical num_gpus via distinct-id count.
-            # When gpu_col is absent (vendor schema variant whose header doesn't
-            # match _GPU_INDEX_COL_RE), we fall back to inferring num_gpus from
-            # the modal row count per timestamp — assuming one row per GPU per
-            # sample, which is what every SMI tool we've seen actually emits.
-            per_sample_total: dict[float, float] = {}
-            per_sample_row_count: dict[float, int] = {}
-            per_sample_gpus: dict[float, set[str]] = {}
-            gpu_keys: set[str] = set()
-
-            for row in reader:
-                ts_raw = (row.get(timestamp_col) or "").strip()
-                pw_raw = (row.get(power_col) or "").strip()
-                ts = _parse_timestamp(ts_raw)
-                pw = _parse_power(pw_raw)
-                if ts is None or pw is None:
-                    continue
-                if ts < start_unix or ts > end_unix:
+    # Only namespace when there are multiple sources — keeps single-node
+    # gpu_keys identical to the pre-multinode behavior so existing callers
+    # see the same num_gpus values.
+    namespace = len(paths) > 1
+
+    # Per-sample state accumulates across ALL paths. Bucketed by ms-rounded
+    # timestamp: rows from different nodes share a bucket only when their
+    # timestamps agree to the millisecond; otherwise each node's rows form
+    # their own buckets, each averaged over the GPUs seen in that bucket.
+    per_sample_total: dict[float, float] = {}
+    per_sample_row_count: dict[float, int] = {}
+    per_sample_gpus: dict[float, set[str]] = {}
+    gpu_keys: set[str] = set()
+    saw_gpu_col = False
+
+    for path in paths:
+        if not path.is_file() or path.stat().st_size == 0:
+            continue
+        try:
+            with path.open("r", newline="", encoding="utf-8", errors="replace") as f:
+                reader = csv.DictReader(f, skipinitialspace=True)
+                header = [c.strip() for c in (reader.fieldnames or [])]
+                reader.fieldnames = header
+                timestamp_col, power_col, gpu_col = _detect_columns(header)
+                if not timestamp_col or not power_col:
                     continue
-                # Bucket by sample timestamp (rounded to ms to absorb sub-ms drift).
-                bucket = round(ts, 3)
-                per_sample_total[bucket] = per_sample_total.get(bucket, 0.0) + pw
-                per_sample_row_count[bucket] = per_sample_row_count.get(bucket, 0) + 1
                 if gpu_col:
-                    gpu_id = (row.get(gpu_col) or "").strip()
-                    if gpu_id:
-                        per_sample_gpus.setdefault(bucket, set()).add(gpu_id)
-                        gpu_keys.add(gpu_id)
-    except (OSError, csv.Error):
-        return None
+                    saw_gpu_col = True
+
+                for row in reader:
+                    ts_raw = (row.get(timestamp_col) or "").strip()
+                    pw_raw = (row.get(power_col) or "").strip()
+                    ts = _parse_timestamp(ts_raw)
+                    pw = _parse_power(pw_raw)
+                    if ts is None or pw is None:
+                        continue
+                    if ts < start_unix or ts > end_unix:
+                        continue
+                    bucket = round(ts, 3)
+                    per_sample_total[bucket] = per_sample_total.get(bucket, 0.0) + pw
+                    per_sample_row_count[bucket] = per_sample_row_count.get(bucket, 0) + 1
+                    if gpu_col:
+                        gpu_id = (row.get(gpu_col) or "").strip()
+                        if gpu_id:
+                            ns_id = f"{path.stem}:{gpu_id}" if namespace else gpu_id
+                            per_sample_gpus.setdefault(bucket, set()).add(ns_id)
+                            gpu_keys.add(ns_id)
+        except (OSError, csv.Error):
+            continue
 
     if not per_sample_total:
         return None
 
     # Per-sample divisor and overall num_gpus.
-    # - If a GPU column was detected, trust distinct GPU IDs (correct for any
-    #   sampling pattern, including hot-swap or partial visibility).
-    # - Otherwise, infer from row count (one row per GPU per sample).
-    if gpu_col and gpu_keys:
+    # - If any path exposed a GPU column, trust distinct (namespaced) GPU IDs.
+    # - Otherwise, infer from row count (one row per GPU per sample, summed
+    #   across all paths' rows that fell into the same timestamp bucket).
+    if saw_gpu_col and gpu_keys:
         num_gpus = len(gpu_keys)
         per_sample_mean_per_gpu = [
             total / max(len(per_sample_gpus.get(ts, ())), 1)
@@ -194,7 +213,16 @@ def _load_bench_window(
     bench_result_path: Path,
 ) -> tuple[float, float, float, int, int] | None:
     """Read (start_unix, end_unix, duration_s, total_output_tokens, total_input_tokens)
-    from the raw bench JSON. Returns None if any required field is missing.
+    from the raw bench JSON. Returns None if a window cannot be resolved.
+
+    Window resolution order, tried in turn:
+      1. benchmark_start_time_unix + benchmark_end_time_unix (our benchmark_serving.py
+         writes both — single-node, brackets the actual load window exactly).
+      2. date + duration (srt-slurm sa-bench writes "YYYYMMDD-HHMMSS" UTC as the
+         result write time — multinode; treat as bench end and subtract duration
+         for start. Overshoots by post-bench JSON serialization, typically <5s).
+      3. file mtime + duration (last resort if `date` is absent or unparseable —
+         same end-of-bench proxy as #2 via the result file's mtime).
 
     total_input_tokens defaults to 0 if absent (older bench JSONs may not have it);
     this only degrades joules_per_total_token to equal joules_per_output_token in
@@ -204,18 +232,52 @@ def _load_bench_window(
         bench = json.loads(bench_result_path.read_text(encoding="utf-8"))
     except (OSError, json.JSONDecodeError):
         return None
-    start = bench.get("benchmark_start_time_unix")
-    end = bench.get("benchmark_end_time_unix")
     duration = bench.get("duration")
     total_output = bench.get("total_output_tokens")
     total_input = bench.get("total_input_tokens", 0)
-    if not all(isinstance(v, (int, float)) for v in (start, end, duration)):
+    if not isinstance(duration, (int, float)):
         return None
     if not isinstance(total_output, int) or total_output <= 0:
         return None
     if not isinstance(total_input, int) or total_input < 0:
         total_input = 0
-    return float(start), float(end), float(duration), int(total_output), int(total_input)
+
+    # Tier 1: explicit Unix timestamps (single-node bench_serving.py).
+    start = bench.get("benchmark_start_time_unix")
+    end = bench.get("benchmark_end_time_unix")
+    if isinstance(start, (int, float)) and isinstance(end, (int, float)):
+        return float(start), float(end), float(duration), int(total_output), int(total_input)
+
+    # Tier 2: parse `date` field (srt-slurm sa-bench multinode). On observed
+    # runs the string matches file mtime to the second, confirming it's the
+    # JSON write time.
+    date_str = bench.get("date")
+    if isinstance(date_str, str):
+        try:
+            end_dt = datetime.strptime(date_str, "%Y%m%d-%H%M%S").replace(tzinfo=timezone.utc)
+            end_unix = end_dt.timestamp()
+            return (
+                float(end_unix - duration),
+                float(end_unix),
+                float(duration),
+                int(total_output),
+                int(total_input),
+            )
+        except ValueError:
+            pass
+
+    # Tier 3: file mtime as last-resort bench-end proxy.
+    try:
+        end_unix = bench_result_path.stat().st_mtime
+    except OSError:
+        return None
+    return (
+        float(end_unix - duration),
+        float(end_unix),
+        float(duration),
+        int(total_output),
+        int(total_input),
+    )
 
 
 def patch_agg_result(
@@ -234,7 +296,7 @@ def patch_agg_result(
     tmp_path.replace(agg_path)
 
 
-def run(csv_path: Path, bench_result: Path, agg_result: Path) -> int:
+def run(csv_path: Path | Iterable[Path], bench_result: Path, agg_result: Path) -> int:
     window = _load_bench_window(bench_result)
     if window is None:
         print(
@@ -244,10 +306,12 @@ def run(csv_path: Path, bench_result: Path, agg_result: Path) -> int:
         return 0
     start, end, duration, total_output, total_input = window
 
-    result = aggregate_power(csv_path, start, end)
+    paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path)
+    result = aggregate_power(paths, start, end)
     if result is None:
+        label = str(paths[0]) if len(paths) == 1 else f"{len(paths)} CSVs"
         print(
-            f"[aggregate_power] No usable power samples in {csv_path} for "
+            f"[aggregate_power] No usable power samples in {label} for "
             f"window [{start}, {end}] — skipping",
             file=sys.stderr,
         )
@@ -291,11 +355,20 @@ def run(csv_path: Path, bench_result: Path, agg_result: Path) -> int:
 
 def main() -> int:
     parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
-    parser.add_argument(
+    source = parser.add_mutually_exclusive_group()
+    source.add_argument(
         "--csv",
         type=Path,
-        default=Path("/workspace/gpu_metrics.csv"),
-        help="Path to gpu_metrics.csv from start_gpu_monitor (default: /workspace/gpu_metrics.csv)",
+        default=None,
+        help="Single gpu_metrics.csv from start_gpu_monitor (single-node). "
+        "Falls back to /workspace/gpu_metrics.csv when neither --csv nor --csv-glob is set.",
+    )
+    source.add_argument(
+        "--csv-glob",
+        type=str,
+        default=None,
+        help="Shell glob expanding to per-node perf_samples_*.csv files (multinode, "
+        "written by srt-slurm's perfmon). GPU indices are namespaced by source CSV stem.",
     )
     parser.add_argument(
         "--bench-result",
@@ -310,7 +383,17 @@ def main() -> int:
         help="Path to the agg_<run>.json output of process_result.py (will be patched in place)",
     )
     args = parser.parse_args()
-    return run(args.csv, args.bench_result, args.agg_result)
+
+    if args.csv_glob:
+        paths = sorted(Path(p) for p in glob_module.glob(args.csv_glob))
+        if not paths:
+            print(
+                f"[aggregate_power] No CSVs matched glob {args.csv_glob!r} — skipping",
+                file=sys.stderr,
+            )
+            return 0
+        return run(paths, args.bench_result, args.agg_result)
+    return run(args.csv or Path("/workspace/gpu_metrics.csv"), args.bench_result, args.agg_result)
 
 
 if __name__ == "__main__":
diff --git a/utils/process_result.py b/utils/process_result.py
index 5fb059473..0510fe023 100644
--- a/utils/process_result.py
+++ b/utils/process_result.py
@@ -139,20 +139,41 @@ def get_required_env_vars(required_vars):
 
 # Best-effort: patch measured power into the agg JSON. Never fails the run.
 try:
+    import glob as _glob_module
     from aggregate_power import run as _aggregate_power_run
 
-    _csv_candidates = [
-        os.environ.get('GPU_METRICS_CSV'),
-        'gpu_metrics.csv',
-        '/workspace/gpu_metrics.csv',
-    ]
-    _csv_path = next(
-        (Path(p) for p in _csv_candidates if p and Path(p).is_file()),
-        None,
-    )
-    if _csv_path is not None:
+    # Multinode path: srt-slurm launchers set GPU_METRICS_CSV_GLOB after the job
+    # to a shell glob expanding to one perf_samples_<node>.csv per worker node.
+    # Takes precedence over the single-CSV fallback — if the launcher set the
+    # glob, the run was multinode and there is no single-CSV fallback to make.
+    _csv_arg = None
+    _glob_pattern = os.environ.get('GPU_METRICS_CSV_GLOB')
+    if _glob_pattern:
+        _matched = sorted(Path(p) for p in _glob_module.glob(_glob_pattern))
+        if _matched:
+            _csv_arg = _matched
+        else:
+            print(
+                f'[process_result] GPU_METRICS_CSV_GLOB={_glob_pattern!r} matched no files',
+                file=sys.stderr,
+            )
+
+    if _csv_arg is None:
+        # Single-node path: gpu_metrics.csv written by start_gpu_monitor in the
+        # bench container.
+        _csv_candidates = [
+            os.environ.get('GPU_METRICS_CSV'),
+            'gpu_metrics.csv',
+            '/workspace/gpu_metrics.csv',
+        ]
+        _csv_arg = next(
+            (Path(p) for p in _csv_candidates if p and Path(p).is_file()),
+            None,
+        )
+
+    if _csv_arg is not None:
         _aggregate_power_run(
-            csv_path=_csv_path,
+            csv_path=_csv_arg,
             bench_result=Path(f'{result_filename}.json'),
             agg_result=agg_path,
         )
diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py
index bf81ee7b1..b6f040ce8 100644
--- a/utils/test_aggregate_power.py
+++ b/utils/test_aggregate_power.py
@@ -15,7 +15,7 @@
 
 import json
 import sys
-from datetime import datetime
+from datetime import datetime, timezone
 from pathlib import Path
 
 import pytest
@@ -445,3 +445,237 @@ def test_patch_agg_result_is_atomic_via_tempfile(tmp_path: Path):
     assert data["joules_per_total_token"] == 0.5
     # No .tmp leftover.
     assert not (tmp_path / "agg.json.tmp").exists()
+
+
+# --------------------------------------------------------------------------- #
+# Multi-node CSV aggregation
+# --------------------------------------------------------------------------- #
+
+
+def test_aggregate_power_multi_node_namespaces_local_gpu_indices(tmp_path: Path):
+    """Two per-node CSVs each report local GPU indices 0..3.
+
+    Without per-source namespacing the union of gpu_keys would collapse to 4
+    instead of 8 — the bug this whole multinode change exists to prevent."""
+    base = 1_700_000_000.0
+    node1 = tmp_path / "perf_samples_node1.csv"
+    node2 = tmp_path / "perf_samples_node2.csv"
+    _write_nvidia_csv(node1, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(4)])
+    _write_nvidia_csv(node2, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(4)])
+
+    result = aggregate_power([node1, node2], base, base + 10)
+    assert result is not None
+    avg_power, num_gpus = result
+    assert avg_power == pytest.approx(500.0)
+    assert num_gpus == 8
+
+
+def test_aggregate_power_multi_node_with_sub_second_clock_drift(tmp_path: Path):
+    """Per-node polls drift sub-second even on NTP-synced clusters.
+
+    Node1 polls at base+s, node2 at base+s+0.3 — rows land in different ms
+    buckets. Each bucket is then a single-node 4-GPU slice averaging to 500W,
+    and the mean across all buckets is the cluster per-GPU mean."""
+    base = 1_700_000_000.0
+    node1 = tmp_path / "perf_samples_node1.csv"
+    node2 = tmp_path / "perf_samples_node2.csv"
+    _write_nvidia_csv(node1, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(4)])
+    _write_nvidia_csv(node2, [(base + s + 0.3, gpu, 500.0) for s in range(3) for gpu in range(4)])
+
+    result = aggregate_power([node1, node2], base, base + 10)
+    assert result is not None
+    avg_power, num_gpus = result
+    assert avg_power == pytest.approx(500.0)
+    assert num_gpus == 8
+
+
+def test_aggregate_power_multi_node_asymmetric_prefill_decode_power(tmp_path: Path):
+    """Disagg topologies draw different per-GPU power on prefill vs decode nodes.
+
+    4 prefill GPUs at 600W + 4 decode GPUs at 400W: cluster mean is the
+    weighted average across all 8 GPUs = (4*600 + 4*400)/8 = 500W."""
+    base = 1_700_000_000.0
+    prefill = tmp_path / "perf_samples_prefill0.csv"
+    decode = tmp_path / "perf_samples_decode0.csv"
+    _write_nvidia_csv(prefill, [(base + s, gpu, 600.0) for s in range(3) for gpu in range(4)])
+    _write_nvidia_csv(decode, [(base + s, gpu, 400.0) for s in range(3) for gpu in range(4)])
+
+    result = aggregate_power([prefill, decode], base, base + 10)
+    assert result is not None
+    avg_power, num_gpus = result
+    assert avg_power == pytest.approx(500.0)
+    assert num_gpus == 8
+
+
+def test_aggregate_power_multi_node_skips_missing_csv_silently(tmp_path: Path):
+    """If a node failed to start perfmon, its CSV will be absent.
+
+    Aggregating over the remaining nodes is preferable to returning None —
+    losing one node's power data should not zero out the whole metric."""
+    base = 1_700_000_000.0
+    present = tmp_path / "perf_samples_node1.csv"
+    missing = tmp_path / "perf_samples_node2.csv"  # never written
+    _write_nvidia_csv(present, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(4)])
+
+    result = aggregate_power([present, missing], base, base + 10)
+    assert result is not None
+    avg_power, num_gpus = result
+    assert avg_power == pytest.approx(500.0)
+    assert num_gpus == 4  # only the node that emitted data
+
+
+def test_aggregate_power_single_path_in_list_matches_bare_path(tmp_path: Path):
+    """Backward compat: aggregate_power([csv], ...) == aggregate_power(csv, ...).
+
+    Single-source behavior must not change when the caller wraps the path in a
+    list — otherwise process_result.py-style callers that defensively normalize
+    to a list would see different num_gpus values than legacy bare-path calls."""
+    base = 1_700_000_000.0
+    csv = tmp_path / "gpu_metrics.csv"
+    _write_nvidia_csv(csv, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(8)])
+
+    bare = aggregate_power(csv, base, base + 10)
+    listed = aggregate_power([csv], base, base + 10)
+    assert bare == listed
+    assert bare == (pytest.approx(500.0), 8)
+
+
+def test_aggregate_power_accepts_iterable_not_just_list(tmp_path: Path):
+    """Signature is Iterable[Path] — generators (e.g. Path.glob()) must work."""
+    base = 1_700_000_000.0
+    node1 = tmp_path / "perf_samples_node1.csv"
+    node2 = tmp_path / "perf_samples_node2.csv"
+    _write_nvidia_csv(node1, [(base + s, gpu, 500.0) for s in range(2) for gpu in range(4)])
+    _write_nvidia_csv(node2, [(base + s, gpu, 500.0) for s in range(2) for gpu in range(4)])
+
+    result = aggregate_power(tmp_path.glob("perf_samples_*.csv"), base, base + 10)
+    assert result is not None
+    _, num_gpus = result
+    assert num_gpus == 8
+
+
+def test_run_multi_node_e2e_computes_joules_from_total_gpus(tmp_path: Path):
+    """End-to-end multinode: run() with a list of CSVs patches the agg JSON.
+
+    8 GPUs total at 500W for 10s → 40_000 J → 2.0 J/output_token for 20_000 tokens."""
+    base = 1_700_000_000.0
+    node1 = tmp_path / "perf_samples_node1.csv"
+    node2 = tmp_path / "perf_samples_node2.csv"
+    _write_nvidia_csv(node1, [(base + 1 + s, gpu, 500.0) for s in range(2) for gpu in range(4)])
+    _write_nvidia_csv(node2, [(base + 1 + s, gpu, 500.0) for s in range(2) for gpu in range(4)])
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(bench, start=base, end=base + 10, duration=10.0, total_output=20_000)
+    agg.write_text(json.dumps({"hw": "gb300", "conc": 8192}), encoding="utf-8")
+
+    exit_code = run([node1, node2], bench, agg)
+    assert exit_code == 0
+
+    patched = json.loads(agg.read_text())
+    assert patched["avg_power_w"] == pytest.approx(500.0)
+    assert patched["joules_per_output_token"] == pytest.approx(2.0)
+
+
+def test_run_multi_node_skips_when_all_csvs_missing(tmp_path: Path):
+    """Entire monitoring failure (all per-node CSVs absent) skips cleanly without patching."""
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(bench, start=0.0, end=10.0, duration=10.0, total_output=1000)
+    agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
+
+    exit_code = run([tmp_path / "absent1.csv", tmp_path / "absent2.csv"], bench, agg)
+    assert exit_code == 0
+
+    patched = json.loads(agg.read_text())
+    assert "avg_power_w" not in patched
+
+
+# --------------------------------------------------------------------------- #
+# _load_bench_window fallbacks for srt-slurm multinode result JSONs
+#
+# srt-slurm's sa-bench result writer emits `date` + `duration` but NOT the
+# benchmark_*_time_unix fields our single-node benchmark_serving.py adds.
+# Without a fallback, multinode runs would always hit "No bench window in
+# {bench_result}" and silently skip power aggregation end-to-end.
+# --------------------------------------------------------------------------- #
+
+
+def test_run_uses_date_field_when_unix_timestamps_absent(tmp_path: Path):
+    """Tier 2: parse `date` ("YYYYMMDD-HHMMSS" UTC) + `duration` for the window."""
+    # End of bench at a known UTC instant; CSV samples land in [end-10, end].
+    end_unix = datetime(2026, 5, 20, 3, 10, 29, tzinfo=timezone.utc).timestamp()
+    csv = tmp_path / "perf_samples_node0.csv"
+    _write_nvidia_csv(csv, [(end_unix - 1 - s, gpu, 500.0) for s in range(3) for gpu in range(4)])
+
+    bench = tmp_path / "bench.json"
+    bench.write_text(
+        json.dumps(
+            {
+                "date": "20260520-031029",
+                "duration": 10.0,
+                "total_output_tokens": 1000,
+                "total_input_tokens": 8000,
+            }
+        ),
+        encoding="utf-8",
+    )
+    agg = tmp_path / "agg.json"
+    agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
+
+    assert run([csv], bench, agg) == 0
+    patched = json.loads(agg.read_text())
+    assert patched["avg_power_w"] == pytest.approx(500.0)
+    # 4 GPUs × 500W × 10s = 20_000 J / 1000 output tokens = 20.0 J/output_token.
+    assert patched["joules_per_output_token"] == pytest.approx(20.0)
+    # 20_000 J / (1000 + 8000) total tokens ≈ 2.222 J/total_token.
+    assert patched["joules_per_total_token"] == pytest.approx(20_000 / 9_000)
+
+
+def test_run_uses_mtime_when_date_unparseable(tmp_path: Path):
+    """Tier 3a: malformed `date` falls through to file mtime as bench-end proxy."""
+    csv = tmp_path / "perf_samples_node0.csv"
+    bench = tmp_path / "bench.json"
+    bench.write_text(
+        json.dumps({"date": "not-a-date", "duration": 10.0, "total_output_tokens": 1000}),
+        encoding="utf-8",
+    )
+    # CSV samples bracket bench file's mtime so they fall inside the derived window.
+    end_unix = bench.stat().st_mtime
+    _write_nvidia_csv(csv, [(end_unix - 1 - s, gpu, 500.0) for s in range(3) for gpu in range(4)])
+
+    agg = tmp_path / "agg.json"
+    agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
+    assert run([csv], bench, agg) == 0
+    patched = json.loads(agg.read_text())
+    assert patched["avg_power_w"] == pytest.approx(500.0)
+
+
+def test_run_uses_mtime_when_no_date_field(tmp_path: Path):
+    """Tier 3b: bench JSON has only `duration` → file mtime is end-of-bench."""
+    csv = tmp_path / "perf_samples_node0.csv"
+    bench = tmp_path / "bench.json"
+    bench.write_text(
+        json.dumps({"duration": 10.0, "total_output_tokens": 1000}),
+        encoding="utf-8",
+    )
+    end_unix = bench.stat().st_mtime
+    _write_nvidia_csv(csv, [(end_unix - 1 - s, gpu, 500.0) for s in range(3) for gpu in range(4)])
+
+    agg = tmp_path / "agg.json"
+    agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
+    assert run([csv], bench, agg) == 0
+    patched = json.loads(agg.read_text())
+    assert patched["avg_power_w"] == pytest.approx(500.0)
+
+
+def test_run_skips_when_duration_missing(tmp_path: Path):
+    """No tier can resolve a window without `duration` — skip cleanly."""
+    csv = tmp_path / "perf_samples_node0.csv"
+    _write_nvidia_csv(csv, [(1_700_000_000.0, 0, 400.0)])
+    bench = tmp_path / "bench.json"
+    bench.write_text(json.dumps({"total_output_tokens": 1000}), encoding="utf-8")
+    agg = tmp_path / "agg.json"
+    agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
+
+    assert run([csv], bench, agg) == 0
+    assert "avg_power_w" not in json.loads(agg.read_text())
diff --git a/utils/test_process_result.py b/utils/test_process_result.py
index 4037689ea..61d3b45fc 100644
--- a/utils/test_process_result.py
+++ b/utils/test_process_result.py
@@ -649,3 +649,108 @@ def test_missing_bench_timestamps_does_not_patch(self, tmp_path, single_node_env
         patched = json.loads(agg_path.read_text())
         assert "avg_power_w" not in patched
         assert "joules_per_output_token" not in patched
+
+    def test_multinode_csv_glob_aggregates_across_per_node_csvs(self, tmp_path, single_node_env_vars):
+        """Multinode wiring: srt-slurm launchers set GPU_METRICS_CSV_GLOB to a
+        shell glob expanding to one perf_samples_<node>.csv per worker node.
+        process_result.py must expand it and hand the list to the aggregator,
+        which namespaces local GPU indices per source so they don't collide.
+
+        Without this bridge the launcher would set the env var, process_result.py
+        would ignore it (fall back to a non-existent /workspace/gpu_metrics.csv),
+        and the chart would silently show no power data — the failure mode that
+        motivated catching this in the contract check."""
+        start, end = 1_700_000_100.0, 1_700_000_160.0  # 60s bench window
+        # Two per-node CSVs at the same local indices 0-3. Without per-source
+        # namespacing the union would collapse to 4 GPUs instead of 8.
+        self._write_nvidia_csv(
+            tmp_path / "perf_samples_node1.csv", start, end, watts_per_gpu=600.0, num_gpus=4
+        )
+        self._write_nvidia_csv(
+            tmp_path / "perf_samples_node2.csv", start, end, watts_per_gpu=600.0, num_gpus=4
+        )
+
+        benchmark_result = {
+            "model_id": "test-model",
+            "max_concurrency": 64,
+            "total_token_throughput": 1000.0,
+            "output_throughput": 500.0,
+            "benchmark_start_time_unix": start,
+            "benchmark_end_time_unix": end,
+            "duration": 60.0,
+            "total_output_tokens": 30_000,
+        }
+        env = {
+            **single_node_env_vars,
+            "GPU_METRICS_CSV_GLOB": str(tmp_path / "perf_samples_*.csv"),
+        }
+
+        result = run_script(tmp_path, env, benchmark_result)
+        assert result.returncode == 0, f"Script failed: {result.stderr}"
+
+        agg_path = tmp_path / "agg_benchmark_result.json"
+        patched = json.loads(agg_path.read_text())
+        # 2 nodes × 4 GPUs = 8 total. Per-GPU mean stays at 600W.
+        assert patched["avg_power_w"] == pytest.approx(600.0, abs=0.5)
+        # 600W × 8 GPUs × 60s / 30_000 tokens = 9.6 J/tok.
+        # If namespacing failed we'd see ~4.8 (only 4 GPUs counted).
+        assert patched["joules_per_output_token"] == pytest.approx(9.6, abs=0.05)
+
+    def test_multinode_csv_glob_takes_precedence_over_single_csv(self, tmp_path, single_node_env_vars):
+        """If both GLOB and single CSV are set, the glob wins.
+
+        Reflects the ownership split: the multinode launcher sets the glob
+        after the job, while the single CSV env var is only meaningful for
+        single-node runs. If a stale single-CSV value leaks through (e.g. a
+        runner with persistent env), the glob should still take precedence."""
+        start, end = 1_700_000_100.0, 1_700_000_160.0
+        glob_csv = tmp_path / "perf_samples_node1.csv"
+        stale_csv = tmp_path / "stale_single.csv"
+        self._write_nvidia_csv(glob_csv, start, end, watts_per_gpu=600.0, num_gpus=4)
+        self._write_nvidia_csv(stale_csv, start, end, watts_per_gpu=100.0, num_gpus=1)
+
+        benchmark_result = {
+            "model_id": "test-model",
+            "max_concurrency": 64,
+            "total_token_throughput": 1000.0,
+            "output_throughput": 500.0,
+            "benchmark_start_time_unix": start,
+            "benchmark_end_time_unix": end,
+            "duration": 60.0,
+            "total_output_tokens": 30_000,
+        }
+        env = {
+            **single_node_env_vars,
+            "GPU_METRICS_CSV_GLOB": str(tmp_path / "perf_samples_*.csv"),
+            "GPU_METRICS_CSV": str(stale_csv),
+        }
+
+        result = run_script(tmp_path, env, benchmark_result)
+        assert result.returncode == 0, f"Script failed: {result.stderr}"
+
+        agg_path = tmp_path / "agg_benchmark_result.json"
+        patched = json.loads(agg_path.read_text())
+        # Glob respected → 600W (4 GPUs). Stale fallback would give 100W (1 GPU).
+        assert patched["avg_power_w"] == pytest.approx(600.0, abs=0.5)
+
+    def test_multinode_csv_glob_empty_match_falls_through_silently(self, tmp_path, single_node_env_vars):
+        """If GPU_METRICS_CSV_GLOB is set but matches no files (perfmon failed
+        to start on any node), process_result.py still succeeds and writes the
+        agg JSON without power fields. The run must not block on telemetry."""
+        benchmark_result = {
+            "model_id": "test-model",
+            "max_concurrency": 64,
+            "total_token_throughput": 1000.0,
+            "output_throughput": 500.0,
+        }
+        env = {
+            **single_node_env_vars,
+            "GPU_METRICS_CSV_GLOB": str(tmp_path / "perf_samples_*.csv"),
+        }
+
+        result = run_script(tmp_path, env, benchmark_result)
+        assert result.returncode == 0, f"Script failed: {result.stderr}"
+
+        agg_path = tmp_path / "agg_benchmark_result.json"
+        patched = json.loads(agg_path.read_text())
+        assert "avg_power_w" not in patched

From a9339df821d8865f57b737be84f34f7ab768faea Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Wed, 27 May 2026 12:27:53 -0700
Subject: [PATCH 02/14] chore(perf-changelog): trigger multinode sweep for
 measured-power aggregation

Appends entry for dsv4-fp4-gb300-dynamo-sglang so run-sweep.yml fires when
the sweep-enabled label is added to PR #1574. The sweep produces the first
multinode agg JSONs with avg_power_w + joules_per_*_token, validating the
per-source GPU-id namespacing and GPU_METRICS_CSV_GLOB env-var bridge
end-to-end on real GB300 hardware (gb300-cw cluster).
---
 perf-changelog.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index b7182a39c..01c7d2e81 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3192,3 +3192,9 @@
     - "Add GLM-5-FP8 models.yaml flags, setup_deps.sh (aiter gluon + transformers glm_moe_dsa), GLM-5 env tuning in env.sh"
     - "Add multinode launch script glm5_fp8_mi355x_sglang-disagg.sh; server.sh sources setup_deps.sh"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1572
+
+- config-keys:
+    - dsv4-fp4-gb300-dynamo-sglang
+  description:
+    - "Smoke run validating multinode measured-power aggregation (PR #1574). No config change; entry exists to trigger a sweep that produces the first multinode agg JSON with avg_power_w + joules_per_*_token populated from per-node srt-slurm perfmon CSVs. Validates per-source GPU-id namespacing in aggregate_power.py (without it, 14 nodes × 4 GPUs would report num_gpus=4 instead of 56) and the GPU_METRICS_CSV_GLOB env var bridge in process_result.py. Only the gb300-cw runner has the perfmon launcher changes; any gb300-nv runs in the sweep will succeed normally without power fields, which the dashboard handles gracefully (chart gates on field presence)."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574

From eb2fa8e6c0f527f985013f001bab01005d646c22 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Thu, 28 May 2026 10:39:52 -0700
Subject: [PATCH 03/14] fix(launcher): recurse subdirectories when injecting
 monitoring: into recipes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous glob `$SRT_RECIPE_DST/*.yaml` only matched top-level YAMLs,
but recipes live under workload subdirectories (e.g. 8k1k/*.yaml). The
loop iterated zero times, no recipe got the monitoring: block, perfmon
never spawned, no perf_samples_*.csv were written, aggregate_power
silently skipped patching the agg JSON, and the dashboard had no power
data.

Sweep #26548110246 burned hours of GB300 time and shipped "success" with
zero power keys in every agg artifact — exactly the silent-failure chain
we should have caught earlier.

Fix: recurse via `find -type f -name '*.yaml'`. Add a loud WARNING when
zero recipes get the injection so future regressions surface immediately
instead of waiting for missing dashboard data to be noticed.
---
 runners/launch_gb300-cw.sh | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 9f3222dad..951c350de 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -116,13 +116,24 @@ cp -rT "$SRT_RECIPE_SRC" "$SRT_RECIPE_DST"
 # orchestrator's _start_perf_monitor short-circuits and no perf_samples_*.csv
 # are ever written — multinode measured-power aggregation would silently
 # skip. Idempotent: skips recipes that already declare `monitoring:`.
-for recipe in "$SRT_RECIPE_DST"/*.yaml; do
-    [ -f "$recipe" ] || continue
+#
+# CRITICAL: use `find` recursively, not a flat `*.yaml` glob. Recipes live
+# in $SRT_RECIPE_DST/<workload>/*.yaml (e.g. .../8k1k/*.yaml) — a flat glob
+# matches zero files, the loop runs zero times, no recipe gets monitoring,
+# and perfmon never spawns. PR #1574's first real sweep (#26548110246) hit
+# exactly this: completed "success" with no power data because the glob
+# matched nothing and the failure was silent end-to-end.
+INJECTED_COUNT=0
+while IFS= read -r recipe; do
     if ! grep -q '^monitoring:' "$recipe"; then
         printf '\nmonitoring:\n  enabled: true\n  sample_interval: 1.0\n' >> "$recipe"
         echo "[perfmon] enabled monitoring in recipe: $recipe"
+        INJECTED_COUNT=$((INJECTED_COUNT + 1))
     fi
-done
+done < <(find "$SRT_RECIPE_DST" -type f -name '*.yaml')
+if [ "$INJECTED_COUNT" -eq 0 ]; then
+    echo "[perfmon] WARNING: zero recipes received monitoring injection under $SRT_RECIPE_DST. Either every recipe already had it, or the directory layout changed — power data will be MISSING from this run." >&2
+fi
 
 echo "Installing srtctl..."
 # CRITICAL — uv install location.

From ddd71f3dddff9e51053972d7ed694fc27aecce46 Mon Sep 17 00:00:00 2001
From: Aryan <aryaman@semianalysis.com>
Date: Thu, 28 May 2026 10:40:48 -0700
Subject: [PATCH 04/14] ci: re-trigger sweep after srt-slurm fork rebase

Previous Run Sweep failed because SemiAnalysisAI/srt-slurm@feat/inferencex-perfmon
was based on PR #35's head from 2026-04-13, predating the default_bash_preamble
schema field that the launcher writes into srtslurm.yaml. srtctl rejected the
config with 'Unknown field' and the job never submitted.

Fork branch has now been rebased onto current NVIDIA/srt-slurm main (which
has default_bash_preamble), with PR #35 perfmon commits + Aryan's role-label
follow-up squashed/cherry-picked on top. Empty commit here re-fires the
Run Sweep workflow so we can validate end-to-end on real DSv4 FP4 GB300.

From 317049d9c0242219fed56ac651b413a274128000 Mon Sep 17 00:00:00 2001
From: Aryan <aryaman@semianalysis.com>
Date: Thu, 28 May 2026 10:44:34 -0700
Subject: [PATCH 05/14] chore(perf-changelog): re-trigger sweep after launcher
 recurse-glob fix

Prior sweep (#26548110246) on SHA 8d303414 completed green but produced
zero power data because of a flat-glob bug in the monitoring-injection
loop. Fix is on HEAD (6da2f1b6) but the workflow's path filter only
fires on perf-changelog.yaml edits, so this commit re-touches that file
to re-dispatch.
---
 perf-changelog.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 01c7d2e81..9fdae2fd6 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3197,4 +3197,5 @@
     - dsv4-fp4-gb300-dynamo-sglang
   description:
     - "Smoke run validating multinode measured-power aggregation (PR #1574). No config change; entry exists to trigger a sweep that produces the first multinode agg JSON with avg_power_w + joules_per_*_token populated from per-node srt-slurm perfmon CSVs. Validates per-source GPU-id namespacing in aggregate_power.py (without it, 14 nodes × 4 GPUs would report num_gpus=4 instead of 56) and the GPU_METRICS_CSV_GLOB env var bridge in process_result.py. Only the gb300-cw runner has the perfmon launcher changes; any gb300-nv runs in the sweep will succeed normally without power fields, which the dashboard handles gracefully (chart gates on field presence)."
+    - "Re-run after launcher recurse-glob fix (6da2f1b6) — prior sweep (#26548110246) completed green at the workflow level but produced 0 measured-power rows because the flat *.yaml glob in the monitoring-injection loop matched zero recipes (recipes live in 8k1k/ subdir). Fix uses `find -type f -name '*.yaml'`. Also re-pointed SemiAnalysisAI/srt-slurm@feat/inferencex-perfmon onto current NVIDIA/srt-slurm main so the launcher's `default_bash_preamble:` srtslurm.yaml field is accepted by srtctl schema."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574

From 06558b9c82e3dfa78c230c79e61bddda3b8b1d18 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Thu, 28 May 2026 11:07:15 -0700
Subject: [PATCH 06/14] feat(power): per-worker power + per-stage J/token for
 disagg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends multinode measured-power aggregation with per-worker breakdown
and per-stage joules attribution. The cluster-wide avg_power_w +
joules_per_*_token fields stay backward-compatible; new disagg-only
fields layer on top.

New agg JSON fields:
  - power_by_worker: list of {role, worker_idx, hosts, num_gpus,
    avg_power_w} parsed from srt-slurm perfmon CSV filenames
    (`perf_samples_<role>_w<idx>_<host>.csv`). Roles: prefill, decode,
    agg, frontend. Workers spanning N nodes collapse into one entry
    whose num_gpus is the cross-node sum.
  - joules_per_input_token: prefill_energy / total_input_tokens
    (disagg only — meaningless without a prefill stage).

Per-stage attribution (disagg only) replaces cluster-wide ratios for
existing fields:
  - joules_per_output_token = decode_energy / output_tokens
  - joules_per_total_token = (prefill + decode) / all_tokens
Frontend-labeled CSVs are excluded from per-stage energy but still
listed in power_by_worker for observability. Falls back to cluster-wide
math if only one stage's CSVs survived.

process_result.py now passes DISAGG through to aggregate_power.run().
launch_gb300-cw.sh's recipe-injection loop reports found/injected
counts so a zero-recipes-found bug is distinguishable from the
benign all-already-monitored case.

Tests: 88/88 pass (68 existing + 20 new). New coverage: label parsing
across host formats, multi-node-per-worker collapse, per-stage J/token
math, frontend exclusion, single-stage fallback, zero-input degenerate,
end-to-end disagg wiring through process_result.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 runners/launch_gb300-cw.sh    |  13 +-
 utils/aggregate_power.py      | 406 +++++++++++++++++++++++++++-------
 utils/process_result.py       |  19 +-
 utils/test_aggregate_power.py | 366 ++++++++++++++++++++++++++++++
 utils/test_process_result.py  |  91 ++++++++
 5 files changed, 803 insertions(+), 92 deletions(-)

diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh
index 951c350de..eddb17b29 100644
--- a/runners/launch_gb300-cw.sh
+++ b/runners/launch_gb300-cw.sh
@@ -123,16 +123,25 @@ cp -rT "$SRT_RECIPE_SRC" "$SRT_RECIPE_DST"
 # and perfmon never spawns. PR #1574's first real sweep (#26548110246) hit
 # exactly this: completed "success" with no power data because the glob
 # matched nothing and the failure was silent end-to-end.
+FOUND_COUNT=0
 INJECTED_COUNT=0
 while IFS= read -r recipe; do
+    FOUND_COUNT=$((FOUND_COUNT + 1))
     if ! grep -q '^monitoring:' "$recipe"; then
         printf '\nmonitoring:\n  enabled: true\n  sample_interval: 1.0\n' >> "$recipe"
         echo "[perfmon] enabled monitoring in recipe: $recipe"
         INJECTED_COUNT=$((INJECTED_COUNT + 1))
     fi
 done < <(find "$SRT_RECIPE_DST" -type f -name '*.yaml')
-if [ "$INJECTED_COUNT" -eq 0 ]; then
-    echo "[perfmon] WARNING: zero recipes received monitoring injection under $SRT_RECIPE_DST. Either every recipe already had it, or the directory layout changed — power data will be MISSING from this run." >&2
+# Distinguish "found 0 recipes" (real bug — directory wrong/empty) from "all
+# already had monitoring:" (benign — happens on reruns or if a recipe author
+# pre-declared the block). Only the former is a missing-power-data risk.
+if [ "$FOUND_COUNT" -eq 0 ]; then
+    echo "[perfmon] WARNING: zero recipe YAMLs found under $SRT_RECIPE_DST. The directory layout may have changed — power data will be MISSING from this run." >&2
+elif [ "$INJECTED_COUNT" -eq 0 ]; then
+    echo "[perfmon] all $FOUND_COUNT recipes already declared monitoring: — no injection needed."
+else
+    echo "[perfmon] injected monitoring: into $INJECTED_COUNT of $FOUND_COUNT recipes."
 fi
 
 echo "Installing srtctl..."
diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py
index ab6fcef3e..962c9167d 100644
--- a/utils/aggregate_power.py
+++ b/utils/aggregate_power.py
@@ -3,22 +3,38 @@
 Reads a GPU-metrics CSV produced by `start_gpu_monitor` (nvidia-smi or amd-smi)
 or by srt-slurm's per-node perfmon (multinode), filters samples to the benchmark
 load window using start/end Unix timestamps written by benchmark_serving.py, and
-patches three keys into the aggregated result JSON consumed by InferenceX-app's
-ETL:
+patches the aggregated result JSON with cluster-wide and per-worker power data
+consumed by InferenceX-app's ETL.
 
+Cluster-wide fields (always written when any power data exists):
   - avg_power_w:               mean per-GPU power draw (W) during the load window
-  - joules_per_output_token:   (avg_power_w * num_gpus * duration_s) / total_output_tokens
-  - joules_per_total_token:    same, divided by (input + output) tokens
+  - joules_per_output_token:   energy / total_output_tokens
+  - joules_per_total_token:    energy / (input + output) tokens
+
+For disaggregated multinode runs (DISAGG=true), the numerator for the J/token
+metrics shifts to a per-stage attribution: prefill workers' energy is divided
+by input tokens, decode workers' energy by output tokens. Per-stage power is
+where the meaningful efficiency signal lives — total-energy ratios mostly just
+re-scale the same number by different denominators.
+
+  - joules_per_input_token:    prefill_energy / total_input_tokens (disagg only)
+  - joules_per_output_token:   decode_energy / total_output_tokens   (overridden)
+  - joules_per_total_token:    (prefill_energy + decode_energy) / total_tokens (overridden)
+
+Per-worker breakdown (multinode only — single-node has no role concept):
+  - power_by_worker: list of {role, worker_idx, hosts[], num_gpus, avg_power_w}
+                     where role is "prefill", "decode", "agg", or "frontend".
+
+srt-slurm encodes the worker role and index in the perfmon CSV filename:
+`perf_samples_<role>_w<worker_idx>_<host>.csv` — see srt-slurm fork's
+benchmark_stage._start_perf_monitor. Filenames that don't match this pattern
+(e.g. single-node `gpu_metrics.csv`) fall back to a single cluster-wide bucket.
 
 Multinode: accepts multiple CSV paths (one per worker node). GPU indices are
 namespaced by source CSV stem to avoid the same-index collision across nodes —
 e.g. 8 nodes each reporting indices 0..3 would otherwise be miscounted as 4
 total GPUs instead of 32.
 
-The ETL (`packages/db/src/etl/benchmark-mapper.ts`) auto-captures any numeric
-field in the agg JSON into the `metrics` JSONB column, so no schema migration
-is required.
-
 Vendor schema detection is regex-based: any timestamp-like column + any column
 whose name contains "power" (excluding "limit"/"cap"/"max") is picked up.
 NVIDIA emits "power.draw [W]"; AMD's amd-smi varies by version; srt-slurm's
@@ -48,6 +64,14 @@
 _GPU_INDEX_COL_RE = re.compile(r"^(index|gpu|gpu_id|gpu_index|card|device)$", re.IGNORECASE)
 _NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?")
 
+# srt-slurm perfmon filename: perf_samples_<role>_w<worker_idx>_<host>.csv
+# Roles: prefill, decode, agg, frontend (see srt-slurm benchmark_stage._label).
+# Host may contain hyphens and digits; greedy `.+` is fine because the `_w<idx>_`
+# anchor is unambiguous.
+_PERFMON_LABEL_RE = re.compile(
+    r"^perf_samples_(?P<role>prefill|decode|agg|frontend)_w(?P<idx>\d+)_(?P<host>.+)$"
+)
+
 
 def _parse_timestamp(value: str) -> float | None:
     """Best-effort timestamp parse to Unix epoch seconds (local wall clock).
@@ -117,85 +141,90 @@ def _detect_columns(header: list[str]) -> tuple[str | None, str | None, str | No
     return timestamp_col, power_col, gpu_col
 
 
-def aggregate_power(
-    csv_path: Path | Iterable[Path],
-    start_unix: float,
-    end_unix: float,
-) -> tuple[float, int] | None:
-    """Return (per_gpu_avg_power_w, num_gpus) for samples in [start, end].
+def _parse_perfmon_label(path: Path) -> tuple[str, int, str] | None:
+    """Extract (role, worker_idx, host) from a srt-slurm perfmon CSV filename.
 
-    Accepts either a single Path (single-node case) or an iterable of Paths
-    (multinode case: one CSV per worker node, all written by srt-slurm's
-    perfmon). For multi-path inputs, GPU indices are namespaced by source
-    CSV stem so the distinct-id count reflects the true total — each node
-    independently reports indices 0..N, and without namespacing the union
-    would collapse to a single node's worth.
+    Returns None for filenames not matching the perfmon pattern (e.g.
+    single-node `gpu_metrics.csv`). Used to group node-level CSVs by the
+    worker(s) running on each node.
+    """
+    m = _PERFMON_LABEL_RE.match(path.stem)
+    if not m:
+        return None
+    return m.group("role"), int(m.group("idx")), m.group("host")
 
-    Returns None if no CSVs are usable, none have a detectable power column,
-    or no rows fall in the window across all paths.
+
+def _read_samples(
+    path: Path, start_unix: float, end_unix: float
+) -> tuple[list[tuple[float, float, str | None]], bool] | None:
+    """Read one CSV → list of (timestamp_bucket, power_w, gpu_id) in window.
+
+    Returns (rows, saw_gpu_col) on success, None if the file is unreadable /
+    missing the required columns. Empty rows list is valid (file readable but
+    no samples landed in the window).
     """
-    paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path)
-    if not paths or end_unix <= start_unix:
+    if not path.is_file() or path.stat().st_size == 0:
         return None
+    try:
+        with path.open("r", newline="", encoding="utf-8", errors="replace") as f:
+            reader = csv.DictReader(f, skipinitialspace=True)
+            header = [c.strip() for c in (reader.fieldnames or [])]
+            reader.fieldnames = header
+            timestamp_col, power_col, gpu_col = _detect_columns(header)
+            if not timestamp_col or not power_col:
+                return None
+            rows: list[tuple[float, float, str | None]] = []
+            for row in reader:
+                ts = _parse_timestamp((row.get(timestamp_col) or "").strip())
+                pw = _parse_power((row.get(power_col) or "").strip())
+                if ts is None or pw is None:
+                    continue
+                if ts < start_unix or ts > end_unix:
+                    continue
+                gpu_id = (row.get(gpu_col) or "").strip() if gpu_col else None
+                rows.append((round(ts, 3), pw, gpu_id or None))
+            return rows, gpu_col is not None
+    except (OSError, csv.Error):
+        return None
+
 
-    # Only namespace when there are multiple sources — keeps single-node
-    # gpu_keys identical to the pre-multinode behavior so existing callers
-    # see the same num_gpus values.
-    namespace = len(paths) > 1
+def _aggregate_rows(
+    sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]],
+    *,
+    namespace: bool,
+) -> tuple[float, int] | None:
+    """Merge rows across CSVs into (per_gpu_avg_power_w, num_gpus).
 
-    # Per-sample state accumulates across ALL paths. Bucketed by ms-rounded
-    # timestamp so nodes whose clocks drift sub-ms still end up in the same
-    # bucket (they reliably do — all sample on `time.sleep(interval)` against
-    # the same NTP-synced cluster clock).
+    `sources` is a list of (path, rows, saw_gpu_col) for the CSVs to roll up
+    together. Rows are bucketed by ms-rounded timestamp so nodes with sub-ms
+    clock drift land in the same bucket. GPU indices are namespaced by the
+    source path's stem when `namespace=True` (multi-source case) to keep
+    same-local-index across nodes from collapsing.
+    """
     per_sample_total: dict[float, float] = {}
     per_sample_row_count: dict[float, int] = {}
     per_sample_gpus: dict[float, set[str]] = {}
     gpu_keys: set[str] = set()
-    saw_gpu_col = False
-
-    for path in paths:
-        if not path.is_file() or path.stat().st_size == 0:
-            continue
-        try:
-            with path.open("r", newline="", encoding="utf-8", errors="replace") as f:
-                reader = csv.DictReader(f, skipinitialspace=True)
-                header = [c.strip() for c in (reader.fieldnames or [])]
-                reader.fieldnames = header
-                timestamp_col, power_col, gpu_col = _detect_columns(header)
-                if not timestamp_col or not power_col:
-                    continue
-                if gpu_col:
-                    saw_gpu_col = True
-
-                for row in reader:
-                    ts_raw = (row.get(timestamp_col) or "").strip()
-                    pw_raw = (row.get(power_col) or "").strip()
-                    ts = _parse_timestamp(ts_raw)
-                    pw = _parse_power(pw_raw)
-                    if ts is None or pw is None:
-                        continue
-                    if ts < start_unix or ts > end_unix:
-                        continue
-                    bucket = round(ts, 3)
-                    per_sample_total[bucket] = per_sample_total.get(bucket, 0.0) + pw
-                    per_sample_row_count[bucket] = per_sample_row_count.get(bucket, 0) + 1
-                    if gpu_col:
-                        gpu_id = (row.get(gpu_col) or "").strip()
-                        if gpu_id:
-                            ns_id = f"{path.stem}:{gpu_id}" if namespace else gpu_id
-                            per_sample_gpus.setdefault(bucket, set()).add(ns_id)
-                            gpu_keys.add(ns_id)
-        except (OSError, csv.Error):
-            continue
+    saw_gpu_col_any = False
+
+    for path, rows, saw_gpu_col in sources:
+        if saw_gpu_col:
+            saw_gpu_col_any = True
+        for bucket, pw, gpu_id in rows:
+            per_sample_total[bucket] = per_sample_total.get(bucket, 0.0) + pw
+            per_sample_row_count[bucket] = per_sample_row_count.get(bucket, 0) + 1
+            if gpu_id is not None:
+                ns_id = f"{path.stem}:{gpu_id}" if namespace else gpu_id
+                per_sample_gpus.setdefault(bucket, set()).add(ns_id)
+                gpu_keys.add(ns_id)
 
     if not per_sample_total:
         return None
 
-    # Per-sample divisor and overall num_gpus.
     # - If any path exposed a GPU column, trust distinct (namespaced) GPU IDs.
     # - Otherwise, infer from row count (one row per GPU per sample, summed
     #   across all paths' rows that fell into the same timestamp bucket).
-    if saw_gpu_col and gpu_keys:
+    if saw_gpu_col_any and gpu_keys:
         num_gpus = len(gpu_keys)
         per_sample_mean_per_gpu = [
             total / max(len(per_sample_gpus.get(ts, ())), 1)
@@ -209,6 +238,109 @@ def aggregate_power(
     return mean(per_sample_mean_per_gpu), num_gpus
 
 
+def aggregate_power(
+    csv_path: Path | Iterable[Path],
+    start_unix: float,
+    end_unix: float,
+) -> tuple[float, int] | None:
+    """Return (per_gpu_avg_power_w, num_gpus) for samples in [start, end].
+
+    Accepts either a single Path (single-node case) or an iterable of Paths
+    (multinode case: one CSV per worker node, all written by srt-slurm's
+    perfmon). For multi-path inputs, GPU indices are namespaced by source
+    CSV stem so the distinct-id count reflects the true total — each node
+    independently reports indices 0..N, and without namespacing the union
+    would collapse to a single node's worth.
+
+    Returns None if no CSVs are usable, none have a detectable power column,
+    or no rows fall in the window across all paths.
+    """
+    paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path)
+    if not paths or end_unix <= start_unix:
+        return None
+
+    sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]] = []
+    for path in paths:
+        read = _read_samples(path, start_unix, end_unix)
+        if read is None:
+            continue
+        rows, saw_gpu_col = read
+        sources.append((path, rows, saw_gpu_col))
+    if not sources:
+        return None
+
+    return _aggregate_rows(sources, namespace=len(paths) > 1)
+
+
+def aggregate_power_by_worker(
+    csv_paths: Iterable[Path],
+    start_unix: float,
+    end_unix: float,
+) -> list[dict] | None:
+    """Group CSVs by (role, worker_idx) and return per-worker power rollups.
+
+    Each entry: {role, worker_idx, hosts: sorted list, num_gpus, avg_power_w}.
+    Returns None if no CSVs have parseable filenames OR no labeled CSV yields
+    usable samples. Unlabeled CSVs in the input are silently skipped — they
+    can't be attributed to a worker.
+
+    Hosts are listed because a single worker can span multiple nodes (e.g.
+    a 16-GPU decode worker over 4 nodes, all labeled decode_w0_<host>).
+    Multiple node-CSVs sharing the same (role, worker_idx) collapse into one
+    worker entry whose num_gpus is the sum across nodes.
+    """
+    paths = list(csv_paths)
+    if not paths or end_unix <= start_unix:
+        return None
+
+    # Group paths by (role, worker_idx); discard unlabeled.
+    by_worker: dict[tuple[str, int], list[Path]] = {}
+    hosts_by_worker: dict[tuple[str, int], set[str]] = {}
+    for p in paths:
+        label = _parse_perfmon_label(p)
+        if label is None:
+            continue
+        role, worker_idx, host = label
+        key = (role, worker_idx)
+        by_worker.setdefault(key, []).append(p)
+        hosts_by_worker.setdefault(key, set()).add(host)
+    if not by_worker:
+        return None
+
+    out: list[dict] = []
+    for (role, worker_idx), worker_paths in by_worker.items():
+        sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]] = []
+        for path in worker_paths:
+            read = _read_samples(path, start_unix, end_unix)
+            if read is None:
+                continue
+            rows, saw_gpu_col = read
+            sources.append((path, rows, saw_gpu_col))
+        if not sources:
+            continue
+        # Namespace across paths within a worker too — a 16-GPU decode worker
+        # spans 4 nodes, each reporting local indices 0..3.
+        result = _aggregate_rows(sources, namespace=len(sources) > 1)
+        if result is None:
+            continue
+        avg_power_w, num_gpus = result
+        out.append(
+            {
+                "role": role,
+                "worker_idx": worker_idx,
+                "hosts": sorted(hosts_by_worker[(role, worker_idx)]),
+                "num_gpus": num_gpus,
+                "avg_power_w": round(avg_power_w, 3),
+            }
+        )
+    if not out:
+        return None
+    # Stable order: role (prefill < decode < agg < frontend), then worker_idx.
+    role_order = {"prefill": 0, "decode": 1, "agg": 2, "frontend": 3}
+    out.sort(key=lambda w: (role_order.get(w["role"], 99), w["worker_idx"]))
+    return out
+
+
 def _load_bench_window(
     bench_result_path: Path,
 ) -> tuple[float, float, float, int, int] | None:
@@ -285,18 +417,63 @@ def patch_agg_result(
     avg_power_w: float,
     joules_per_output_token: float,
     joules_per_total_token: float,
+    joules_per_input_token: float | None = None,
+    power_by_worker: list[dict] | None = None,
 ) -> None:
-    """Read the agg JSON, add the three power keys, and write it back atomically."""
+    """Read the agg JSON, add the power keys, and write it back atomically.
+
+    `joules_per_input_token` and `power_by_worker` are optional — omitted from
+    the JSON when None (kept that way so single-node and non-disagg multinode
+    agg JSONs don't gain meaningless null fields).
+    """
     data = json.loads(agg_path.read_text(encoding="utf-8"))
     data["avg_power_w"] = round(avg_power_w, 3)
     data["joules_per_output_token"] = round(joules_per_output_token, 6)
     data["joules_per_total_token"] = round(joules_per_total_token, 6)
+    if joules_per_input_token is not None:
+        data["joules_per_input_token"] = round(joules_per_input_token, 6)
+    if power_by_worker is not None:
+        data["power_by_worker"] = power_by_worker
     tmp_path = agg_path.with_suffix(agg_path.suffix + ".tmp")
     tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
     tmp_path.replace(agg_path)
 
 
-def run(csv_path: Path | Iterable[Path], bench_result: Path, agg_result: Path) -> int:
+def _disagg_stage_energies(
+    power_by_worker: list[dict], duration: float
+) -> tuple[float, float] | None:
+    """Sum per-worker energy for prefill vs decode workers (J).
+
+    Returns (prefill_energy_j, decode_energy_j) or None if either stage is
+    absent — without both stages we can't do per-stage attribution and the
+    caller should fall back to total-energy math.
+    """
+    prefill_e = 0.0
+    decode_e = 0.0
+    has_prefill = False
+    has_decode = False
+    for w in power_by_worker:
+        e = w["avg_power_w"] * w["num_gpus"] * duration
+        if w["role"] == "prefill":
+            prefill_e += e
+            has_prefill = True
+        elif w["role"] == "decode":
+            decode_e += e
+            has_decode = True
+        # "frontend" / "agg" / unknown roles deliberately excluded — they
+        # don't belong to either stage's per-token cost.
+    if not (has_prefill and has_decode):
+        return None
+    return prefill_e, decode_e
+
+
+def run(
+    csv_path: Path | Iterable[Path],
+    bench_result: Path,
+    agg_result: Path,
+    *,
+    disagg: bool = False,
+) -> int:
     window = _load_bench_window(bench_result)
     if window is None:
         print(
@@ -318,15 +495,51 @@ def run(csv_path: Path | Iterable[Path], bench_result: Path, agg_result: Path) -
         return 0
     avg_power_w, num_gpus = result
 
-    # Joules consumed by the system during the bench window, divided by either
-    # output tokens (for generation-cost metrics) or all tokens (for whole-
-    # workload efficiency).
+    # Per-worker rollup is best-effort: only emitted when CSV filenames carry
+    # the perfmon role/index encoding. The single-node `gpu_metrics.csv` name
+    # has no label, so aggregate_power_by_worker returns None and the field is omitted.
+    power_by_worker = aggregate_power_by_worker(paths, start, end)
+
+    # Cluster-wide energy baseline. Used as the fallback numerator when
+    # per-stage attribution isn't available.
     total_system_energy_j = avg_power_w * num_gpus * duration
-    joules_per_output_token = total_system_energy_j / total_output
     total_tokens = total_output + total_input
-    joules_per_total_token = (
-        total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token
-    )
+
+    joules_per_input_token: float | None = None
+
+    if disagg and power_by_worker is not None:
+        stage = _disagg_stage_energies(power_by_worker, duration)
+        if stage is not None:
+            prefill_energy_j, decode_energy_j = stage
+            # Per-stage attribution: prefill workers process input tokens,
+            # decode workers process output tokens. Strictly more accurate
+            # than total-energy ratios when prefill/decode have different
+            # per-GPU power profiles (typical: prefill is compute-bound and
+            # draws more than memory-bound decode).
+            joules_per_output_token = decode_energy_j / total_output
+            joules_per_input_token = (
+                prefill_energy_j / total_input if total_input > 0 else None
+            )
+            joules_per_total_token = (
+                (prefill_energy_j + decode_energy_j) / total_tokens
+                if total_tokens > 0
+                else joules_per_output_token
+            )
+        else:
+            # disagg=True, but the workers don't cover both prefill and decode (e.g.
+            # only one role's CSVs survived). Fall back to cluster math.
+            joules_per_output_token = total_system_energy_j / total_output
+            joules_per_total_token = (
+                total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token
+            )
+    else:
+        # Single-node or non-disagg multinode: keep the cluster-wide ratios
+        # backward-compatible with everything that consumed the pre-disagg
+        # schema.
+        joules_per_output_token = total_system_energy_j / total_output
+        joules_per_total_token = (
+            total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token
+        )
 
     if not agg_result.is_file():
         print(
@@ -337,18 +550,32 @@ def run(csv_path: Path | Iterable[Path], bench_result: Path, agg_result: Path) -
 
     try:
         patch_agg_result(
-            agg_result, avg_power_w, joules_per_output_token, joules_per_total_token
+            agg_result,
+            avg_power_w,
+            joules_per_output_token,
+            joules_per_total_token,
+            joules_per_input_token=joules_per_input_token,
+            power_by_worker=power_by_worker,
         )
     except (OSError, json.JSONDecodeError) as exc:
         print(f"[aggregate_power] Failed to patch {agg_result}: {exc}", file=sys.stderr)
         return 0
 
+    worker_summary = (
+        f"workers={len(power_by_worker)}" if power_by_worker else "workers=cluster-only"
+    )
+    jpit_summary = (
+        f"joules_per_input_token={joules_per_input_token:.4f} "
+        if joules_per_input_token is not None
+        else ""
+    )
     print(
         f"[aggregate_power] avg_power_w={avg_power_w:.2f} (per GPU, n={num_gpus}) "
         f"joules_per_output_token={joules_per_output_token:.4f} "
+        f"{jpit_summary}"
         f"joules_per_total_token={joules_per_total_token:.4f} "
         f"duration={duration:.1f}s output_tokens={total_output} input_tokens={total_input} "
-        f"-> {agg_result}"
+        f"{worker_summary} -> {agg_result}"
     )
     return 0
 
@@ -382,6 +609,14 @@ def main() -> int:
         required=True,
         help="Path to the agg_<run>.json output of process_result.py (will be patched in place)",
     )
+    parser.add_argument(
+        "--disagg",
+        action="store_true",
+        help="Treat as disaggregated inference: emit joules_per_input_token using "
+        "per-stage energy attribution (prefill workers' energy / input tokens, "
+        "decode workers' energy / output tokens). Requires CSV filenames to carry "
+        "the perfmon role/index encoding.",
+    )
     args = parser.parse_args()
 
     if args.csv_glob:
@@ -392,8 +627,13 @@ def main() -> int:
                 file=sys.stderr,
             )
             return 0
-        return run(paths, args.bench_result, args.agg_result)
-    return run(args.csv or Path("/workspace/gpu_metrics.csv"), args.bench_result, args.agg_result)
+        return run(paths, args.bench_result, args.agg_result, disagg=args.disagg)
+    return run(
+        args.csv or Path("/workspace/gpu_metrics.csv"),
+        args.bench_result,
+        args.agg_result,
+        disagg=args.disagg,
+    )
 
 
 if __name__ == "__main__":
diff --git a/utils/process_result.py b/utils/process_result.py
index 0510fe023..3413d5e77 100644
--- a/utils/process_result.py
+++ b/utils/process_result.py
@@ -142,23 +142,27 @@ def get_required_env_vars(required_vars):
     import glob as _glob_module
     from aggregate_power import run as _aggregate_power_run
 
-    # Multinode path: srt-slurm launchers set GPU_METRICS_CSV_GLOB after the job
-    # to a shell glob expanding to one perf_samples_<node>.csv per worker node.
-    # Takes precedence over the single-CSV fallback — if the launcher set the
-    # glob, the run was multinode and there is no single-CSV fallback to make.
+    # Two mutually-exclusive sources, decided up front. If GPU_METRICS_CSV_GLOB
+    # is set, the run is multinode (the launcher set it deliberately) and we
+    # MUST NOT fall back to single-CSV — a stale gpu_metrics.csv left over from
+    # a previous single-node run on the same runner pod would silently publish
+    # wrong power numbers for the multinode run.
     _csv_arg = None
     _glob_pattern = os.environ.get('GPU_METRICS_CSV_GLOB')
     if _glob_pattern:
+        # Multinode path: glob to per-node perf_samples_<role>_w<idx>_<host>.csv.
         _matched = sorted(Path(p) for p in _glob_module.glob(_glob_pattern))
         if _matched:
             _csv_arg = _matched
         else:
             print(
-                f'[process_result] GPU_METRICS_CSV_GLOB={_glob_pattern!r} matched no files',
+                f'[process_result] GPU_METRICS_CSV_GLOB={_glob_pattern!r} matched no files '
+                f'— skipping power aggregation (NOT falling back to single-CSV: the launcher '
+                f'set the glob, indicating a multinode run; any single-CSV present would be '
+                f'stale single-node data)',
                 file=sys.stderr,
             )
-
-    if _csv_arg is None:
+    else:
         # Single-node path: gpu_metrics.csv written by start_gpu_monitor in the
         # bench container.
         _csv_candidates = [
@@ -176,6 +180,7 @@ def get_required_env_vars(required_vars):
             csv_path=_csv_arg,
             bench_result=Path(f'{result_filename}.json'),
             agg_result=agg_path,
+            disagg=disagg,
         )
 except Exception as exc:  # noqa: BLE001 — never block on telemetry
     print(f'[process_result] power aggregation skipped: {exc}', file=sys.stderr)
diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py
index b6f040ce8..ed6ca69ab 100644
--- a/utils/test_aggregate_power.py
+++ b/utils/test_aggregate_power.py
@@ -24,9 +24,11 @@
 
 from aggregate_power import (  # noqa: E402
     _detect_columns,
+    _parse_perfmon_label,
     _parse_power,
     _parse_timestamp,
     aggregate_power,
+    aggregate_power_by_worker,
     patch_agg_result,
     run,
 )
@@ -679,3 +681,367 @@ def test_run_skips_when_duration_missing(tmp_path: Path):
 
     assert run([csv], bench, agg) == 0
     assert "avg_power_w" not in json.loads(agg.read_text())
+
+
+# --------------------------------------------------------------------------- #
+# Perfmon filename label parsing — drives per-worker grouping
+# --------------------------------------------------------------------------- #
+
+
+def test_parse_perfmon_label_prefill(tmp_path: Path):
+    role, idx, host = _parse_perfmon_label(tmp_path / "perf_samples_prefill_w0_node1.csv")
+    assert (role, idx, host) == ("prefill", 0, "node1")
+
+
+def test_parse_perfmon_label_decode_high_worker_idx(tmp_path: Path):
+    """Worker index can be multi-digit (e.g. w15, the 16th decode worker)."""
+    role, idx, host = _parse_perfmon_label(tmp_path / "perf_samples_decode_w15_node-42.csv")
+    assert (role, idx, host) == ("decode", 15, "node-42")
+
+
+def test_parse_perfmon_label_host_with_hyphens_and_digits(tmp_path: Path):
+    """CoreWeave-style hostnames like `slurm-compute-gpu-019-42b` must parse intact."""
+    role, idx, host = _parse_perfmon_label(
+        tmp_path / "perf_samples_prefill_w3_slurm-compute-gpu-019-42b.csv"
+    )
+    assert (role, idx, host) == ("prefill", 3, "slurm-compute-gpu-019-42b")
+
+
+def test_parse_perfmon_label_agg_role(tmp_path: Path):
+    """Non-disagg multinode uses role='agg' (not prefill/decode)."""
+    role, idx, host = _parse_perfmon_label(tmp_path / "perf_samples_agg_w0_node1.csv")
+    assert (role, idx, host) == ("agg", 0, "node1")
+
+
+def test_parse_perfmon_label_frontend_role(tmp_path: Path):
+    """Head-only nodes (no backend workers) get role='frontend'."""
+    role, idx, host = _parse_perfmon_label(tmp_path / "perf_samples_frontend_w0_head.csv")
+    assert (role, idx, host) == ("frontend", 0, "head")
+
+
+def test_parse_perfmon_label_unlabeled_returns_none(tmp_path: Path):
+    """Filenames lacking a known role + worker index (e.g. `gpu_metrics.csv`) yield None."""
+    assert _parse_perfmon_label(tmp_path / "gpu_metrics.csv") is None
+    assert _parse_perfmon_label(tmp_path / "perf_samples_node1.csv") is None
+    assert _parse_perfmon_label(tmp_path / "perf_samples_unknownrole_w0_host.csv") is None
+
+
+# --------------------------------------------------------------------------- #
+# Per-worker aggregation — groups node-CSVs by (role, worker_idx)
+# --------------------------------------------------------------------------- #
+
+
+def test_aggregate_power_by_worker_one_csv_per_worker(tmp_path: Path):
+    """4 prefill workers (one per node) + 1 decode worker on a single node.
+
+    Reflects the smallest disagg topology — every CSV is its own worker."""
+    base = 1_700_000_000.0
+    for w in range(4):
+        _write_nvidia_csv(
+            tmp_path / f"perf_samples_prefill_w{w}_pnode{w}.csv",
+            [(base + s, gpu, 600.0) for s in range(3) for gpu in range(4)],
+        )
+    _write_nvidia_csv(
+        tmp_path / "perf_samples_decode_w0_dnode0.csv",
+        [(base + s, gpu, 400.0) for s in range(3) for gpu in range(4)],
+    )
+
+    workers = aggregate_power_by_worker(
+        list(tmp_path.glob("perf_samples_*.csv")), base, base + 10
+    )
+    assert workers is not None
+    # Ordered: prefill (w0..w3), then decode (w0).
+    assert [w["role"] for w in workers] == ["prefill"] * 4 + ["decode"]
+    assert [w["worker_idx"] for w in workers] == [0, 1, 2, 3, 0]
+    # Each worker is 4 GPUs at its respective wattage.
+    for w in workers[:4]:
+        assert w["num_gpus"] == 4
+        assert w["avg_power_w"] == pytest.approx(600.0)
+        assert len(w["hosts"]) == 1
+    assert workers[4]["num_gpus"] == 4
+    assert workers[4]["avg_power_w"] == pytest.approx(400.0)
+
+
+def test_aggregate_power_by_worker_one_worker_spans_multiple_nodes(tmp_path: Path):
+    """Decode_w0 spans 4 nodes × 4 GPUs = 16 GPUs.
+
+    Mirrors the typical wide-EP DSV4 topology (gpus_per_decode=16,
+    decode_workers=1). All 4 node-CSVs share the same (role, worker_idx)
+    and must collapse into ONE worker entry with num_gpus=16."""
+    base = 1_700_000_000.0
+    hosts = ["dnode0", "dnode1", "dnode2", "dnode3"]
+    for h in hosts:
+        _write_nvidia_csv(
+            tmp_path / f"perf_samples_decode_w0_{h}.csv",
+            [(base + s, gpu, 400.0) for s in range(3) for gpu in range(4)],
+        )
+
+    workers = aggregate_power_by_worker(
+        list(tmp_path.glob("perf_samples_*.csv")), base, base + 10
+    )
+    assert workers is not None
+    assert len(workers) == 1
+    w = workers[0]
+    assert w["role"] == "decode"
+    assert w["worker_idx"] == 0
+    assert w["num_gpus"] == 16  # 4 nodes × 4 GPUs
+    assert w["avg_power_w"] == pytest.approx(400.0)
+    assert w["hosts"] == sorted(hosts)
+
+
+def test_aggregate_power_by_worker_returns_none_when_no_labels(tmp_path: Path):
+    """Single-node `gpu_metrics.csv` has no perfmon label — returns None.
+
+    Caller (run()) then omits power_by_worker from the agg JSON entirely."""
+    base = 1_700_000_000.0
+    csv = tmp_path / "gpu_metrics.csv"
+    _write_nvidia_csv(csv, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(4)])
+    assert aggregate_power_by_worker([csv], base, base + 10) is None
+
+
+def test_aggregate_power_by_worker_returns_none_for_empty_input(tmp_path: Path):
+    assert aggregate_power_by_worker([], 0.0, 100.0) is None
+
+
+def test_aggregate_power_by_worker_skips_unlabeled_silently(tmp_path: Path):
+    """Mixed input: one labeled CSV + one unlabeled. Only labeled is grouped."""
+    base = 1_700_000_000.0
+    labeled = tmp_path / "perf_samples_prefill_w0_n1.csv"
+    unlabeled = tmp_path / "gpu_metrics.csv"
+    _write_nvidia_csv(labeled, [(base + s, gpu, 600.0) for s in range(3) for gpu in range(4)])
+    _write_nvidia_csv(unlabeled, [(base + s, gpu, 999.0) for s in range(3) for gpu in range(4)])
+
+    workers = aggregate_power_by_worker([labeled, unlabeled], base, base + 10)
+    assert workers is not None
+    assert len(workers) == 1
+    assert workers[0]["role"] == "prefill"
+    # Unlabeled CSV's wattage must not bleed into the prefill worker.
+    assert workers[0]["avg_power_w"] == pytest.approx(600.0)
+
+
+# --------------------------------------------------------------------------- #
+# End-to-end disagg: run(..., disagg=True) emits per-worker + per-stage J/token
+# --------------------------------------------------------------------------- #
+
+
+def test_run_disagg_emits_power_by_worker_and_per_stage_joules(tmp_path: Path):
+    """Full disagg pipeline: per-worker breakdown + per-stage J/input + J/output.
+
+    Topology: 2 prefill workers × 4 GPUs @ 600W, 1 decode worker × 8 GPUs @ 400W.
+    Over a 10s bench window with 8000 input + 1000 output tokens:
+      - prefill energy = 600 × 8 × 10 = 48_000 J  → J/input = 48_000 / 8000 = 6.0
+      - decode energy  = 400 × 8 × 10 = 32_000 J  → J/output = 32_000 / 1000 = 32.0
+      - total energy   = 80_000 J                  → J/total = 80_000 / 9000 ≈ 8.889
+    Cluster-wide avg_power_w stays the weighted mean across all 16 GPUs."""
+    base = 1_700_000_000.0
+    _write_nvidia_csv(
+        tmp_path / "perf_samples_prefill_w0_pn0.csv",
+        [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(4)],
+    )
+    _write_nvidia_csv(
+        tmp_path / "perf_samples_prefill_w1_pn1.csv",
+        [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(4)],
+    )
+    _write_nvidia_csv(
+        tmp_path / "perf_samples_decode_w0_dn0.csv",
+        [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(4)],
+    )
+    _write_nvidia_csv(
+        tmp_path / "perf_samples_decode_w0_dn1.csv",
+        [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(4)],
+    )
+
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(
+        bench,
+        start=base,
+        end=base + 10,
+        duration=10.0,
+        total_output=1000,
+        total_input=8000,
+    )
+    agg.write_text(json.dumps({"hw": "gb300", "disagg": True}), encoding="utf-8")
+
+    assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
+    patched = json.loads(agg.read_text())
+
+    # Cluster-wide avg = (8*600 + 8*400) / 16 = 500W.
+    assert patched["avg_power_w"] == pytest.approx(500.0)
+
+    # Per-stage J/token: prefill energy / input, decode energy / output.
+    assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000)   # 6.0
+    assert patched["joules_per_output_token"] == pytest.approx(32_000 / 1000)  # 32.0
+    assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000)   # ≈ 8.889
+
+    workers = patched["power_by_worker"]
+    assert [w["role"] for w in workers] == ["prefill", "prefill", "decode"]
+    assert [w["worker_idx"] for w in workers] == [0, 1, 0]
+    # Decode_w0 collapsed across 2 hosts → 8 GPUs total.
+    decode = workers[2]
+    assert decode["num_gpus"] == 8
+    assert decode["avg_power_w"] == pytest.approx(400.0)
+    assert decode["hosts"] == ["dn0", "dn1"]
+    # Each prefill worker is one node, 4 GPUs.
+    for w in workers[:2]:
+        assert w["num_gpus"] == 4
+        assert w["avg_power_w"] == pytest.approx(600.0)
+        assert len(w["hosts"]) == 1
+
+
+def test_run_disagg_excludes_frontend_from_per_stage_energy(tmp_path: Path):
+    """A frontend-only node's power must not contribute to J/input or J/output.
+
+    Frontend nodes don't run any backend worker — their (typically near-idle)
+    GPU draw would skew per-stage attribution if counted. They still appear
+    in power_by_worker for observability."""
+    base = 1_700_000_000.0
+    # Prefill worker — 4 GPUs @ 600W → 24_000 J in 10s
+    _write_nvidia_csv(
+        tmp_path / "perf_samples_prefill_w0_pn0.csv",
+        [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(4)],
+    )
+    # Decode worker — 4 GPUs @ 400W → 16_000 J
+    _write_nvidia_csv(
+        tmp_path / "perf_samples_decode_w0_dn0.csv",
+        [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(4)],
+    )
+    # Frontend node — would erroneously add 4_000 J if counted.
+    _write_nvidia_csv(
+        tmp_path / "perf_samples_frontend_w0_head.csv",
+        [(base + 1 + s, gpu, 100.0) for s in range(8) for gpu in range(4)],
+    )
+
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(
+        bench, start=base, end=base + 10, duration=10.0, total_output=1000, total_input=8000
+    )
+    agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
+
+    assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
+    patched = json.loads(agg.read_text())
+
+    # J/input = 24_000 / 8000 = 3.0 (frontend excluded).
+    assert patched["joules_per_input_token"] == pytest.approx(3.0)
+    # J/output = 16_000 / 1000 = 16.0 (frontend excluded).
+    assert patched["joules_per_output_token"] == pytest.approx(16.0)
+    # Frontend still appears in the worker list for observability.
+    roles = [w["role"] for w in patched["power_by_worker"]]
+    assert "frontend" in roles
+
+
+def test_run_non_disagg_omits_joules_per_input_token(tmp_path: Path):
+    """Non-disagg runs (single-node or multinode-agg) keep the legacy schema.
+
+    No joules_per_input_token field — it'd be meaningless without a prefill
+    stage to attribute energy to. Existing fields must keep their pre-disagg
+    semantics (total_system_energy / token_count)."""
+    base = 1_700_000_000.0
+    csv = tmp_path / "gpu_metrics.csv"
+    _write_nvidia_csv(
+        csv, [(base + 1 + s, gpu, 500.0) for s in range(8) for gpu in range(8)]
+    )
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(
+        bench, start=base, end=base + 10, duration=10.0, total_output=20_000
+    )
+    agg.write_text(json.dumps({"hw": "h200"}), encoding="utf-8")
+
+    assert run(csv, bench, agg, disagg=False) == 0
+    patched = json.loads(agg.read_text())
+    assert "joules_per_input_token" not in patched
+    assert "power_by_worker" not in patched
+    # Legacy semantics: total energy / token count.
+    assert patched["joules_per_output_token"] == pytest.approx(2.0)
+    assert patched["joules_per_total_token"] == pytest.approx(2.0)
+
+
+def test_run_disagg_falls_back_to_cluster_when_only_one_stage_present(tmp_path: Path):
+    """If only prefill or only decode CSVs survived, per-stage attribution
+    isn't possible — must fall back to cluster-wide ratios so the run still
+    publishes something useful instead of dropping the field entirely."""
+    base = 1_700_000_000.0
+    # Only prefill CSVs — decode is missing entirely.
+    _write_nvidia_csv(
+        tmp_path / "perf_samples_prefill_w0_pn0.csv",
+        [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(4)],
+    )
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(
+        bench, start=base, end=base + 10, duration=10.0, total_output=1000, total_input=8000
+    )
+    agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
+
+    assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
+    patched = json.loads(agg.read_text())
+    # power_by_worker still emitted (one prefill worker).
+    assert len(patched["power_by_worker"]) == 1
+    # J/input absent (no per-stage attribution possible).
+    assert "joules_per_input_token" not in patched
+    # J/output falls back to cluster-wide (total_energy / output_tokens).
+    assert patched["joules_per_output_token"] == pytest.approx(24_000 / 1000)
+
+
+def test_run_disagg_handles_zero_input_tokens(tmp_path: Path):
+    """total_input_tokens=0 (rare degenerate case) → joules_per_input_token
+    omitted, no ZeroDivisionError."""
+    base = 1_700_000_000.0
+    _write_nvidia_csv(
+        tmp_path / "perf_samples_prefill_w0_pn0.csv",
+        [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(4)],
+    )
+    _write_nvidia_csv(
+        tmp_path / "perf_samples_decode_w0_dn0.csv",
+        [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(4)],
+    )
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(
+        bench, start=base, end=base + 10, duration=10.0, total_output=1000, total_input=0
+    )
+    agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
+
+    assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
+    patched = json.loads(agg.read_text())
+    assert "joules_per_input_token" not in patched
+    assert patched["joules_per_output_token"] == pytest.approx(16_000 / 1000)
+
+
+def test_patch_agg_result_with_per_worker_and_per_stage(tmp_path: Path):
+    """patch_agg_result emits the new optional fields when supplied."""
+    agg = tmp_path / "agg.json"
+    agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
+    workers = [
+        {"role": "prefill", "worker_idx": 0, "hosts": ["pn0"], "num_gpus": 4, "avg_power_w": 600.0},
+        {"role": "decode", "worker_idx": 0, "hosts": ["dn0"], "num_gpus": 4, "avg_power_w": 400.0},
+    ]
+    patch_agg_result(
+        agg,
+        avg_power_w=500.0,
+        joules_per_output_token=16.0,
+        joules_per_total_token=4.44,
+        joules_per_input_token=3.0,
+        power_by_worker=workers,
+    )
+    data = json.loads(agg.read_text())
+    assert data["avg_power_w"] == 500.0
+    assert data["joules_per_input_token"] == 3.0
+    assert data["power_by_worker"] == workers
+
+
+def test_patch_agg_result_omits_optional_fields_when_none(tmp_path: Path):
+    """Backward compat: caller passing None for new fields → fields absent."""
+    agg = tmp_path / "agg.json"
+    agg.write_text(json.dumps({"hw": "h200"}), encoding="utf-8")
+    patch_agg_result(
+        agg,
+        avg_power_w=400.0,
+        joules_per_output_token=1.5,
+        joules_per_total_token=0.5,
+    )
+    data = json.loads(agg.read_text())
+    assert "joules_per_input_token" not in data
+    assert "power_by_worker" not in data
diff --git a/utils/test_process_result.py b/utils/test_process_result.py
index 61d3b45fc..6b3fc9a94 100644
--- a/utils/test_process_result.py
+++ b/utils/test_process_result.py
@@ -754,3 +754,94 @@ def test_multinode_csv_glob_empty_match_falls_through_silently(self, tmp_path, s
         agg_path = tmp_path / "agg_benchmark_result.json"
         patched = json.loads(agg_path.read_text())
         assert "avg_power_w" not in patched
+
+    def test_disagg_multinode_emits_per_worker_and_per_stage_joules(self, tmp_path, multinode_env_vars):
+        """End-to-end disagg wiring: DISAGG=true + per-node labeled CSVs →
+        process_result.py passes disagg through to aggregate_power, which emits
+        power_by_worker + joules_per_input_token using per-stage attribution.
+
+        Without the disagg=disagg propagation in process_result.py, the run
+        would silently fall back to cluster-wide joules math and the user-facing
+        per-stage J/input metric would be missing."""
+        start, end = 1_700_000_100.0, 1_700_000_160.0  # 60s bench window
+        # 1 prefill worker × 4 GPUs @ 600W on its own node
+        self._write_nvidia_csv(
+            tmp_path / "perf_samples_prefill_w0_pn0.csv",
+            start, end, watts_per_gpu=600.0, num_gpus=4,
+        )
+        # 1 decode worker × 4 GPUs @ 400W on its own node
+        self._write_nvidia_csv(
+            tmp_path / "perf_samples_decode_w0_dn0.csv",
+            start, end, watts_per_gpu=400.0, num_gpus=4,
+        )
+
+        benchmark_result = {
+            "model_id": "test-model",
+            "max_concurrency": 64,
+            "total_token_throughput": 1000.0,
+            "output_throughput": 500.0,
+            "benchmark_start_time_unix": start,
+            "benchmark_end_time_unix": end,
+            "duration": 60.0,
+            "total_output_tokens": 30_000,
+            "total_input_tokens": 240_000,
+        }
+        env = {
+            **multinode_env_vars,
+            "GPU_METRICS_CSV_GLOB": str(tmp_path / "perf_samples_*.csv"),
+        }
+
+        result = run_script(tmp_path, env, benchmark_result)
+        assert result.returncode == 0, f"Script failed: {result.stderr}"
+
+        patched = json.loads((tmp_path / "agg_benchmark_result.json").read_text())
+
+        # Per-stage attribution: prefill_energy / input, decode_energy / output.
+        # Prefill: 600 × 4 × 60 = 144_000 J  → / 240_000 = 0.6 J/input_tok.
+        # Decode:  400 × 4 × 60 =  96_000 J  → /  30_000 = 3.2 J/output_tok.
+        assert patched["joules_per_input_token"] == pytest.approx(0.6, abs=0.01)
+        assert patched["joules_per_output_token"] == pytest.approx(3.2, abs=0.01)
+
+        # Per-worker breakdown labeled with role.
+        workers = patched["power_by_worker"]
+        assert {w["role"] for w in workers} == {"prefill", "decode"}
+        for w in workers:
+            assert w["num_gpus"] == 4
+            assert w["worker_idx"] == 0
+
+    def test_non_disagg_multinode_keeps_cluster_wide_joules_math(self, tmp_path, multinode_env_vars):
+        """Multinode but DISAGG=false → keep cluster-wide ratios, no J/input.
+
+        Sanity check that the disagg flag is the gate, not just multinode-ness."""
+        start, end = 1_700_000_100.0, 1_700_000_160.0
+        self._write_nvidia_csv(
+            tmp_path / "perf_samples_agg_w0_n0.csv",
+            start, end, watts_per_gpu=500.0, num_gpus=4,
+        )
+
+        benchmark_result = {
+            "model_id": "test-model",
+            "max_concurrency": 64,
+            "total_token_throughput": 1000.0,
+            "output_throughput": 500.0,
+            "benchmark_start_time_unix": start,
+            "benchmark_end_time_unix": end,
+            "duration": 60.0,
+            "total_output_tokens": 30_000,
+            "total_input_tokens": 240_000,
+        }
+        # Multinode env, but DISAGG=false → non-disagg multinode (rare but valid).
+        env = {
+            **multinode_env_vars,
+            "DISAGG": "false",
+            "GPU_METRICS_CSV_GLOB": str(tmp_path / "perf_samples_*.csv"),
+        }
+
+        result = run_script(tmp_path, env, benchmark_result)
+        assert result.returncode == 0, f"Script failed: {result.stderr}"
+
+        patched = json.loads((tmp_path / "agg_benchmark_result.json").read_text())
+        assert "joules_per_input_token" not in patched
+        # power_by_worker still emitted (filename labels exist) — useful for
+        # observability even on non-disagg runs.
+        assert patched["power_by_worker"][0]["role"] == "agg"

From 1af17ab305bc7b5e99f87c1b81ce4e94557f5d01 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Thu, 28 May 2026 11:08:07 -0700
Subject: [PATCH 07/14] chore(perf-changelog): re-trigger sweep for per-worker
 power aggregation

The workflow's `paths:` filter only fires on changes to perf-changelog.yaml,
so this bumps the dsv4-fp4-gb300-dynamo-sglang entry to make the sweep pick up
the new per-worker power + per-stage J/token aggregation from 24f46ffe.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 perf-changelog.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9fdae2fd6..506862307 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3198,4 +3198,5 @@
   description:
     - "Smoke run validating multinode measured-power aggregation (PR #1574). No config change; entry exists to trigger a sweep that produces the first multinode agg JSON with avg_power_w + joules_per_*_token populated from per-node srt-slurm perfmon CSVs. Validates per-source GPU-id namespacing in aggregate_power.py (without it, 14 nodes × 4 GPUs would report num_gpus=4 instead of 56) and the GPU_METRICS_CSV_GLOB env var bridge in process_result.py. Only the gb300-cw runner has the perfmon launcher changes; any gb300-nv runs in the sweep will succeed normally without power fields, which the dashboard handles gracefully (chart gates on field presence)."
     - "Re-run after launcher recurse-glob fix (6da2f1b6) — prior sweep (#26548110246) completed green at the workflow level but produced 0 measured-power rows because the flat *.yaml glob in the monitoring-injection loop matched zero recipes (recipes live in 8k1k/ subdir). Fix uses `find -type f -name '*.yaml'`. Also re-pointed SemiAnalysisAI/srt-slurm@feat/inferencex-perfmon onto current NVIDIA/srt-slurm main so the launcher's `default_bash_preamble:` srtslurm.yaml field is accepted by srtctl schema."
+    - "Re-run after per-worker aggregation (24f46ffe) — validates new agg JSON fields: power_by_worker[] with role labels (prefill/decode/agg/frontend) parsed from srt-slurm perfmon CSV filenames, and joules_per_input_token using per-stage energy attribution (prefill_energy / input_tokens). joules_per_output_token and joules_per_total_token now use per-stage math for disagg runs. Backward compatible: single-node and non-disagg multinode keep cluster-wide ratios."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574

From 5b3bcbb6055bd829fbd101ab3e4284f4ebdba3b6 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Thu, 28 May 2026 13:30:14 -0700
Subject: [PATCH 08/14] feat(power): realign agg JSON fields with
 InferenceX-app METRIC_KEYS + add temp/util/mem
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Realigns the per-worker / per-stage schema introduced in 06558b9c to
match the canonical METRIC_KEYS already declared in InferenceX-app
(packages/app/src/lib/metric-keys.ts). Previously this PR overrode
cluster-wide joules_per_output_token for disagg runs, which would
silently shift the meaning of a shared field. New per-stage values are
emitted as separate flat scalars so the cluster keys stay byte-stable.

Schema changes:
  - Revert disagg override on joules_per_output_token and
    joules_per_total_token — both are now ALWAYS cluster-wide
    (total_system_energy / token_count), matching single-node math
    and the frontend's existing axis labels.
  - Add new disagg-only flat scalars (already in frontend METRIC_KEYS):
      prefill_avg_power_w           per-GPU mean power across prefill workers
      decode_avg_power_w            per-GPU mean power across decode workers
      joules_per_output_token_decode  decode_energy / output_tokens
    joules_per_input_token unchanged (prefill_energy / input_tokens).
  - Rename power_by_worker[] -> workers[] to match
    InferenceX-app's BenchmarkRow.workers / WorkerPower interface.
  - Each workers[] entry extended with per-worker telemetry:
      avg_temp_c, peak_temp_c, avg_util_pct, avg_mem_used_mb
  - Add matching cluster-wide telemetry scalars (per-GPU mean, omitted
    when CSV lacks the column).

Implementation:
  - _read_samples + _aggregate_rows refactored to extract all metric
    columns in one pass (one vendor-agnostic regex per metric; degrades
    gracefully when a column is absent).
  - aggregate_power() preserved as a thin compat wrapper returning the
    old (power, num_gpus) tuple so external callers don't break.
  - Per-stage prefill_avg_power_w / decode_avg_power_w use weighted
    mean by num_gpus (matches how cluster avg_power_w is computed).
  - Frontend-labeled CSVs still excluded from per-stage energy
    attribution; included in cluster totals.

Tests: 107/107 pass (88 existing baseline preserved, 14 new telemetry
tests, 5 schema-renamed tests updated in place). New coverage: temp /
util / mem extraction across NVIDIA + AMD + srt-slurm CSV schemas,
peak vs avg distinction, missing-column graceful degradation,
per-worker telemetry, per-stage weighted-mean scalars.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 perf-changelog.yaml           |  15 +-
 utils/aggregate_power.py      | 504 ++++++++++++++++++-------
 utils/test_aggregate_power.py | 677 ++++++++++++++++++++++++++++++++--
 utils/test_process_result.py  |  39 +-
 4 files changed, 1053 insertions(+), 182 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 506862307..62bedab67 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3198,5 +3198,18 @@
   description:
     - "Smoke run validating multinode measured-power aggregation (PR #1574). No config change; entry exists to trigger a sweep that produces the first multinode agg JSON with avg_power_w + joules_per_*_token populated from per-node srt-slurm perfmon CSVs. Validates per-source GPU-id namespacing in aggregate_power.py (without it, 14 nodes × 4 GPUs would report num_gpus=4 instead of 56) and the GPU_METRICS_CSV_GLOB env var bridge in process_result.py. Only the gb300-cw runner has the perfmon launcher changes; any gb300-nv runs in the sweep will succeed normally without power fields, which the dashboard handles gracefully (chart gates on field presence)."
     - "Re-run after launcher recurse-glob fix (6da2f1b6) — prior sweep (#26548110246) completed green at the workflow level but produced 0 measured-power rows because the flat *.yaml glob in the monitoring-injection loop matched zero recipes (recipes live in 8k1k/ subdir). Fix uses `find -type f -name '*.yaml'`. Also re-pointed SemiAnalysisAI/srt-slurm@feat/inferencex-perfmon onto current NVIDIA/srt-slurm main so the launcher's `default_bash_preamble:` srtslurm.yaml field is accepted by srtctl schema."
-    - "Re-run after per-worker aggregation (24f46ffe) — validates new agg JSON fields: power_by_worker[] with role labels (prefill/decode/agg/frontend) parsed from srt-slurm perfmon CSV filenames, and joules_per_input_token using per-stage energy attribution (prefill_energy / input_tokens). joules_per_output_token and joules_per_total_token now use per-stage math for disagg runs. Backward compatible: single-node and non-disagg multinode keep cluster-wide ratios."
+    - "Re-run after per-worker aggregation (24f46ffe) — validates new agg JSON fields: workers[] with role labels (prefill/decode/agg/frontend) parsed from srt-slurm perfmon CSV filenames, plus per-stage scalars (prefill_avg_power_w, decode_avg_power_w, joules_per_input_token = prefill_energy / input_tokens, joules_per_output_token_decode = decode_energy / output_tokens). joules_per_output_token and joules_per_total_token stay cluster-wide on all topologies so the metric is comparable across single-node, multinode-agg, and multinode-disagg. Per-stage scalars emitted only for disagg runs with both prefill and decode workers present. workers[] entries also carry per-worker avg_temp_c/peak_temp_c/avg_util_pct/avg_mem_used_mb when the CSV exposes those columns."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574
+
+- config-keys:
+    - qwen3.5-fp8-mi355x-sglang-disagg
+    - glm5-fp8-mi355x-sglang-disagg
+    - dsr1-fp8-mi355x-sglang-disagg
+    - dsr1-fp4-mi355x-sglang-disagg
+    - kimik2.5-fp4-mi355x-vllm-disagg
+    - minimaxm2.5-fp8-mi355x-vllm-disagg
+  description:
+    - "Smoke run validating AMD multinode measured-power aggregation — the AMD analogue of the NVIDIA gb300/srt-slurm path (PR #1574). No config change; entry exists to trigger a sweep that produces the first AMD multinode agg JSONs with avg_power_w + joules_per_*_token + per-worker workers[] populated from per-node amd-smi perfmon CSVs."
+    - "The AMD amd_utils SLURM job has no orchestrator perfmon, so each SGLang/vLLM disagg node starts its own amd-smi monitor via start_perf_monitor (benchmarks/benchmark_lib.sh), writing perf_samples_<role>_w<idx>_<host>.csv into the NFS-shared /benchmark_logs/perfmon mount (wired in amd_utils/job.slurm). launch_mi355x-amds.sh collects the per-node CSVs into the GH workspace before the EXIT trap wipes the logs dir and sets GPU_METRICS_CSV_GLOB so the existing Process-result step runs the same vendor-agnostic utils/aggregate_power.py used for NVIDIA: per-source GPU-id namespacing (8 GPUs/node on MI355X, so a TP16 worker over 2 nodes counts 16 GPUs not 8), per-stage prefill/decode energy attribution, and per-worker temp/util/mem when amd-smi exposes those columns."
+    - "Covers both engine paths: SGLang disagg (server_sglang.sh role = NODE_RANK bucketed by PREFILL_NODES_PER_WORKER / NODE_OFFSET) and vLLM disagg (server_vllm.sh one worker per node, ranks [0,xP) prefill / [xP,xP+yD) decode). Monitoring is best-effort end-to-end — a missing amd-smi or empty CSV skips power patching without failing the benchmark upload; DISAGG=true threads through to per-stage attribution while agg/non-disagg runs still get cluster-wide power."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574
diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py
index 962c9167d..ee4327b83 100644
--- a/utils/aggregate_power.py
+++ b/utils/aggregate_power.py
@@ -1,44 +1,75 @@
-"""Aggregate measured GPU power from a vendor SMI CSV into the agg result JSON.
+"""Aggregate measured GPU telemetry (power, temp, utilization, memory) from a
+vendor SMI CSV into the agg result JSON.
 
 Reads a GPU-metrics CSV produced by `start_gpu_monitor` (nvidia-smi or amd-smi)
 or by srt-slurm's per-node perfmon (multinode), filters samples to the benchmark
 load window using start/end Unix timestamps written by benchmark_serving.py, and
-patches the aggregated result JSON with cluster-wide and per-worker power data
+patches the aggregated result JSON with cluster-wide and per-worker telemetry
 consumed by InferenceX-app's ETL.
 
 Cluster-wide fields (always written when any power data exists):
   - avg_power_w:               mean per-GPU power draw (W) during the load window
-  - joules_per_output_token:   energy / total_output_tokens
-  - joules_per_total_token:    energy / (input + output) tokens
-
-For disaggregated multinode runs (DISAGG=true), the numerator for the J/token
-metrics shifts to a per-stage attribution: prefill workers' energy is divided
-by input tokens, decode workers' energy by output tokens. Per-stage power is
-where the meaningful efficiency signal lives — total-energy ratios mostly just
-re-scale the same number by different denominators.
-
-  - joules_per_input_token:    prefill_energy / total_input_tokens (disagg only)
-  - joules_per_output_token:   decode_energy / total_output_tokens   (overridden)
-  - joules_per_total_token:    (prefill_energy + decode_energy) / total_tokens (overridden)
-
-Per-worker breakdown (multinode only — single-node has no role concept):
-  - power_by_worker: list of {role, worker_idx, hosts[], num_gpus, avg_power_w}
-                     where role is "prefill", "decode", "agg", or "frontend".
-
-srt-slurm encodes the worker role and index in the perfmon CSV filename:
-`perf_samples_<role>_w<worker_idx>_<host>.csv` — see srt-slurm fork's
-benchmark_stage._start_perf_monitor. Filenames that don't match this pattern
-(e.g. single-node `gpu_metrics.csv`) fall back to a single cluster-wide bucket.
+  - joules_per_output_token:   total_system_energy / total_output_tokens
+                               (cluster-wide; always — same math single-node and
+                               multinode disagg, so the metric stays comparable
+                               across topologies in the dashboard)
+  - joules_per_total_token:    total_system_energy / (input + output) tokens
+                               (cluster-wide; always)
+  - avg_temp_c:                mean per-GPU temperature (Celsius), when the
+                               CSV exposes a temperature column
+  - peak_temp_c:               max instantaneous per-GPU temperature in window
+  - avg_util_pct:              mean per-GPU GPU-utilization percent
+  - avg_mem_used_mb:           mean per-GPU memory used (MiB/MB)
+
+For disaggregated multinode runs (DISAGG=true) where filenames carry the perfmon
+role/index encoding AND both prefill+decode workers are present, additional flat
+per-stage scalars are emitted alongside (NOT instead of) the cluster-wide keys:
+
+  - prefill_avg_power_w:           per-GPU mean power across prefill workers
+  - decode_avg_power_w:            per-GPU mean power across decode workers
+  - joules_per_input_token:        prefill_energy / total_input_tokens
+                                   (per-stage attribution — prefill processes
+                                   input tokens, so its energy / input gives the
+                                   prefill-side per-token cost)
+  - joules_per_output_token_decode: decode_energy / total_output_tokens
+                                   (per-stage attribution; the _decode suffix is
+                                   load-bearing — keeps the cluster-wide
+                                   joules_per_output_token comparable across
+                                   single-node and disagg deployments and exposes
+                                   decode-only energy as a separate key for users
+                                   who specifically want it.)
+
+Per-worker breakdown (multinode only — single-node has no role concept), emitted
+under the `workers` key to match InferenceX-app's BenchmarkRow.workers shape:
+  - workers: list of {role, worker_idx, hosts[], num_gpus, avg_power_w,
+                       avg_temp_c?, peak_temp_c?, avg_util_pct?, avg_mem_used_mb?}
+             where role is "prefill", "decode", "agg", or "frontend".
+
+Both multinode paths encode the worker role and index in the perfmon CSV
+filename: `perf_samples_<role>_w<worker_idx>_<host>.csv` — NVIDIA via the
+srt-slurm fork's benchmark_stage._start_perf_monitor, AMD via start_perf_monitor
+in benchmarks/benchmark_lib.sh (each SGLang/vLLM disagg node starts its own
+amd-smi monitor). Filenames that don't match this pattern (e.g. single-node
+`gpu_metrics.csv`) fall back to a single cluster-wide bucket.
 
 Multinode: accepts multiple CSV paths (one per worker node). GPU indices are
 namespaced by source CSV stem to avoid the same-index collision across nodes —
 e.g. 8 nodes each reporting indices 0..3 would otherwise be miscounted as 4
 total GPUs instead of 32.
 
-Vendor schema detection is regex-based: any timestamp-like column + any column
-whose name contains "power" (excluding "limit"/"cap"/"max") is picked up.
-NVIDIA emits "power.draw [W]"; AMD's amd-smi varies by version; srt-slurm's
-perfmon emits "power_w". All are handled.
+Vendor schema detection is regex-based:
+  - Power: timestamp + column whose name contains "power" (excluding
+    "limit"/"cap"/"max"/"min"). NVIDIA: "power.draw [W]". AMD: "socket_power".
+    srt-slurm: "power_w".
+  - Temperature: column name contains "temp". NVIDIA: "temperature.gpu". AMD:
+    "temperature". srt-slurm: "temp_c". Unit: Celsius.
+  - Utilization: column name starts with "utilization" or contains "util".
+    NVIDIA: "utilization.gpu". srt-slurm: "util_pct". Unit: percent.
+  - Memory: column name contains "mem" but not "total" (avoid "memory.total").
+    NVIDIA: "memory.used [MiB]". srt-slurm: "mem_used_mb". Unit: MiB/MB.
+
+Power is required for aggregation to fire; the other metrics degrade gracefully
+when their columns are absent (those fields are simply omitted from the output).
 
 This script is best-effort. Missing or malformed CSV exits 0 without patching
 so a monitoring hiccup never breaks the benchmark upload.
@@ -60,6 +91,10 @@
 
 _POWER_COL_RE = re.compile(r"power", re.IGNORECASE)
 _POWER_EXCLUDE_RE = re.compile(r"limit|cap|max|min", re.IGNORECASE)
+_TEMP_COL_RE = re.compile(r"temp", re.IGNORECASE)
+_UTIL_COL_RE = re.compile(r"^utilization|util", re.IGNORECASE)
+_MEM_COL_RE = re.compile(r"mem", re.IGNORECASE)
+_MEM_EXCLUDE_RE = re.compile(r"total", re.IGNORECASE)
 _TIMESTAMP_COL_RE = re.compile(r"time", re.IGNORECASE)
 _GPU_INDEX_COL_RE = re.compile(r"^(index|gpu|gpu_id|gpu_index|card|device)$", re.IGNORECASE)
 _NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?")
@@ -72,6 +107,11 @@
     r"^perf_samples_(?P<role>prefill|decode|agg|frontend)_w(?P<idx>\d+)_(?P<host>.+)$"
 )
 
+# Metric names recognized in the multi-metric row dicts. Power is special-cased
+# as required; others are best-effort.
+_METRICS_AVG = ("power", "temp", "util", "mem")  # mean across samples
+_METRICS_MAX = ("temp",)  # additionally compute peak (max raw)
+
 
 def _parse_timestamp(value: str) -> float | None:
     """Best-effort timestamp parse to Unix epoch seconds (local wall clock).
@@ -107,11 +147,12 @@ def _parse_timestamp(value: str) -> float | None:
     return dt.astimezone(timezone.utc).timestamp()
 
 
-def _parse_power(value: str) -> float | None:
-    """Extract the first numeric value from a power cell.
+def _parse_numeric_cell(value: str) -> float | None:
+    """Extract the first numeric value from a cell.
 
-    nvidia-smi formats power as "412.34 W"; some configurations report
-    "[N/A]" when power capping is disabled. AMD reports a bare number.
+    Vendors decorate values with units ("412.34 W", "65 C", "85 %", "1024 MiB")
+    or report "[N/A]" when a sensor is unavailable. We strip and pull the first
+    signed-decimal token; returns None for empty / NA / non-numeric cells.
     """
     value = value.strip()
     if not value or value.lower() in {"[n/a]", "n/a", "na"}:
@@ -125,12 +166,19 @@ def _parse_power(value: str) -> float | None:
         return None
 
 
+# Back-compat shim — some external callers may have imported _parse_power.
+_parse_power = _parse_numeric_cell
+
+
 def _detect_columns(header: list[str]) -> tuple[str | None, str | None, str | None]:
     """Return (timestamp_col, power_col, gpu_index_col) from a CSV header.
 
     Power column: contains "power" and not "limit"/"cap"/"max"/"min".
     Timestamp column: contains "time".
     GPU index column: optional — used to count distinct GPUs per sample.
+
+    Kept for back-compat with tests that imported _detect_columns directly;
+    new code uses _detect_all_columns to also pick up temp/util/mem.
     """
     timestamp_col = next((c for c in header if _TIMESTAMP_COL_RE.search(c)), None)
     power_col = next(
@@ -141,6 +189,39 @@ def _detect_columns(header: list[str]) -> tuple[str | None, str | None, str | No
     return timestamp_col, power_col, gpu_col
 
 
+def _detect_all_columns(header: list[str]) -> dict[str, str | None]:
+    """Return a mapping of role -> column name for every metric we know about.
+
+    Roles: timestamp, gpu, power, temp, util, mem. Missing roles map to None.
+
+    The detection is greedy + first-match: with a vendor like NVIDIA whose
+    header lists `utilization.gpu` followed by `utilization.memory`, the
+    util slot picks the first; that's fine — we only need ONE util column and
+    `utilization.gpu` is the canonical one. Memory excludes "total" so
+    `memory.used` wins over `memory.total`.
+    """
+    timestamp_col = next((c for c in header if _TIMESTAMP_COL_RE.search(c)), None)
+    power_col = next(
+        (c for c in header if _POWER_COL_RE.search(c) and not _POWER_EXCLUDE_RE.search(c)),
+        None,
+    )
+    temp_col = next((c for c in header if _TEMP_COL_RE.search(c)), None)
+    util_col = next((c for c in header if _UTIL_COL_RE.search(c)), None)
+    mem_col = next(
+        (c for c in header if _MEM_COL_RE.search(c) and not _MEM_EXCLUDE_RE.search(c)),
+        None,
+    )
+    gpu_col = next((c for c in header if _GPU_INDEX_COL_RE.match(c.strip())), None)
+    return {
+        "timestamp": timestamp_col,
+        "gpu": gpu_col,
+        "power": power_col,
+        "temp": temp_col,
+        "util": util_col,
+        "mem": mem_col,
+    }
+
+
 def _parse_perfmon_label(path: Path) -> tuple[str, int, str] | None:
     """Extract (role, worker_idx, host) from a srt-slurm perfmon CSV filename.
 
@@ -156,12 +237,16 @@ def _parse_perfmon_label(path: Path) -> tuple[str, int, str] | None:
 
 def _read_samples(
     path: Path, start_unix: float, end_unix: float
-) -> tuple[list[tuple[float, float, str | None]], bool] | None:
-    """Read one CSV → list of (timestamp_bucket, power_w, gpu_id) in window.
+) -> tuple[list[tuple[float, str | None, dict[str, float]]], bool] | None:
+    """Read one CSV → list of (timestamp_bucket, gpu_id, {metric: value}) in window.
 
     Returns (rows, saw_gpu_col) on success, None if the file is unreadable /
-    missing the required columns. Empty rows list is valid (file readable but
-    no samples landed in the window).
+    missing the required power column. Empty rows list is valid (file readable
+    but no samples landed in the window).
+
+    Each row's metric dict carries whichever of power/temp/util/mem the CSV
+    exposed (power is always present — rows lacking it are skipped). Missing
+    metric columns simply don't appear in the dict; callers gracefully degrade.
     """
     if not path.is_file() or path.stat().st_size == 0:
         return None
@@ -170,72 +255,139 @@ def _read_samples(
             reader = csv.DictReader(f, skipinitialspace=True)
             header = [c.strip() for c in (reader.fieldnames or [])]
             reader.fieldnames = header
-            timestamp_col, power_col, gpu_col = _detect_columns(header)
+            cols = _detect_all_columns(header)
+            timestamp_col = cols["timestamp"]
+            power_col = cols["power"]
             if not timestamp_col or not power_col:
                 return None
-            rows: list[tuple[float, float, str | None]] = []
+            gpu_col = cols["gpu"]
+            # Map metric name -> CSV column. Power is required (we just
+            # checked); temp/util/mem are optional.
+            metric_cols: dict[str, str] = {"power": power_col}
+            for metric in ("temp", "util", "mem"):
+                col = cols[metric]
+                if col is not None:
+                    metric_cols[metric] = col
+            rows: list[tuple[float, str | None, dict[str, float]]] = []
             for row in reader:
                 ts = _parse_timestamp((row.get(timestamp_col) or "").strip())
-                pw = _parse_power((row.get(power_col) or "").strip())
-                if ts is None or pw is None:
+                if ts is None:
                     continue
                 if ts < start_unix or ts > end_unix:
                     continue
+                # Power must parse; rows with [N/A] or empty power are useless
+                # for aggregation (same behavior as before the multi-metric
+                # extension).
+                pw = _parse_numeric_cell((row.get(power_col) or "").strip())
+                if pw is None:
+                    continue
+                values: dict[str, float] = {"power": pw}
+                for metric, col in metric_cols.items():
+                    if metric == "power":
+                        continue
+                    v = _parse_numeric_cell((row.get(col) or "").strip())
+                    if v is not None:
+                        values[metric] = v
                 gpu_id = (row.get(gpu_col) or "").strip() if gpu_col else None
-                rows.append((round(ts, 3), pw, gpu_id or None))
+                rows.append((round(ts, 3), gpu_id or None, values))
             return rows, gpu_col is not None
     except (OSError, csv.Error):
         return None
 
 
 def _aggregate_rows(
-    sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]],
+    sources: list[tuple[Path, list[tuple[float, str | None, dict[str, float]]], bool]],
     *,
     namespace: bool,
-) -> tuple[float, int] | None:
-    """Merge rows across CSVs into (per_gpu_avg_power_w, num_gpus).
+) -> dict | None:
+    """Merge rows across CSVs into a metric-dict + num_gpus.
 
     `sources` is a list of (path, rows, saw_gpu_col) for the CSVs to roll up
     together. Rows are bucketed by ms-rounded timestamp so nodes with sub-ms
     clock drift land in the same bucket. GPU indices are namespaced by the
     source path's stem when `namespace=True` (multi-source case) to keep
     same-local-index across nodes from collapsing.
+
+    Returns a dict with at minimum {"power": float, "num_gpus": int}. Each
+    additional metric (temp/util/mem) is included only when at least one
+    source emitted it. peak_temp is the global max across the window
+    (instantaneous, not per-bucket-mean).
     """
-    per_sample_total: dict[float, float] = {}
-    per_sample_row_count: dict[float, int] = {}
+    # Per-bucket totals keyed by metric name. Bucket = ms-rounded timestamp.
+    per_sample_total: dict[str, dict[float, float]] = {m: {} for m in _METRICS_AVG}
+    per_sample_count: dict[str, dict[float, int]] = {m: {} for m in _METRICS_AVG}
+    per_sample_row_count: dict[float, int] = {}  # for no-gpu-col GPU inference
     per_sample_gpus: dict[float, set[str]] = {}
     gpu_keys: set[str] = set()
     saw_gpu_col_any = False
+    saw_metric: dict[str, bool] = {m: False for m in _METRICS_AVG}
+    peak_per_metric: dict[str, float] = {}
 
     for path, rows, saw_gpu_col in sources:
         if saw_gpu_col:
             saw_gpu_col_any = True
-        for bucket, pw, gpu_id in rows:
-            per_sample_total[bucket] = per_sample_total.get(bucket, 0.0) + pw
+        for bucket, gpu_id, values in rows:
             per_sample_row_count[bucket] = per_sample_row_count.get(bucket, 0) + 1
+            for metric, v in values.items():
+                if metric not in per_sample_total:
+                    continue
+                per_sample_total[metric][bucket] = (
+                    per_sample_total[metric].get(bucket, 0.0) + v
+                )
+                per_sample_count[metric][bucket] = (
+                    per_sample_count[metric].get(bucket, 0) + 1
+                )
+                saw_metric[metric] = True
+                if metric in _METRICS_MAX:
+                    cur = peak_per_metric.get(metric)
+                    peak_per_metric[metric] = v if cur is None else max(cur, v)
             if gpu_id is not None:
                 ns_id = f"{path.stem}:{gpu_id}" if namespace else gpu_id
                 per_sample_gpus.setdefault(bucket, set()).add(ns_id)
                 gpu_keys.add(ns_id)
 
-    if not per_sample_total:
+    if not per_sample_total["power"]:
         return None
 
+    # GPU count:
     # - If any path exposed a GPU column, trust distinct (namespaced) GPU IDs.
     # - Otherwise, infer from row count (one row per GPU per sample, summed
     #   across all paths' rows that fell into the same timestamp bucket).
     if saw_gpu_col_any and gpu_keys:
         num_gpus = len(gpu_keys)
-        per_sample_mean_per_gpu = [
-            total / max(len(per_sample_gpus.get(ts, ())), 1)
-            for ts, total in per_sample_total.items()
-        ]
     else:
         num_gpus = max(per_sample_row_count.values())
-        per_sample_mean_per_gpu = [
-            total / per_sample_row_count[ts] for ts, total in per_sample_total.items()
-        ]
-    return mean(per_sample_mean_per_gpu), num_gpus
+
+    def _avg_per_gpu(metric: str) -> float | None:
+        if not saw_metric.get(metric):
+            return None
+        totals = per_sample_total[metric]
+        if not totals:
+            return None
+        if saw_gpu_col_any and gpu_keys:
+            # bucket mean = sum / distinct GPU count in that bucket
+            per_sample_mean = [
+                total / max(len(per_sample_gpus.get(ts, ())), 1)
+                for ts, total in totals.items()
+            ]
+        else:
+            # bucket mean = sum / row count in that bucket (= GPU count when
+            # one row per GPU per sample, the universal vendor convention)
+            per_sample_mean = [
+                total / per_sample_count[metric][ts] for ts, total in totals.items()
+            ]
+        return mean(per_sample_mean) if per_sample_mean else None
+
+    result: dict = {"num_gpus": num_gpus, "power": _avg_per_gpu("power")}
+    for metric in ("temp", "util", "mem"):
+        avg = _avg_per_gpu(metric)
+        if avg is not None:
+            result[metric] = avg
+    # Peak (max raw value, not per-bucket-mean): meaningful for temperature
+    # where the worst-case GPU's hottest sample is the thermal-headroom signal.
+    if "temp" in peak_per_metric:
+        result["peak_temp"] = peak_per_metric["temp"]
+    return result
 
 
 def aggregate_power(
@@ -245,6 +397,23 @@ def aggregate_power(
 ) -> tuple[float, int] | None:
     """Return (per_gpu_avg_power_w, num_gpus) for samples in [start, end].
 
+    Backward-compatible wrapper around aggregate_metrics that returns just the
+    legacy (avg_power_w, num_gpus) tuple for callers (and tests) that don't
+    need temperature/util/memory.
+    """
+    res = aggregate_metrics(csv_path, start_unix, end_unix)
+    if res is None:
+        return None
+    return res["power"], res["num_gpus"]
+
+
+def aggregate_metrics(
+    csv_path: Path | Iterable[Path],
+    start_unix: float,
+    end_unix: float,
+) -> dict | None:
+    """Return a dict of cluster-wide per-GPU metrics for samples in [start, end].
+
     Accepts either a single Path (single-node case) or an iterable of Paths
     (multinode case: one CSV per worker node, all written by srt-slurm's
     perfmon). For multi-path inputs, GPU indices are namespaced by source
@@ -254,12 +423,15 @@ def aggregate_power(
 
     Returns None if no CSVs are usable, none have a detectable power column,
     or no rows fall in the window across all paths.
+
+    Result keys: num_gpus, power (always when not None); temp, util, mem,
+    peak_temp (only when the corresponding column existed in at least one CSV).
     """
     paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path)
     if not paths or end_unix <= start_unix:
         return None
 
-    sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]] = []
+    sources: list[tuple[Path, list[tuple[float, str | None, dict[str, float]]], bool]] = []
     for path in paths:
         read = _read_samples(path, start_unix, end_unix)
         if read is None:
@@ -277,9 +449,13 @@ def aggregate_power_by_worker(
     start_unix: float,
     end_unix: float,
 ) -> list[dict] | None:
-    """Group CSVs by (role, worker_idx) and return per-worker power rollups.
+    """Group CSVs by (role, worker_idx) and return per-worker telemetry rollups.
+
+    Each entry: {role, worker_idx, hosts: sorted list, num_gpus, avg_power_w,
+                  avg_temp_c?, peak_temp_c?, avg_util_pct?, avg_mem_used_mb?}.
+    The optional fields appear only when the CSVs for that worker carried
+    temperature / utilization / memory columns.
 
-    Each entry: {role, worker_idx, hosts: sorted list, num_gpus, avg_power_w}.
     Returns None if no CSVs have parseable filenames OR no labeled CSV yields
     usable samples. Unlabeled CSVs in the input are silently skipped — they
     can't be attributed to a worker.
@@ -309,7 +485,7 @@ def aggregate_power_by_worker(
 
     out: list[dict] = []
     for (role, worker_idx), worker_paths in by_worker.items():
-        sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]] = []
+        sources: list[tuple[Path, list[tuple[float, str | None, dict[str, float]]], bool]] = []
         for path in worker_paths:
             read = _read_samples(path, start_unix, end_unix)
             if read is None:
@@ -323,16 +499,22 @@ def aggregate_power_by_worker(
         result = _aggregate_rows(sources, namespace=len(sources) > 1)
         if result is None:
             continue
-        avg_power_w, num_gpus = result
-        out.append(
-            {
-                "role": role,
-                "worker_idx": worker_idx,
-                "hosts": sorted(hosts_by_worker[(role, worker_idx)]),
-                "num_gpus": num_gpus,
-                "avg_power_w": round(avg_power_w, 3),
-            }
-        )
+        entry: dict = {
+            "role": role,
+            "worker_idx": worker_idx,
+            "hosts": sorted(hosts_by_worker[(role, worker_idx)]),
+            "num_gpus": result["num_gpus"],
+            "avg_power_w": round(result["power"], 3),
+        }
+        if "temp" in result:
+            entry["avg_temp_c"] = round(result["temp"], 3)
+        if "peak_temp" in result:
+            entry["peak_temp_c"] = round(result["peak_temp"], 3)
+        if "util" in result:
+            entry["avg_util_pct"] = round(result["util"], 3)
+        if "mem" in result:
+            entry["avg_mem_used_mb"] = round(result["mem"], 3)
+        out.append(entry)
     if not out:
         return None
     # Stable order: role (prefill < decode < agg < frontend), then worker_idx.
@@ -418,13 +600,21 @@ def patch_agg_result(
     joules_per_output_token: float,
     joules_per_total_token: float,
     joules_per_input_token: float | None = None,
-    power_by_worker: list[dict] | None = None,
+    joules_per_output_token_decode: float | None = None,
+    prefill_avg_power_w: float | None = None,
+    decode_avg_power_w: float | None = None,
+    avg_temp_c: float | None = None,
+    peak_temp_c: float | None = None,
+    avg_util_pct: float | None = None,
+    avg_mem_used_mb: float | None = None,
+    workers: list[dict] | None = None,
 ) -> None:
-    """Read the agg JSON, add the power keys, and write it back atomically.
+    """Read the agg JSON, add the telemetry keys, and write it back atomically.
 
-    `joules_per_input_token` and `power_by_worker` are optional — omitted from
-    the JSON when None (kept that way so single-node and non-disagg multinode
-    agg JSONs don't gain meaningless null fields).
+    All optional fields (anything except avg_power_w / joules_per_output_token /
+    joules_per_total_token) are omitted from the JSON when None — this keeps
+    pre-disagg / single-node agg JSONs from gaining meaningless null fields, and
+    keeps runs whose CSVs lack a metric column (e.g. no temperature) null-free.
     """
     data = json.loads(agg_path.read_text(encoding="utf-8"))
     data["avg_power_w"] = round(avg_power_w, 3)
@@ -432,39 +622,82 @@ def patch_agg_result(
     data["joules_per_total_token"] = round(joules_per_total_token, 6)
     if joules_per_input_token is not None:
         data["joules_per_input_token"] = round(joules_per_input_token, 6)
-    if power_by_worker is not None:
-        data["power_by_worker"] = power_by_worker
+    if joules_per_output_token_decode is not None:
+        data["joules_per_output_token_decode"] = round(joules_per_output_token_decode, 6)
+    if prefill_avg_power_w is not None:
+        data["prefill_avg_power_w"] = round(prefill_avg_power_w, 3)
+    if decode_avg_power_w is not None:
+        data["decode_avg_power_w"] = round(decode_avg_power_w, 3)
+    if avg_temp_c is not None:
+        data["avg_temp_c"] = round(avg_temp_c, 3)
+    if peak_temp_c is not None:
+        data["peak_temp_c"] = round(peak_temp_c, 3)
+    if avg_util_pct is not None:
+        data["avg_util_pct"] = round(avg_util_pct, 3)
+    if avg_mem_used_mb is not None:
+        data["avg_mem_used_mb"] = round(avg_mem_used_mb, 3)
+    if workers is not None:
+        data["workers"] = workers
     tmp_path = agg_path.with_suffix(agg_path.suffix + ".tmp")
     tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
     tmp_path.replace(agg_path)
 
 
-def _disagg_stage_energies(
-    power_by_worker: list[dict], duration: float
-) -> tuple[float, float] | None:
-    """Sum per-worker energy for prefill vs decode workers (J).
+def _disagg_stage_rollup(
+    workers: list[dict], duration: float
+) -> dict | None:
+    """Roll up per-worker entries into per-stage energy + per-GPU mean power.
 
-    Returns (prefill_energy_j, decode_energy_j) or None if either stage is
-    absent — without both stages we can't do per-stage attribution and the
-    caller should fall back to total-energy math.
+    Returns a dict with keys:
+      - prefill_energy_j, decode_energy_j: sum of (avg_power_w * num_gpus *
+        duration) across workers in each role
+      - prefill_avg_power_w, decode_avg_power_w: per-GPU mean power weighted
+        by num_gpus (matches the cluster avg_power_w semantics, but scoped to
+        each role)
+
+    Returns None if either stage is absent — without both stages we can't do
+    per-stage attribution and the caller should fall back to total-energy math.
     """
-    prefill_e = 0.0
-    decode_e = 0.0
+    prefill_energy = 0.0
+    decode_energy = 0.0
+    prefill_gpus = 0
+    decode_gpus = 0
+    prefill_pw_x_gpus = 0.0
+    decode_pw_x_gpus = 0.0
     has_prefill = False
     has_decode = False
-    for w in power_by_worker:
+    for w in workers:
         e = w["avg_power_w"] * w["num_gpus"] * duration
         if w["role"] == "prefill":
-            prefill_e += e
+            prefill_energy += e
+            prefill_gpus += w["num_gpus"]
+            prefill_pw_x_gpus += w["avg_power_w"] * w["num_gpus"]
             has_prefill = True
         elif w["role"] == "decode":
-            decode_e += e
+            decode_energy += e
+            decode_gpus += w["num_gpus"]
+            decode_pw_x_gpus += w["avg_power_w"] * w["num_gpus"]
             has_decode = True
         # "frontend" / "agg" / unknown roles deliberately excluded — they
-        # don't belong to either stage's per-token cost.
+        # don't belong to either stage's per-token cost or per-stage power.
     if not (has_prefill and has_decode):
         return None
-    return prefill_e, decode_e
+    return {
+        "prefill_energy_j": prefill_energy,
+        "decode_energy_j": decode_energy,
+        "prefill_avg_power_w": prefill_pw_x_gpus / prefill_gpus if prefill_gpus else None,
+        "decode_avg_power_w": decode_pw_x_gpus / decode_gpus if decode_gpus else None,
+    }
+
+
+# Backward-compat shim — the original API returned just the two energy values.
+def _disagg_stage_energies(
+    workers: list[dict], duration: float
+) -> tuple[float, float] | None:
+    res = _disagg_stage_rollup(workers, duration)
+    if res is None:
+        return None
+    return res["prefill_energy_j"], res["decode_energy_j"]
 
 
 def run(
@@ -484,8 +717,8 @@ def run(
     start, end, duration, total_output, total_input = window
 
     paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path)
-    result = aggregate_power(paths, start, end)
-    if result is None:
+    cluster = aggregate_metrics(paths, start, end)
+    if cluster is None:
         label = str(paths[0]) if len(paths) == 1 else f"{len(paths)} CSVs"
         print(
             f"[aggregate_power] No usable power samples in {label} for "
@@ -493,53 +726,55 @@ def run(
             file=sys.stderr,
         )
         return 0
-    avg_power_w, num_gpus = result
+    avg_power_w = cluster["power"]
+    num_gpus = cluster["num_gpus"]
+    avg_temp_c = cluster.get("temp")
+    peak_temp_c = cluster.get("peak_temp")
+    avg_util_pct = cluster.get("util")
+    avg_mem_used_mb = cluster.get("mem")
 
     # Per-worker rollup is best-effort: only emitted when CSV filenames carry
     # the perfmon role/index encoding. Single-node `gpu_metrics.csv` won't
     # parse, so aggregate_power_by_worker returns None and the field is omitted.
-    power_by_worker = aggregate_power_by_worker(paths, start, end)
-
-    # Cluster-wide energy baseline. Used as the fallback numerator when
-    # per-stage attribution isn't available.
+    workers = aggregate_power_by_worker(paths, start, end)
+
+    # Cluster-wide energy + per-token attribution. We ALWAYS report
+    # joules_per_output_token / joules_per_total_token as cluster-wide ratios
+    # (total_system_energy / token_count), regardless of disagg. This keeps the
+    # metric comparable across single-node, multinode-agg, and multinode-disagg
+    # topologies in the dashboard. Per-stage attribution lives in separate
+    # *_decode / joules_per_input_token keys (only emitted when disagg AND both
+    # stages present).
     total_system_energy_j = avg_power_w * num_gpus * duration
     total_tokens = total_output + total_input
+    joules_per_output_token = total_system_energy_j / total_output
+    joules_per_total_token = (
+        total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token
+    )
 
     joules_per_input_token: float | None = None
+    joules_per_output_token_decode: float | None = None
+    prefill_avg_power_w: float | None = None
+    decode_avg_power_w: float | None = None
 
-    if disagg and power_by_worker is not None:
-        stage = _disagg_stage_energies(power_by_worker, duration)
+    if disagg and workers is not None:
+        stage = _disagg_stage_rollup(workers, duration)
         if stage is not None:
-            prefill_energy_j, decode_energy_j = stage
             # Per-stage attribution: prefill workers process input tokens,
             # decode workers process output tokens. Strictly more accurate
             # than total-energy ratios when prefill/decode have different
             # per-GPU power profiles (typical: prefill is compute-bound and
-            # draws more than memory-bound decode).
-            joules_per_output_token = decode_energy_j / total_output
-            joules_per_input_token = (
-                prefill_energy_j / total_input if total_input > 0 else None
+            # draws more than memory-bound decode). Exposed as additional
+            # flat scalars so the cluster-wide joules_per_output_token stays
+            # comparable across topologies.
+            prefill_avg_power_w = stage["prefill_avg_power_w"]
+            decode_avg_power_w = stage["decode_avg_power_w"]
+            joules_per_output_token_decode = (
+                stage["decode_energy_j"] / total_output
             )
-            joules_per_total_token = (
-                (prefill_energy_j + decode_energy_j) / total_tokens
-                if total_tokens > 0
-                else joules_per_output_token
-            )
-        else:
-            # disagg=true but workers don't split into prefill+decode (e.g.
-            # only one role's CSVs survived). Fall back to cluster math.
-            joules_per_output_token = total_system_energy_j / total_output
-            joules_per_total_token = (
-                total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token
+            joules_per_input_token = (
+                stage["prefill_energy_j"] / total_input if total_input > 0 else None
             )
-    else:
-        # Single-node or non-disagg multinode: keep the cluster-wide ratios
-        # backward-compatible with everything that consumed the pre-disagg
-        # schema.
-        joules_per_output_token = total_system_energy_j / total_output
-        joules_per_total_token = (
-            total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token
-        )
 
     if not agg_result.is_file():
         print(
@@ -555,14 +790,21 @@ def run(
             joules_per_output_token,
             joules_per_total_token,
             joules_per_input_token=joules_per_input_token,
-            power_by_worker=power_by_worker,
+            joules_per_output_token_decode=joules_per_output_token_decode,
+            prefill_avg_power_w=prefill_avg_power_w,
+            decode_avg_power_w=decode_avg_power_w,
+            avg_temp_c=avg_temp_c,
+            peak_temp_c=peak_temp_c,
+            avg_util_pct=avg_util_pct,
+            avg_mem_used_mb=avg_mem_used_mb,
+            workers=workers,
         )
     except (OSError, json.JSONDecodeError) as exc:
         print(f"[aggregate_power] Failed to patch {agg_result}: {exc}", file=sys.stderr)
         return 0
 
     worker_summary = (
-        f"workers={len(power_by_worker)}" if power_by_worker else "workers=cluster-only"
+        f"workers={len(workers)}" if workers else "workers=cluster-only"
     )
     jpit_summary = (
         f"joules_per_input_token={joules_per_input_token:.4f} "
@@ -612,10 +854,12 @@ def main() -> int:
     parser.add_argument(
         "--disagg",
         action="store_true",
-        help="Treat as disaggregated inference: emit joules_per_input_token using "
-        "per-stage energy attribution (prefill workers' energy / input tokens, "
-        "decode workers' energy / output tokens). Requires CSV filenames to carry "
-        "the perfmon role/index encoding.",
+        help="Treat as disaggregated inference: emit prefill_avg_power_w, "
+        "decode_avg_power_w, joules_per_input_token, and "
+        "joules_per_output_token_decode using per-stage energy attribution "
+        "(prefill workers' energy / input tokens, decode workers' energy / "
+        "output tokens). Requires CSV filenames to carry the perfmon role/index "
+        "encoding.",
     )
     args = parser.parse_args()
 
diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py
index ed6ca69ab..fb33ea265 100644
--- a/utils/test_aggregate_power.py
+++ b/utils/test_aggregate_power.py
@@ -23,10 +23,12 @@
 sys.path.insert(0, str(Path(__file__).parent))
 
 from aggregate_power import (  # noqa: E402
+    _detect_all_columns,
     _detect_columns,
     _parse_perfmon_label,
     _parse_power,
     _parse_timestamp,
+    aggregate_metrics,
     aggregate_power,
     aggregate_power_by_worker,
     patch_agg_result,
@@ -824,15 +826,20 @@ def test_aggregate_power_by_worker_skips_unlabeled_silently(tmp_path: Path):
 # --------------------------------------------------------------------------- #
 
 
-def test_run_disagg_emits_power_by_worker_and_per_stage_joules(tmp_path: Path):
-    """Full disagg pipeline: per-worker breakdown + per-stage J/input + J/output.
+def test_run_disagg_emits_workers_and_per_stage_joules(tmp_path: Path):
+    """Full disagg pipeline: workers[] breakdown + per-stage scalars next to
+    cluster-wide joules.
 
     Topology: 2 prefill workers × 4 GPUs @ 600W, 1 decode worker × 8 GPUs @ 400W.
     Over a 10s bench window with 8000 input + 1000 output tokens:
-      - prefill energy = 600 × 8 × 10 = 48_000 J  → J/input = 48_000 / 8000 = 6.0
-      - decode energy  = 400 × 8 × 10 = 32_000 J  → J/output = 32_000 / 1000 = 32.0
-      - total energy   = 80_000 J                  → J/total = 80_000 / 9000 ≈ 8.889
-    Cluster-wide avg_power_w stays the weighted mean across all 16 GPUs."""
+      - prefill energy = 600 × 8 × 10 = 48_000 J  → J/input          = 6.0
+      - decode energy  = 400 × 8 × 10 = 32_000 J  → J/output_decode  = 32.0
+      - total energy   = 80_000 J                  → cluster J/output = 80.0
+                                                   → cluster J/total ≈ 8.889
+    Cluster-wide avg_power_w stays the weighted mean across all 16 GPUs.
+    The per-stage decode attribution is exposed as
+    `joules_per_output_token_decode` so the cluster-wide
+    `joules_per_output_token` stays comparable across topologies."""
     base = 1_700_000_000.0
     _write_nvidia_csv(
         tmp_path / "perf_samples_prefill_w0_pn0.csv",
@@ -869,12 +876,19 @@ def test_run_disagg_emits_power_by_worker_and_per_stage_joules(tmp_path: Path):
     # Cluster-wide avg = (8*600 + 8*400) / 16 = 500W.
     assert patched["avg_power_w"] == pytest.approx(500.0)
 
-    # Per-stage J/token: prefill energy / input, decode energy / output.
+    # Cluster-wide joules (total_system_energy / token_count) — same math as
+    # single-node so the metric stays comparable across topologies.
+    assert patched["joules_per_output_token"] == pytest.approx(80_000 / 1000)   # 80.0
+    assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000)    # ≈ 8.889
+
+    # Per-stage scalars (new): prefill_avg, decode_avg, J/input, J/output_decode.
+    assert patched["prefill_avg_power_w"] == pytest.approx(600.0)
+    assert patched["decode_avg_power_w"] == pytest.approx(400.0)
     assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000)   # 6.0
-    assert patched["joules_per_output_token"] == pytest.approx(32_000 / 1000)  # 32.0
-    assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000)   # ≈ 8.889
+    assert patched["joules_per_output_token_decode"] == pytest.approx(32_000 / 1000)  # 32.0
 
-    workers = patched["power_by_worker"]
+    # workers[] (renamed from power_by_worker).
+    workers = patched["workers"]
     assert [w["role"] for w in workers] == ["prefill", "prefill", "decode"]
     assert [w["worker_idx"] for w in workers] == [0, 1, 0]
     # Decode_w0 collapsed across 2 hosts → 8 GPUs total.
@@ -890,11 +904,13 @@ def test_run_disagg_emits_power_by_worker_and_per_stage_joules(tmp_path: Path):
 
 
 def test_run_disagg_excludes_frontend_from_per_stage_energy(tmp_path: Path):
-    """A frontend-only node's power must not contribute to J/input or J/output.
+    """A frontend-only node's power must not contribute to per-stage scalars.
 
     Frontend nodes don't run any backend worker — their (typically near-idle)
     GPU draw would skew per-stage attribution if counted. They still appear
-    in power_by_worker for observability."""
+    in workers[] for observability, and they DO contribute to the cluster-wide
+    avg_power_w / joules_per_*_token totals (which describe the whole
+    deployment's energy)."""
     base = 1_700_000_000.0
     # Prefill worker — 4 GPUs @ 600W → 24_000 J in 10s
     _write_nvidia_csv(
@@ -906,7 +922,8 @@ def test_run_disagg_excludes_frontend_from_per_stage_energy(tmp_path: Path):
         tmp_path / "perf_samples_decode_w0_dn0.csv",
         [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(4)],
     )
-    # Frontend node — would erroneously add 4_000 J if counted.
+    # Frontend node — would erroneously bleed into per-stage scalars if counted,
+    # but DOES count toward cluster avg/joules (it's still energy consumed).
     _write_nvidia_csv(
         tmp_path / "perf_samples_frontend_w0_head.csv",
         [(base + 1 + s, gpu, 100.0) for s in range(8) for gpu in range(4)],
@@ -922,21 +939,32 @@ def test_run_disagg_excludes_frontend_from_per_stage_energy(tmp_path: Path):
     assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
     patched = json.loads(agg.read_text())
 
-    # J/input = 24_000 / 8000 = 3.0 (frontend excluded).
+    # Per-stage scalars (frontend excluded).
+    # J/input = 24_000 / 8000 = 3.0.
     assert patched["joules_per_input_token"] == pytest.approx(3.0)
-    # J/output = 16_000 / 1000 = 16.0 (frontend excluded).
-    assert patched["joules_per_output_token"] == pytest.approx(16.0)
+    # J/output_decode = 16_000 / 1000 = 16.0.
+    assert patched["joules_per_output_token_decode"] == pytest.approx(16.0)
+    assert patched["prefill_avg_power_w"] == pytest.approx(600.0)
+    assert patched["decode_avg_power_w"] == pytest.approx(400.0)
+
+    # Cluster-wide J/output still uses TOTAL energy (incl. frontend).
+    # total energy = (600+400+100) × 4 × 10 = 44_000 J → 44.0 J/output_tok.
+    assert patched["joules_per_output_token"] == pytest.approx(44.0)
+
     # Frontend still appears in the worker list for observability.
-    roles = [w["role"] for w in patched["power_by_worker"]]
+    roles = [w["role"] for w in patched["workers"]]
     assert "frontend" in roles
 
 
-def test_run_non_disagg_omits_joules_per_input_token(tmp_path: Path):
+def test_run_non_disagg_omits_per_stage_scalars(tmp_path: Path):
     """Non-disagg runs (single-node or multinode-agg) keep the legacy schema.
 
-    No joules_per_input_token field — it'd be meaningless without a prefill
-    stage to attribute energy to. Existing fields must keep their pre-disagg
-    semantics (total_system_energy / token_count)."""
+    No per-stage scalars (prefill_avg_power_w / decode_avg_power_w /
+    joules_per_input_token / joules_per_output_token_decode): those need disagg.
+    No workers[] either: the unlabeled gpu_metrics.csv can't be tied to a worker.
+
+    Existing fields must keep their pre-disagg semantics
+    (total_system_energy / token_count)."""
     base = 1_700_000_000.0
     csv = tmp_path / "gpu_metrics.csv"
     _write_nvidia_csv(
@@ -951,8 +979,15 @@ def test_run_non_disagg_omits_joules_per_input_token(tmp_path: Path):
 
     assert run(csv, bench, agg, disagg=False) == 0
     patched = json.loads(agg.read_text())
-    assert "joules_per_input_token" not in patched
-    assert "power_by_worker" not in patched
+    for absent in (
+        "joules_per_input_token",
+        "joules_per_output_token_decode",
+        "prefill_avg_power_w",
+        "decode_avg_power_w",
+        "workers",
+        "power_by_worker",  # the old name must NOT leak through either
+    ):
+        assert absent not in patched, f"unexpected key {absent} in non-disagg output"
     # Legacy semantics: total energy / token count.
     assert patched["joules_per_output_token"] == pytest.approx(2.0)
     assert patched["joules_per_total_token"] == pytest.approx(2.0)
@@ -960,8 +995,8 @@ def test_run_non_disagg_omits_joules_per_input_token(tmp_path: Path):
 
 def test_run_disagg_falls_back_to_cluster_when_only_one_stage_present(tmp_path: Path):
     """If only prefill or only decode CSVs survived, per-stage attribution
-    isn't possible — must fall back to cluster-wide ratios so the run still
-    publishes something useful instead of dropping the field entirely."""
+    isn't possible — the per-stage scalars are omitted but cluster-wide ratios
+    are still published so the run isn't telemetry-blank."""
     base = 1_700_000_000.0
     # Only prefill CSVs — decode is missing entirely.
     _write_nvidia_csv(
@@ -977,17 +1012,24 @@ def test_run_disagg_falls_back_to_cluster_when_only_one_stage_present(tmp_path:
 
     assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
     patched = json.loads(agg.read_text())
-    # power_by_worker still emitted (one prefill worker).
-    assert len(patched["power_by_worker"]) == 1
-    # J/input absent (no per-stage attribution possible).
-    assert "joules_per_input_token" not in patched
-    # J/output falls back to cluster-wide (total_energy / output_tokens).
+    # workers[] still emitted (one prefill worker, useful for observability).
+    assert len(patched["workers"]) == 1
+    # Per-stage scalars absent (no decode stage to attribute to).
+    for absent in (
+        "joules_per_input_token",
+        "joules_per_output_token_decode",
+        "prefill_avg_power_w",
+        "decode_avg_power_w",
+    ):
+        assert absent not in patched, f"unexpected per-stage key {absent}"
+    # Cluster-wide J/output still emitted (total_energy / output_tokens).
     assert patched["joules_per_output_token"] == pytest.approx(24_000 / 1000)
 
 
 def test_run_disagg_handles_zero_input_tokens(tmp_path: Path):
     """total_input_tokens=0 (rare degenerate case) → joules_per_input_token
-    omitted, no ZeroDivisionError."""
+    omitted, no ZeroDivisionError. Per-stage decode + per-stage power scalars
+    still emitted (those don't depend on input tokens)."""
     base = 1_700_000_000.0
     _write_nvidia_csv(
         tmp_path / "perf_samples_prefill_w0_pn0.csv",
@@ -1007,10 +1049,15 @@ def test_run_disagg_handles_zero_input_tokens(tmp_path: Path):
     assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
     patched = json.loads(agg.read_text())
     assert "joules_per_input_token" not in patched
-    assert patched["joules_per_output_token"] == pytest.approx(16_000 / 1000)
+    # Per-stage decode still works — depends only on decode_energy / output.
+    assert patched["joules_per_output_token_decode"] == pytest.approx(16_000 / 1000)
+    assert patched["prefill_avg_power_w"] == pytest.approx(600.0)
+    assert patched["decode_avg_power_w"] == pytest.approx(400.0)
+    # Cluster-wide J/output uses TOTAL energy. (600+400) × 4 × 10 = 40_000 J.
+    assert patched["joules_per_output_token"] == pytest.approx(40_000 / 1000)
 
 
-def test_patch_agg_result_with_per_worker_and_per_stage(tmp_path: Path):
+def test_patch_agg_result_with_workers_and_per_stage(tmp_path: Path):
     """patch_agg_result emits the new optional fields when supplied."""
     agg = tmp_path / "agg.json"
     agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
@@ -1021,15 +1068,24 @@ def test_patch_agg_result_with_per_worker_and_per_stage(tmp_path: Path):
     patch_agg_result(
         agg,
         avg_power_w=500.0,
-        joules_per_output_token=16.0,
+        joules_per_output_token=40.0,
         joules_per_total_token=4.44,
         joules_per_input_token=3.0,
-        power_by_worker=workers,
+        joules_per_output_token_decode=16.0,
+        prefill_avg_power_w=600.0,
+        decode_avg_power_w=400.0,
+        workers=workers,
     )
     data = json.loads(agg.read_text())
     assert data["avg_power_w"] == 500.0
+    assert data["joules_per_output_token"] == 40.0
     assert data["joules_per_input_token"] == 3.0
-    assert data["power_by_worker"] == workers
+    assert data["joules_per_output_token_decode"] == 16.0
+    assert data["prefill_avg_power_w"] == 600.0
+    assert data["decode_avg_power_w"] == 400.0
+    assert data["workers"] == workers
+    # power_by_worker (old name) must NOT appear.
+    assert "power_by_worker" not in data
 
 
 def test_patch_agg_result_omits_optional_fields_when_none(tmp_path: Path):
@@ -1043,5 +1099,550 @@ def test_patch_agg_result_omits_optional_fields_when_none(tmp_path: Path):
         joules_per_total_token=0.5,
     )
     data = json.loads(agg.read_text())
-    assert "joules_per_input_token" not in data
-    assert "power_by_worker" not in data
+    for absent in (
+        "joules_per_input_token",
+        "joules_per_output_token_decode",
+        "prefill_avg_power_w",
+        "decode_avg_power_w",
+        "avg_temp_c",
+        "peak_temp_c",
+        "avg_util_pct",
+        "avg_mem_used_mb",
+        "workers",
+        "power_by_worker",
+    ):
+        assert absent not in data, f"unexpected key {absent} in minimal patch"
+
+
+# --------------------------------------------------------------------------- #
+# Telemetry: temperature, utilization, memory
+#
+# These extend aggregate_metrics()'s capability beyond power. Frontend already
+# wires avg_temp_c / avg_util_pct / avg_mem_used_mb / peak_temp_c as scalar
+# numerics (unit-suffixed names; avg_* are per-GPU means like avg_power_w,
+# peak_temp_c is a max). Power is still required; the others degrade gracefully.
+# --------------------------------------------------------------------------- #
+
+
+def _write_csv_with_metrics(
+    path: Path,
+    samples: list[tuple[float, int, dict[str, float]]],
+    *,
+    columns: tuple[str, ...] = ("power.draw [W]", "temperature.gpu", "utilization.gpu", "memory.used [MiB]"),
+    column_map: dict[str, str] | None = None,
+) -> None:
+    """Write a CSV with arbitrary metric columns.
+
+    samples: list of (epoch_seconds, gpu_index, {metric_key: value}). Each
+    metric_key must be one of: 'power', 'temp', 'util', 'mem'.
+    columns: the literal CSV header names for the metric columns, in order.
+    column_map: metric_key → header name in `columns` (default: pair columns
+    positionally with 'power', 'temp', 'util', 'mem'; short tuples are fine).
+    """
+    if column_map is None:
+        column_map = dict(zip(("power", "temp", "util", "mem"), columns))
+    header = "timestamp, index, " + ", ".join(columns)
+    lines = [header]
+    for ts, idx, vals in samples:
+        row = [_nvidia_ts(ts), str(idx)]
+        for col in columns:
+            metric_key = next((k for k, hdr in column_map.items() if hdr == col), None)
+            v = vals.get(metric_key)
+            if v is None:
+                row.append("[N/A]")
+            elif metric_key == "power":  # keyed by metric, not by column position
+                row.append(f"{v:.2f} W")
+            elif "temp" in col.lower():
+                row.append(f"{int(v)} C")
+            elif "util" in col.lower():
+                row.append(f"{int(v)} %")
+            elif "mem" in col.lower():
+                row.append(f"{int(v)} MiB")
+            else:
+                row.append(str(v))
+        lines.append(", ".join(row))
+    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+
+def test_detect_all_columns_nvidia():
+    """NVIDIA header has all four metrics — each maps to its canonical column."""
+    header = ["timestamp", "index", "power.draw [W]", "temperature.gpu",
+              "utilization.gpu", "memory.used [MiB]"]
+    cols = _detect_all_columns(header)
+    assert cols["timestamp"] == "timestamp"
+    assert cols["gpu"] == "index"
+    assert cols["power"] == "power.draw [W]"
+    assert cols["temp"] == "temperature.gpu"
+    assert cols["util"] == "utilization.gpu"
+    assert cols["mem"] == "memory.used [MiB]"
+
+
+def test_detect_all_columns_srt_slurm_style():
+    """srt-slurm perfmon uses bare-name columns: power_w, temp_c, util_pct, mem_used_mb."""
+    header = ["timestamp", "gpu", "power_w", "temp_c", "util_pct", "mem_used_mb"]
+    cols = _detect_all_columns(header)
+    assert cols["power"] == "power_w"
+    assert cols["temp"] == "temp_c"
+    assert cols["util"] == "util_pct"
+    assert cols["mem"] == "mem_used_mb"
+
+
+def test_detect_all_columns_amd_style():
+    """AMD amd-smi uses different conventions: socket_power, temperature."""
+    header = ["timestamp", "gpu", "socket_power", "temperature"]
+    cols = _detect_all_columns(header)
+    assert cols["power"] == "socket_power"
+    assert cols["temp"] == "temperature"
+    # No util/mem in this header — gracefully None.
+    assert cols["util"] is None
+    assert cols["mem"] is None
+
+
+def test_detect_all_columns_excludes_memory_total():
+    """memory.total must not be picked as the memory column (we want USED memory)."""
+    header = ["timestamp", "index", "power.draw [W]", "memory.total [MiB]", "memory.used [MiB]"]
+    cols = _detect_all_columns(header)
+    assert cols["mem"] == "memory.used [MiB]"
+
+
+def test_detect_all_columns_missing_optional_metrics():
+    """Only power present — temp/util/mem all None."""
+    header = ["timestamp", "index", "power.draw [W]"]
+    cols = _detect_all_columns(header)
+    assert cols["power"] == "power.draw [W]"
+    assert cols["temp"] is None
+    assert cols["util"] is None
+    assert cols["mem"] is None
+
+
+def test_aggregate_metrics_returns_all_telemetry_single_node(tmp_path: Path):
+    """Cluster-wide aggregation captures power, temp, util, mem in one pass."""
+    csv = tmp_path / "gpu_metrics.csv"
+    base = 1_700_000_000.0
+    # 4 GPUs, 3 samples — uniform values per metric.
+    samples = []
+    for s in range(3):
+        for gpu in range(4):
+            samples.append(
+                (base + s, gpu, {"power": 500.0, "temp": 70.0, "util": 95.0, "mem": 60000.0})
+            )
+    _write_csv_with_metrics(csv, samples)
+    result = aggregate_metrics(csv, base, base + 10)
+    assert result is not None
+    assert result["num_gpus"] == 4
+    assert result["power"] == pytest.approx(500.0)
+    assert result["temp"] == pytest.approx(70.0)
+    assert result["util"] == pytest.approx(95.0)
+    assert result["mem"] == pytest.approx(60000.0)
+    assert result["peak_temp"] == pytest.approx(70.0)  # uniform → peak == avg
+
+
+def test_aggregate_metrics_peak_temp_is_max_not_mean(tmp_path: Path):
+    """peak_temp_c is the global max instantaneous reading, not a per-bucket mean.
+
+    Critical for thermal-headroom signals: a single GPU hitting 85C during the
+    run matters even if the cluster mean stays at 70C."""
+    csv = tmp_path / "gpu_metrics.csv"
+    base = 1_700_000_000.0
+    samples = []
+    # 4 GPUs at 70C steadily, EXCEPT one GPU spikes to 85C in the middle sample.
+    for s in range(3):
+        for gpu in range(4):
+            temp = 85.0 if (s == 1 and gpu == 2) else 70.0
+            samples.append((base + s, gpu, {"power": 500.0, "temp": temp}))
+    _write_csv_with_metrics(
+        csv, samples,
+        columns=("power.draw [W]", "temperature.gpu"),
+        column_map={"power": "power.draw [W]", "temp": "temperature.gpu"},
+    )
+    result = aggregate_metrics(csv, base, base + 10)
+    assert result is not None
+    # Mean spans all 12 readings (11 at 70, 1 at 85): (11*70 + 85)/12 = 71.25.
+    assert result["temp"] == pytest.approx((11 * 70 + 85) / 12, abs=0.01)
+    # Peak is the raw max sample, not any averaged value.
+    assert result["peak_temp"] == pytest.approx(85.0)
+
+
+def test_aggregate_metrics_missing_temp_column_omits_temp(tmp_path: Path):
+    """A CSV without a temp column → result dict has no 'temp' / 'peak_temp' keys.
+
+    Graceful degradation: callers using .get() / 'temp' in result handle this
+    naturally."""
+    csv = tmp_path / "gpu_metrics.csv"
+    base = 1_700_000_000.0
+    # Header has ONLY power.
+    samples = [(base + s, gpu, {"power": 500.0}) for s in range(3) for gpu in range(4)]
+    _write_csv_with_metrics(
+        csv, samples,
+        columns=("power.draw [W]",),
+        column_map={"power": "power.draw [W]"},
+    )
+    result = aggregate_metrics(csv, base, base + 10)
+    assert result is not None
+    assert result["power"] == pytest.approx(500.0)
+    assert "temp" not in result
+    assert "peak_temp" not in result
+    assert "util" not in result
+    assert "mem" not in result
+
+
+def test_aggregate_metrics_missing_util_only_keeps_others(tmp_path: Path):
+    """Power + temp + mem present but no util column → util omitted, rest fine.
+
+    Mirrors the AMD case where amd-smi output may lack a utilization column."""
+    csv = tmp_path / "gpu_metrics.csv"
+    base = 1_700_000_000.0
+    samples = [
+        (base + s, gpu, {"power": 500.0, "temp": 70.0, "mem": 60000.0})
+        for s in range(3) for gpu in range(4)
+    ]
+    _write_csv_with_metrics(
+        csv, samples,
+        columns=("power.draw [W]", "temperature.gpu", "memory.used [MiB]"),
+        column_map={"power": "power.draw [W]", "temp": "temperature.gpu", "mem": "memory.used [MiB]"},
+    )
+    result = aggregate_metrics(csv, base, base + 10)
+    assert result is not None
+    assert "util" not in result
+    assert result["temp"] == pytest.approx(70.0)
+    assert result["mem"] == pytest.approx(60000.0)
+
+
+def test_aggregate_metrics_multinode_aggregates_across_csvs(tmp_path: Path):
+    """Multinode telemetry rolls up across per-node CSVs same as power.
+
+    Per-GPU mean is weighted by the (per-sample, per-namespace) GPU count."""
+    base = 1_700_000_000.0
+    node1 = tmp_path / "perf_samples_node1.csv"
+    node2 = tmp_path / "perf_samples_node2.csv"
+    _write_csv_with_metrics(
+        node1,
+        [(base + s, gpu, {"power": 600.0, "temp": 75.0, "util": 95.0, "mem": 60000.0})
+         for s in range(3) for gpu in range(4)],
+    )
+    _write_csv_with_metrics(
+        node2,
+        [(base + s, gpu, {"power": 400.0, "temp": 65.0, "util": 85.0, "mem": 40000.0})
+         for s in range(3) for gpu in range(4)],
+    )
+    result = aggregate_metrics([node1, node2], base, base + 10)
+    assert result is not None
+    assert result["num_gpus"] == 8
+    # All metrics are weighted means across the 8 distinct GPUs.
+    assert result["power"] == pytest.approx(500.0)  # (600+400)/2
+    assert result["temp"] == pytest.approx(70.0)    # (75+65)/2
+    assert result["util"] == pytest.approx(90.0)
+    assert result["mem"] == pytest.approx(50000.0)
+    assert result["peak_temp"] == pytest.approx(75.0)
+
+
+def test_run_patches_cluster_wide_temp_util_mem(tmp_path: Path):
+    """End-to-end: run() patches cluster-wide telemetry into the agg JSON
+    when the CSV exposes the corresponding columns."""
+    base = 1_700_000_000.0
+    csv = tmp_path / "gpu_metrics.csv"
+    samples = [
+        (base + 1 + s, gpu, {"power": 500.0, "temp": 70.0, "util": 95.0, "mem": 60000.0})
+        for s in range(2) for gpu in range(8)
+    ]
+    _write_csv_with_metrics(csv, samples)
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(bench, start=base, end=base + 10, duration=10.0, total_output=20_000)
+    agg.write_text(json.dumps({"hw": "h200"}), encoding="utf-8")
+
+    assert run(csv, bench, agg) == 0
+    patched = json.loads(agg.read_text())
+    # Power baseline still works.
+    assert patched["avg_power_w"] == pytest.approx(500.0)
+    # New cluster-wide scalars present and rounded to 3 decimals.
+    assert patched["avg_temp_c"] == pytest.approx(70.0)
+    assert patched["peak_temp_c"] == pytest.approx(70.0)
+    assert patched["avg_util_pct"] == pytest.approx(95.0)
+    assert patched["avg_mem_used_mb"] == pytest.approx(60000.0)
+
+
+def test_run_omits_cluster_telemetry_when_csv_has_no_extra_columns(tmp_path: Path):
+    """Power-only CSV → only avg_power_w + joules_per_*_token are emitted.
+
+    Backward compat with old CSVs / older monitoring setups that only captured
+    power. The agg JSON must not gain spurious null/zero values for the
+    metrics the CSV didn't carry."""
+    base = 1_700_000_000.0
+    csv = tmp_path / "gpu_metrics.csv"
+    # Power-only CSV (no temp/util/mem). _write_nvidia_csv always emits a
+    # temperature column, so use the metrics helper restricted to power.
+    samples = [(base + 1 + s, gpu, {"power": 500.0}) for s in range(2) for gpu in range(8)]
+    _write_csv_with_metrics(
+        csv, samples,
+        columns=("power.draw [W]",),
+        column_map={"power": "power.draw [W]"},
+    )
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(bench, start=base, end=base + 10, duration=10.0, total_output=20_000)
+    agg.write_text(json.dumps({"hw": "h200"}), encoding="utf-8")
+
+    assert run(csv, bench, agg) == 0
+    patched = json.loads(agg.read_text())
+    assert patched["avg_power_w"] == pytest.approx(500.0)
+    for absent in ("avg_temp_c", "peak_temp_c", "avg_util_pct", "avg_mem_used_mb"):
+        assert absent not in patched, f"unexpected {absent} when CSV lacks that column"
+
+
+def test_run_disagg_emits_per_worker_temp_util_mem(tmp_path: Path):
+    """Disagg multinode: each entry in workers[] carries per-worker telemetry
+    in addition to avg_power_w. Frontend can render thermal/util breakdown
+    by worker role."""
+    base = 1_700_000_000.0
+    # Prefill worker runs hotter (compute-bound) than decode (memory-bound).
+    _write_csv_with_metrics(
+        tmp_path / "perf_samples_prefill_w0_pn0.csv",
+        [(base + 1 + s, gpu, {"power": 600.0, "temp": 80.0, "util": 98.0, "mem": 50000.0})
+         for s in range(8) for gpu in range(4)],
+    )
+    _write_csv_with_metrics(
+        tmp_path / "perf_samples_decode_w0_dn0.csv",
+        [(base + 1 + s, gpu, {"power": 400.0, "temp": 65.0, "util": 70.0, "mem": 70000.0})
+         for s in range(8) for gpu in range(4)],
+    )
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(
+        bench, start=base, end=base + 10, duration=10.0,
+        total_output=1000, total_input=8000,
+    )
+    agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
+
+    assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
+    patched = json.loads(agg.read_text())
+
+    # Cluster-wide telemetry: weighted mean across all 8 GPUs.
+    assert patched["avg_temp_c"] == pytest.approx(72.5)        # (80+65)/2
+    assert patched["peak_temp_c"] == pytest.approx(80.0)
+    assert patched["avg_util_pct"] == pytest.approx(84.0)      # (98+70)/2
+    assert patched["avg_mem_used_mb"] == pytest.approx(60000.0)
+
+    workers = patched["workers"]
+    prefill = next(w for w in workers if w["role"] == "prefill")
+    decode = next(w for w in workers if w["role"] == "decode")
+    # Per-worker fields present alongside avg_power_w.
+    assert prefill["avg_temp_c"] == pytest.approx(80.0)
+    assert prefill["peak_temp_c"] == pytest.approx(80.0)
+    assert prefill["avg_util_pct"] == pytest.approx(98.0)
+    assert prefill["avg_mem_used_mb"] == pytest.approx(50000.0)
+    assert decode["avg_temp_c"] == pytest.approx(65.0)
+    assert decode["avg_util_pct"] == pytest.approx(70.0)
+    assert decode["avg_mem_used_mb"] == pytest.approx(70000.0)
+
+
+def test_run_per_worker_omits_missing_telemetry_columns(tmp_path: Path):
+    """If a worker's CSV lacks a temp/util/mem column, those keys are
+    omitted from that worker's entry — no nulls leak through."""
+    base = 1_700_000_000.0
+    # Prefill: full schema (power + temp + util + mem).
+    _write_csv_with_metrics(
+        tmp_path / "perf_samples_prefill_w0_pn0.csv",
+        [(base + 1 + s, gpu, {"power": 600.0, "temp": 80.0, "util": 98.0, "mem": 50000.0})
+         for s in range(8) for gpu in range(4)],
+    )
+    # Decode: power only — no other columns at all in its CSV.
+    _write_csv_with_metrics(
+        tmp_path / "perf_samples_decode_w0_dn0.csv",
+        [(base + 1 + s, gpu, {"power": 400.0}) for s in range(8) for gpu in range(4)],
+        columns=("power.draw [W]",),
+        column_map={"power": "power.draw [W]"},
+    )
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(
+        bench, start=base, end=base + 10, duration=10.0,
+        total_output=1000, total_input=8000,
+    )
+    agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8")
+
+    assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
+    patched = json.loads(agg.read_text())
+    workers = patched["workers"]
+    decode = next(w for w in workers if w["role"] == "decode")
+    # Decode worker has avg_power_w but none of the optional telemetry fields.
+    assert decode["avg_power_w"] == pytest.approx(400.0)
+    for absent in ("avg_temp_c", "peak_temp_c", "avg_util_pct", "avg_mem_used_mb"):
+        assert absent not in decode, f"unexpected {absent} on power-only decode worker"
+    # Prefill still has all of them.
+    prefill = next(w for w in workers if w["role"] == "prefill")
+    assert "avg_temp_c" in prefill
+    assert "avg_util_pct" in prefill
+    assert "avg_mem_used_mb" in prefill
+
+
+# --------------------------------------------------------------------------- #
+# AMD multi-node disaggregated inference (mi355x)
+#
+# The AMD path has no srt-slurm orchestrator: each SGLang/vLLM disagg node
+# starts its own amd-smi monitor via start_perf_monitor (benchmarks/
+# benchmark_lib.sh), writing perf_samples_<role>_w<idx>_<host>.csv in the SAME
+# convention as the NVIDIA perfmon. These tests lock in that the (vendor-
+# agnostic) aggregation produces the full per-worker / per-stage schema when
+# fed amd-smi CSVs — ISO timestamps, bare-numeric power, "gpu"/"socket_power"
+# columns — over realistic MI355X (8 GPUs/node) disagg topologies and AMD
+# cluster hostnames. The NVIDIA-CSV tests above already cover the math; these
+# guard the AMD CSV format + filename round-trip end to end.
+# --------------------------------------------------------------------------- #
+
+
+def test_parse_perfmon_label_amd_hostname():
+    """AMD mi355x cluster hostnames (e.g. mia1-p01-g09) round-trip cleanly.
+
+    start_perf_monitor builds the filename from `hostname -s` sanitized with
+    `tr -c 'A-Za-z0-9.-' '_'`; AMD short hostnames are already alnum+dash, so
+    the host segment survives intact through _parse_perfmon_label."""
+    assert _parse_perfmon_label(
+        Path("perf_samples_prefill_w0_mia1-p01-g09.csv")
+    ) == ("prefill", 0, "mia1-p01-g09")
+    assert _parse_perfmon_label(
+        Path("perf_samples_decode_w2_smci355-ccs-aus-12.csv")
+    ) == ("decode", 2, "smci355-ccs-aus-12")
+
+
+def test_aggregate_power_by_worker_amd_one_csv_per_worker(tmp_path: Path):
+    """AMD amd-smi CSVs, one prefill + one decode worker, 8 GPUs/node (MI355X).
+
+    Same grouping logic as the NVIDIA case, but proves the amd-smi CSV schema
+    (ISO timestamp, bare power, 'gpu' index col) parses through the per-worker
+    rollup."""
+    base = 1_700_000_000.0
+    _write_amd_csv(
+        tmp_path / "perf_samples_prefill_w0_mia1-p01-g01.csv",
+        [(base + s, gpu, 600.0) for s in range(3) for gpu in range(8)],
+    )
+    _write_amd_csv(
+        tmp_path / "perf_samples_decode_w0_mia1-p01-g02.csv",
+        [(base + s, gpu, 400.0) for s in range(3) for gpu in range(8)],
+    )
+
+    workers = aggregate_power_by_worker(
+        list(tmp_path.glob("perf_samples_*.csv")), base, base + 10
+    )
+    assert workers is not None
+    assert [w["role"] for w in workers] == ["prefill", "decode"]
+    assert [w["worker_idx"] for w in workers] == [0, 0]
+    assert workers[0]["num_gpus"] == 8
+    assert workers[0]["avg_power_w"] == pytest.approx(600.0)
+    assert workers[0]["hosts"] == ["mia1-p01-g01"]
+    assert workers[1]["num_gpus"] == 8
+    assert workers[1]["avg_power_w"] == pytest.approx(400.0)
+
+
+def test_aggregate_power_by_worker_amd_worker_spans_multiple_nodes(tmp_path: Path):
+    """A single decode worker spanning 2 MI355X nodes (DECODE_TP_SIZE=16).
+
+    Both node-CSVs share (decode, w0); amd-smi reports local indices 0..7 on
+    each, so without per-source namespacing the union would collapse to 8
+    instead of 16. Mirrors the SGLang DECODE_NODES_PER_WORKER>1 topology."""
+    base = 1_700_000_000.0
+    hosts = ["mia1-p01-g05", "mia1-p01-g06"]
+    for h in hosts:
+        _write_amd_csv(
+            tmp_path / f"perf_samples_decode_w0_{h}.csv",
+            [(base + s, gpu, 400.0) for s in range(3) for gpu in range(8)],
+        )
+
+    workers = aggregate_power_by_worker(
+        list(tmp_path.glob("perf_samples_*.csv")), base, base + 10
+    )
+    assert workers is not None
+    assert len(workers) == 1
+    w = workers[0]
+    assert w["role"] == "decode"
+    assert w["worker_idx"] == 0
+    assert w["num_gpus"] == 16  # 2 nodes × 8 GPUs
+    assert w["avg_power_w"] == pytest.approx(400.0)
+    assert w["hosts"] == sorted(hosts)
+
+
+def test_run_disagg_amd_emits_workers_and_per_stage_joules(tmp_path: Path):
+    """Full AMD mi355x disagg pipeline end to end with amd-smi CSVs.
+
+    Topology: 1 prefill worker × 8 GPUs @ 600W, 1 decode worker × 8 GPUs @ 400W.
+    Over a 10s window with 8000 input + 1000 output tokens:
+      - prefill energy = 600 × 8 × 10 = 48_000 J  → J/input         = 6.0
+      - decode energy  = 400 × 8 × 10 = 32_000 J  → J/output_decode = 32.0
+      - total energy   = 80_000 J                  → cluster J/output = 80.0
+      - cluster avg    = (8×600 + 8×400)/16 = 500W
+    This is the AMD analogue of test_run_disagg_emits_workers_and_per_stage_joules."""
+    base = 1_700_000_000.0
+    _write_amd_csv(
+        tmp_path / "perf_samples_prefill_w0_mia1-p01-g01.csv",
+        [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(8)],
+    )
+    _write_amd_csv(
+        tmp_path / "perf_samples_decode_w0_mia1-p01-g02.csv",
+        [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(8)],
+    )
+
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(
+        bench, start=base, end=base + 10, duration=10.0,
+        total_output=1000, total_input=8000,
+    )
+    agg.write_text(json.dumps({"hw": "mi355x", "disagg": True}), encoding="utf-8")
+
+    assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
+    patched = json.loads(agg.read_text())
+
+    # Cluster-wide (vendor-agnostic, same math as single-node / NVIDIA).
+    assert patched["avg_power_w"] == pytest.approx(500.0)
+    assert patched["joules_per_output_token"] == pytest.approx(80_000 / 1000)  # 80.0
+    assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000)   # ≈ 8.889
+
+    # Per-stage scalars from amd-smi CSVs.
+    assert patched["prefill_avg_power_w"] == pytest.approx(600.0)
+    assert patched["decode_avg_power_w"] == pytest.approx(400.0)
+    assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000)   # 6.0
+    assert patched["joules_per_output_token_decode"] == pytest.approx(32_000 / 1000)  # 32.0
+
+    # workers[] breakdown.
+    workers = patched["workers"]
+    assert [w["role"] for w in workers] == ["prefill", "decode"]
+    assert all(w["num_gpus"] == 8 for w in workers)
+
+
+def test_run_disagg_amd_vllm_topology_one_worker_per_node(tmp_path: Path):
+    """vLLM AMD topology: xP=2 prefill + yD=2 decode, one worker per node.
+
+    server_vllm.sh labels ranks [0,xP) prefill (w=rank) and [xP, xP+yD) decode
+    (w=rank-xP). Four amd-smi CSVs, distinct worker indices per stage."""
+    base = 1_700_000_000.0
+    for w in range(2):
+        _write_amd_csv(
+            tmp_path / f"perf_samples_prefill_w{w}_mia1-p02-g0{w}.csv",
+            [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(8)],
+        )
+    for w in range(2):
+        _write_amd_csv(
+            tmp_path / f"perf_samples_decode_w{w}_mia1-p02-g1{w}.csv",
+            [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(8)],
+        )
+
+    bench = tmp_path / "bench.json"
+    agg = tmp_path / "agg.json"
+    _write_bench_result(
+        bench, start=base, end=base + 10, duration=10.0,
+        total_output=1000, total_input=8000,
+    )
+    agg.write_text(json.dumps({"hw": "mi355x", "disagg": True}), encoding="utf-8")
+
+    assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
+    patched = json.loads(agg.read_text())
+
+    workers = patched["workers"]
+    assert [w["role"] for w in workers] == ["prefill", "prefill", "decode", "decode"]
+    assert [w["worker_idx"] for w in workers] == [0, 1, 0, 1]
+    # 2 prefill workers × 8 GPUs @ 600W → 96_000 J / 8000 input = 12.0.
+    assert patched["joules_per_input_token"] == pytest.approx(96_000 / 8000)
+    # 2 decode workers × 8 GPUs @ 400W → 64_000 J / 1000 output = 64.0.
+    assert patched["joules_per_output_token_decode"] == pytest.approx(64_000 / 1000)
+    assert patched["prefill_avg_power_w"] == pytest.approx(600.0)
+    assert patched["decode_avg_power_w"] == pytest.approx(400.0)
diff --git a/utils/test_process_result.py b/utils/test_process_result.py
index 6b3fc9a94..ad931591b 100644
--- a/utils/test_process_result.py
+++ b/utils/test_process_result.py
@@ -755,14 +755,14 @@ def test_multinode_csv_glob_empty_match_falls_through_silently(self, tmp_path, s
         patched = json.loads(agg_path.read_text())
         assert "avg_power_w" not in patched
 
-    def test_disagg_multinode_emits_per_worker_and_per_stage_joules(self, tmp_path, multinode_env_vars):
+    def test_disagg_multinode_emits_workers_and_per_stage_joules(self, tmp_path, multinode_env_vars):
         """End-to-end disagg wiring: DISAGG=true + per-node labeled CSVs →
         process_result.py passes disagg through to aggregate_power, which emits
-        power_by_worker + joules_per_input_token using per-stage attribution.
+        workers[] + per-stage scalars alongside the cluster-wide joules.
 
-        Without the disagg=disagg propagation in process_result.py, the run
-        would silently fall back to cluster-wide joules math and the user-facing
-        per-stage J/input metric would be missing."""
+        Without the disagg=disagg propagation in process_result.py, the
+        per-stage scalars (joules_per_input_token, joules_per_output_token_decode,
+        prefill_avg_power_w, decode_avg_power_w) would be missing."""
         start, end = 1_700_000_100.0, 1_700_000_160.0  # 60s bench window
         # 1 prefill worker × 4 GPUs @ 600W on its own node
         self._write_nvidia_csv(
@@ -796,21 +796,28 @@ def test_disagg_multinode_emits_per_worker_and_per_stage_joules(self, tmp_path,
 
         patched = json.loads((tmp_path / "agg_benchmark_result.json").read_text())
 
-        # Per-stage attribution: prefill_energy / input, decode_energy / output.
+        # Per-stage attribution scalars: prefill_energy / input, decode_energy / output.
         # Prefill: 600 × 4 × 60 = 144_000 J  → / 240_000 = 0.6 J/input_tok.
-        # Decode:  400 × 4 × 60 =  96_000 J  → /  30_000 = 3.2 J/output_tok.
+        # Decode:  400 × 4 × 60 =  96_000 J  → /  30_000 = 3.2 J/output_tok_decode.
         assert patched["joules_per_input_token"] == pytest.approx(0.6, abs=0.01)
-        assert patched["joules_per_output_token"] == pytest.approx(3.2, abs=0.01)
+        assert patched["joules_per_output_token_decode"] == pytest.approx(3.2, abs=0.01)
+        assert patched["prefill_avg_power_w"] == pytest.approx(600.0, abs=0.5)
+        assert patched["decode_avg_power_w"] == pytest.approx(400.0, abs=0.5)
+
+        # Cluster-wide J/output (frontend energy would be included too, if present).
+        # Total energy = (600+400) × 4 × 60 = 240_000 J → / 30_000 = 8.0 J/output_tok.
+        assert patched["joules_per_output_token"] == pytest.approx(8.0, abs=0.05)
 
         # Per-worker breakdown labeled with role.
-        workers = patched["power_by_worker"]
+        workers = patched["workers"]
         assert {w["role"] for w in workers} == {"prefill", "decode"}
         for w in workers:
             assert w["num_gpus"] == 4
             assert w["worker_idx"] == 0
 
     def test_non_disagg_multinode_keeps_cluster_wide_joules_math(self, tmp_path, multinode_env_vars):
-        """Multinode but DISAGG=false → keep cluster-wide ratios, no J/input.
+        """Multinode but DISAGG=false → keep cluster-wide ratios, no per-stage
+        scalars.
 
         Sanity check that the disagg flag is the gate, not just multinode-ness."""
         start, end = 1_700_000_100.0, 1_700_000_160.0
@@ -841,7 +848,13 @@ def test_non_disagg_multinode_keeps_cluster_wide_joules_math(self, tmp_path, mul
         assert result.returncode == 0, f"Script failed: {result.stderr}"
 
         patched = json.loads((tmp_path / "agg_benchmark_result.json").read_text())
-        assert "joules_per_input_token" not in patched
-        # power_by_worker still emitted (filename labels exist) — useful for
+        for absent in (
+            "joules_per_input_token",
+            "joules_per_output_token_decode",
+            "prefill_avg_power_w",
+            "decode_avg_power_w",
+        ):
+            assert absent not in patched, f"unexpected per-stage key {absent}"
+        # workers[] still emitted (filename labels exist) — useful for
         # observability even on non-disagg runs.
-        assert patched["power_by_worker"][0]["role"] == "agg"
+        assert patched["workers"][0]["role"] == "agg"

From f407f4b4d653a517db91da5c3607a5016dfae7d5 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Thu, 28 May 2026 13:30:59 -0700
Subject: [PATCH 09/14] feat(power): AMD multi-node measured-power telemetry
 (mi355x disagg)

Mirror the NVIDIA gb300/srt-slurm measured-power path on the AMD
multi-node disaggregated inference path. With no orchestrator perfmon,
each SGLang/vLLM disagg node starts its own amd-smi monitor via
start_perf_monitor (benchmark_lib.sh), writing
perf_samples_<role>_w<idx>_<host>.csv into the NFS-shared
/benchmark_logs/perfmon mount; launch_mi355x-amds.sh collects them and
exports GPU_METRICS_CSV_GLOB so the existing vendor-agnostic
utils/aggregate_power.py produces per-worker + per-stage power.

AMD perfmon wiring:
- benchmark_lib.sh: start_perf_monitor helper; case-insensitive amd-smi
  header filter; log captured CSV header for schema-mismatch visibility
- amd_utils/job.slurm: PERFMON_OUTPUT_DIR + interval into each container
- amd_utils/server_sglang.sh / server_vllm.sh: per-node role + worker-idx
  classification (matches each engine's own placement); monitor start +
  stop on every exit path
- runners/launch_mi355x-amds.sh: collect per-node CSVs immediately after
  job completion (before result-processing early-exits / EXIT-trap wipe),
  export GPU_METRICS_CSV_GLOB
- utils/aggregate_power.py: docstring documents the AMD source (logic
  already vendor-agnostic)
- utils/test_aggregate_power.py: AMD amd-smi multinode tests (per-worker,
  per-stage J/token, multi-node-per-worker collapse, vLLM topology)
- perf-changelog.yaml: trigger the 6 mi355x disagg sweeps (sglang+vllm)

Also lands the concurrent per-metric telemetry extension in
aggregate_power.py / tests: temp/util/mem aggregation, workers[] schema,
and flat per-stage scalars (prefill_avg_power_w, decode_avg_power_w,
joules_per_input_token, joules_per_output_token_decode).

Verified locally: 107 utils tests pass; bash syntax + shellcheck clean;
role mapping + filename contract + full amd-smi->agg pipeline validated;
adversarial review findings addressed (CSV collection moved ahead of
early exits; case-insensitive amd-smi header).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 benchmarks/benchmark_lib.sh                   | 78 ++++++++++++++++++-
 benchmarks/multi_node/amd_utils/job.slurm     | 12 +++
 .../multi_node/amd_utils/server_sglang.sh     | 28 +++++++
 .../multi_node/amd_utils/server_vllm.sh       | 26 +++++++
 runners/launch_mi355x-amds.sh                 | 38 +++++++++
 5 files changed, 179 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 7dbbaaaa8..747b445c0 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -41,10 +41,18 @@ start_gpu_monitor() {
         GPU_MONITOR_PID=$!
         echo "[GPU Monitor] Started NVIDIA (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)"
     elif command -v amd-smi &>/dev/null; then
-        # Use amd-smi native watch mode (-w) which includes timestamps automatically.
-        # Pipe through awk to: skip preamble lines, keep first CSV header, skip repeated headers.
+        # amd-smi metric flags: -p power, -c clocks, -t temperature, -u usage,
+        # -w <interval> native watch mode (emits a timestamp column per sample),
+        # --csv. The awk filter keeps the first CSV header line and drops
+        # amd-smi's preamble / repeated headers. Header match is case-insensitive
+        # (tolower) so a capitalized "Timestamp," header — should amd-smi ever
+        # emit one — still passes through; aggregate_power's column detection is
+        # case-insensitive too. NOTE: amd-smi timestamps are node-local wall
+        # clock, so multinode aggregation assumes cluster clocks are NTP-synced
+        # (same assumption as nvidia-smi; aggregate_power windows by absolute
+        # epoch from benchmark_serving.py).
         amd-smi metric -p -c -t -u -w "$interval" --csv 2>/dev/null \
-            | awk '/^timestamp,/{if(!h){print;h=1};next} h{print}' > "$output" &
+            | awk 'tolower($0) ~ /^timestamp,/{if(!h){print;h=1};next} h{print}' > "$output" &
         GPU_MONITOR_PID=$!
         echo "[GPU Monitor] Started AMD (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)"
     else
@@ -63,11 +71,75 @@ stop_gpu_monitor() {
             local lines
             lines=$(wc -l < "$GPU_METRICS_CSV")
             echo "[GPU Monitor] Collected $lines rows -> $GPU_METRICS_CSV"
+            # Echo the captured header so a vendor-SMI schema mismatch (the one
+            # thing that silently yields 0 usable power samples downstream) is
+            # visible in CI logs without re-running on hardware.
+            echo "[GPU Monitor] CSV header: $(head -1 "$GPU_METRICS_CSV" 2>/dev/null)"
         fi
     fi
     GPU_MONITOR_PID=""
 }
 
+# Start a per-node GPU power monitor for multi-node disaggregated runs.
+#
+# This is the AMD/SGLang/vLLM analogue of NVIDIA srt-slurm's per-node perfmon
+# (PR #35): there is no orchestrator to spawn nvidia-smi on each node, so each
+# node starts its own amd-smi/nvidia-smi monitor here. The output filename
+# encodes the worker role and index in exactly the format
+# utils/aggregate_power.py's _parse_perfmon_label expects:
+#
+#     perf_samples_<role>_w<worker_idx>_<host>.csv
+#
+# so the downstream aggregation can attribute energy per worker and (for disagg)
+# per stage. role must be one of: prefill, decode, agg, frontend.
+#
+# Output goes to $PERFMON_OUTPUT_DIR, which job.slurm points at the NFS-shared
+# /benchmark_logs/perfmon mount so every node's CSV lands in one directory the
+# runner can collect. The monitor runs for the whole server lifetime;
+# aggregate_power.py windows the samples down to each concurrency's benchmark
+# load window using the timestamps benchmark_serving.py writes.
+#
+# Best-effort by design: an unset output dir, an unknown role, or a missing
+# amd-smi/nvidia-smi is a no-op that returns 0 — a monitoring hiccup must never
+# fail the benchmark.
+#
+# Usage: start_perf_monitor <role> <worker_idx> [interval_seconds]
+start_perf_monitor() {
+    local role="$1"
+    local worker_idx="$2"
+    local interval="${3:-${PERFMON_SAMPLE_INTERVAL:-1}}"
+
+    local out_dir="${PERFMON_OUTPUT_DIR:-}"
+    if [[ -z "$out_dir" ]]; then
+        echo "[perfmon] PERFMON_OUTPUT_DIR unset — skipping per-node power monitor"
+        return 0
+    fi
+    case "$role" in
+        prefill|decode|agg|frontend) ;;
+        *)
+            echo "[perfmon] unknown role '$role' (expected prefill|decode|agg|frontend) — skipping monitor"
+            return 0
+            ;;
+    esac
+    if ! mkdir -p "$out_dir" 2>/dev/null; then
+        echo "[perfmon] cannot create $out_dir — skipping per-node power monitor"
+        return 0
+    fi
+
+    # Sanitize the host component so the filename stays parseable by
+    # aggregate_power's regex. Prefer the short hostname, fall back to plain
+    # `hostname`, then to "unknown"; `|| host=""` keeps a failing hostname
+    # lookup from aborting a caller that runs under `set -e`.
+    local host
+    host=$(hostname -s 2>/dev/null || hostname 2>/dev/null) || host=""
+    host=$(printf '%s' "${host:-unknown}" | tr -c 'A-Za-z0-9.-' '_')
+
+    local out="${out_dir}/perf_samples_${role}_w${worker_idx}_${host}.csv"
+    echo "[perfmon] starting per-node power monitor: role=$role worker=$worker_idx host=$host interval=${interval}s -> $out"
+    start_gpu_monitor --output "$out" --interval "$interval" || echo "[perfmon] start_gpu_monitor failed — continuing without power monitor"
+    return 0
+}
+
 # Check if required environment variables are set
 # Usage: check_env_vars VAR1 VAR2 VAR3 ...
 # Exits with code 1 if any variable is not set
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
index 5e8e67606..102953eb8 100755
--- a/benchmarks/multi_node/amd_utils/job.slurm
+++ b/benchmarks/multi_node/amd_utils/job.slurm
@@ -298,6 +298,16 @@ export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
 export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}"
 export ENGINE=$ENGINE
 
+# Per-node measured-power monitoring. Each node's server script starts an
+# amd-smi/nvidia-smi monitor (start_perf_monitor in benchmark_lib.sh) that
+# writes perf_samples_<role>_w<idx>_<host>.csv into PERFMON_OUTPUT_DIR. Inside
+# the container that is /benchmark_logs/perfmon, i.e. the perfmon/ subdir of
+# BENCHMARK_LOGS_DIR on the (NFS-shared) host, so every node's CSV lands in one
+# place the runner can collect. Pre-create it on the host so the directory
+# exists before any container writes to it.
+export PERFMON_SAMPLE_INTERVAL="${PERFMON_SAMPLE_INTERVAL:-1}"
+mkdir -p "${BENCHMARK_LOGS_DIR}/perfmon" 2>/dev/null || true
+
 # Eval-related env vars (threaded from submit.sh)
 export RUN_EVAL="${RUN_EVAL:-false}"
 export EVAL_ONLY="${EVAL_ONLY:-false}"
@@ -375,6 +385,8 @@ DOCKER_ENV_COMMON=(
     -e RUNNER_TYPE=\$RUNNER_TYPE
     -e RESULT_FILENAME=\$RESULT_FILENAME
     -e SPEC_DECODING=\$SPEC_DECODING
+    -e PERFMON_OUTPUT_DIR=/benchmark_logs/perfmon
+    -e PERFMON_SAMPLE_INTERVAL=\$PERFMON_SAMPLE_INTERVAL
     -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE
     -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP
     -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP
diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
index c28ccab41..1c1be4e47 100755
--- a/benchmarks/multi_node/amd_utils/server_sglang.sh
+++ b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -48,6 +48,9 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
 # =============================================================================
 source $SGLANG_WS_PATH/setup_deps.sh
 source $SGLANG_WS_PATH/env.sh
+# Power-monitoring helpers (start_perf_monitor / stop_gpu_monitor). SGLANG_WS_PATH
+# is .../benchmarks/multi_node/amd_utils, so the shared lib is two levels up.
+source "$SGLANG_WS_PATH/../../benchmark_lib.sh"
 
 host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}')
 host_name=$(hostname)
@@ -279,6 +282,27 @@ done
 echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}"
 echo "Decode  worker headnode list: ${DECODE_HEADNODE_URLS[@]}"
 
+# =============================================================================
+# Per-node measured-power monitor (best-effort)
+# =============================================================================
+# Classify this node into the same worker buckets the role branches below use:
+#   NODE_RANK in [0, NODE_OFFSET)  -> prefill, worker = NODE_RANK / PREFILL_NODES_PER_WORKER
+#   NODE_RANK >= NODE_OFFSET       -> decode,  worker = (NODE_RANK - NODE_OFFSET) / DECODE_NODES_PER_WORKER
+# (NODE_OFFSET = PREFILL_NODES_PER_WORKER * xP.) Node 0 is the proxy too, but
+# its GPUs run the prefill head, so labeling it prefill attributes its energy
+# to the right stage. The monitor runs for the whole server lifetime;
+# aggregate_power.py windows the samples down to each concurrency's load window.
+if [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
+    PERF_ROLE="prefill"
+    PERF_WORKER_IDX=$(( NODE_RANK / PREFILL_NODES_PER_WORKER ))
+else
+    PERF_ROLE="decode"
+    PERF_WORKER_IDX=$(( (NODE_RANK - NODE_OFFSET) / DECODE_NODES_PER_WORKER ))
+fi
+if [[ "$DRY_RUN" -ne 1 ]]; then
+    start_perf_monitor "$PERF_ROLE" "$PERF_WORKER_IDX"
+fi
+
 # =============================================================================
 # Configuration Builder Functions
 # =============================================================================
@@ -636,6 +660,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
 
     if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then
         echo "ERROR: eval failed; exiting node-0 with rc=1"
+        stop_gpu_monitor
         exit 1
     fi
 
@@ -777,5 +802,8 @@ else
 
 fi
 
+# Stop the per-node power monitor and flush its CSV before the container exits.
+stop_gpu_monitor
+
 echo "Script completed successfully"
 exit 0
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
index d61fe0359..4e032dd3e 100755
--- a/benchmarks/multi_node/amd_utils/server_vllm.sh
+++ b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -50,6 +50,9 @@ MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}"
 # Dependencies and Environment Setup
 # =============================================================================
 source $WS_PATH/env.sh
+# Power-monitoring helpers (start_perf_monitor / stop_gpu_monitor). WS_PATH is
+# .../benchmarks/multi_node/amd_utils, so the shared lib is two levels up.
+source "$WS_PATH/../../benchmark_lib.sh"
 
 host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}')
 # RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available)
@@ -214,6 +217,25 @@ done
 echo "Prefill node IPs: ${PREFILL_ARGS}"
 echo "Decode  node IPs: ${DECODE_ARGS}"
 
+# =============================================================================
+# Per-node measured-power monitor (best-effort)
+# =============================================================================
+# vLLM places one worker per node: ranks [0, xP) are prefill (kv_producer),
+# ranks [xP, xP+yD) are decode (kv_consumer) — see the role branches below.
+# Node 0 is the proxy too, but its GPUs run the first prefill worker, so it is
+# correctly labeled prefill. The monitor runs for the whole server lifetime;
+# aggregate_power.py windows the samples down to each concurrency's load window.
+if [ "$NODE_RANK" -lt "$xP" ]; then
+    PERF_ROLE="prefill"
+    PERF_WORKER_IDX=$NODE_RANK
+else
+    PERF_ROLE="decode"
+    PERF_WORKER_IDX=$(( NODE_RANK - xP ))
+fi
+if [[ "$DRY_RUN" -ne 1 ]]; then
+    start_perf_monitor "$PERF_ROLE" "$PERF_WORKER_IDX"
+fi
+
 # MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address)
 PROXY_PING_PORT="${PROXY_PING_PORT:-36367}"
 
@@ -408,6 +430,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
 
     if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then
         echo "ERROR: eval failed; exiting node-0 with rc=1"
+        stop_gpu_monitor
         exit 1
     fi
 
@@ -523,5 +546,8 @@ fi
 # kill $etcd_pid 2>/dev/null || true
 # pkill -f etcd 2>/dev/null || true
 
+# Stop the per-node power monitor and flush its CSV before the container exits.
+stop_gpu_monitor
+
 echo "Script completed successfully"
 exit 0
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 9b6cb96a9..b893efa84 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -117,6 +117,43 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
 
     set -x
 
+    # ── Per-node measured-power CSVs ──────────────────────────────────────
+    # Collect these FIRST — immediately after the job completes and before the
+    # result-processing block below, which has early `exit 1` paths (e.g. no
+    # logs dir found). Any early exit fires the EXIT trap (cleanup_and_save_logs),
+    # which `sudo rm -rf`s the whole $BENCHMARK_LOGS_DIR — so anything that needs
+    # to survive must be copied out before then. This mirrors launch_gb300-cw.sh,
+    # which collects srt-slurm's perfmon CSVs right after the job completes.
+    #
+    # Each node's server script (server_sglang.sh / server_vllm.sh) wrote
+    # perf_samples_<role>_w<idx>_<host>.csv into $BENCHMARK_LOGS_DIR/perfmon
+    # (NFS-shared, one file per node). Copy them into the GH workspace and point
+    # the downstream "Process result" step at them via GPU_METRICS_CSV_GLOB so
+    # utils/aggregate_power.py can do the multi-CSV per-worker / per-stage
+    # aggregation. Best-effort: a monitoring hiccup must never fail the upload.
+    PERFMON_SRC_DIR="$BENCHMARK_LOGS_DIR/perfmon"
+    if ls "$PERFMON_SRC_DIR"/perf_samples_*.csv >/dev/null 2>&1; then
+        PERFMON_DST_DIR="$GITHUB_WORKSPACE/perfmon"
+        mkdir -p "$PERFMON_DST_DIR"
+        cp "$PERFMON_SRC_DIR"/perf_samples_*.csv "$PERFMON_DST_DIR"/ 2>/dev/null \
+            || sudo cp "$PERFMON_SRC_DIR"/perf_samples_*.csv "$PERFMON_DST_DIR"/ 2>/dev/null \
+            || true
+        # CSVs may be root-owned on NFS (containers run as root); make them
+        # readable by the runner user for the Process result step.
+        sudo chown -R "$(id -u):$(id -g)" "$PERFMON_DST_DIR" 2>/dev/null || true
+        perf_csv_count=$(ls "$PERFMON_DST_DIR"/perf_samples_*.csv 2>/dev/null | wc -l | tr -d ' ')
+        if [ "$perf_csv_count" -gt 0 ]; then
+            echo "[perfmon] Collected $perf_csv_count per-node perf_samples_*.csv -> $PERFMON_DST_DIR"
+            if [ -n "${GITHUB_ENV:-}" ]; then
+                echo "GPU_METRICS_CSV_GLOB=$PERFMON_DST_DIR/perf_samples_*.csv" >> "$GITHUB_ENV"
+            fi
+        else
+            echo "[perfmon] WARNING: perf_samples_*.csv present under $PERFMON_SRC_DIR but none copied to $PERFMON_DST_DIR — measured power aggregation will be skipped"
+        fi
+    else
+        echo "[perfmon] No perf_samples_*.csv found under $PERFMON_SRC_DIR — measured power aggregation will be skipped"
+    fi
+
     # FIXME: The below is bad and is a result of the indirection of the ways in which
     # Dynamo jobs are launched. In a follow-up PR, the location of the result file should not
     # depend on the runner, it should always be in the same spot in the GH workspace.
@@ -182,6 +219,7 @@ PY
     fi
 
     echo "All result files processed"
+
     # Use sync scancel to ensure nfs file handle is released in time
     set +x
     scancel_sync $JOB_ID

From dea49cd9745c2899fbc0cf0baa8ed96dcc8d53e9 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Thu, 28 May 2026 14:32:20 -0700
Subject: [PATCH 10/14] fix(power): address bot review findings on agg
 telemetry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- _MEM_EXCLUDE_RE now excludes "clock" and "util" (not just "total"), so
  nvidia-smi's clocks.current.memory (a frequency) and utilization.memory
  (a percent) are no longer mislabeled as avg_mem_used_mb. (cursor[bot] Medium)
- Remove dead _disagg_stage_energies shim — no callers. (cursor[bot] Low)
- Add regression test: mem detection ignores clock/util memory columns.

108 utils tests pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 utils/aggregate_power.py      | 23 ++++++++++-------------
 utils/test_aggregate_power.py | 18 ++++++++++++++++++
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py
index ee4327b83..0e3c6bc60 100644
--- a/utils/aggregate_power.py
+++ b/utils/aggregate_power.py
@@ -65,8 +65,10 @@
     "temperature". srt-slurm: "temp_c". Unit: Celsius.
   - Utilization: column name starts with "utilization" or contains "util".
     NVIDIA: "utilization.gpu". srt-slurm: "util_pct". Unit: percent.
-  - Memory: column name contains "mem" but not "total" (avoid "memory.total").
-    NVIDIA: "memory.used [MiB]". srt-slurm: "mem_used_mb". Unit: MiB/MB.
+  - Memory: column name contains "mem" but not "total"/"clock"/"util" — so
+    "memory.total", "clocks.current.memory" (a frequency), and
+    "utilization.memory" (a percent) are all rejected; only memory *used* is
+    picked. NVIDIA: "memory.used [MiB]". srt-slurm: "mem_used_mb". Unit: MiB/MB.
 
 Power is required for aggregation to fire; the other metrics degrade gracefully
 when their columns are absent (those fields are simply omitted from the output).
@@ -94,7 +96,12 @@
 _TEMP_COL_RE = re.compile(r"temp", re.IGNORECASE)
 _UTIL_COL_RE = re.compile(r"^utilization|util", re.IGNORECASE)
 _MEM_COL_RE = re.compile(r"mem", re.IGNORECASE)
-_MEM_EXCLUDE_RE = re.compile(r"total", re.IGNORECASE)
+# Exclude "total" (memory.total), "clock" (clocks.current.memory — a frequency,
+# not memory used), and "util" (utilization.memory — a percent). nvidia-smi's
+# query emits clocks.current.memory BEFORE any used-memory column, so without
+# these excludes _MEM_COL_RE would grab the memory *clock* (~2500 MHz) as
+# avg_mem_used_mb.
+_MEM_EXCLUDE_RE = re.compile(r"total|clock|util", re.IGNORECASE)
 _TIMESTAMP_COL_RE = re.compile(r"time", re.IGNORECASE)
 _GPU_INDEX_COL_RE = re.compile(r"^(index|gpu|gpu_id|gpu_index|card|device)$", re.IGNORECASE)
 _NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?")
@@ -690,16 +697,6 @@ def _disagg_stage_rollup(
     }
 
 
-# Backward-compat shim — the original API returned just the two energy values.
-def _disagg_stage_energies(
-    workers: list[dict], duration: float
-) -> tuple[float, float] | None:
-    res = _disagg_stage_rollup(workers, duration)
-    if res is None:
-        return None
-    return res["prefill_energy_j"], res["decode_energy_j"]
-
-
 def run(
     csv_path: Path | Iterable[Path],
     bench_result: Path,
diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py
index fb33ea265..bccf1a98e 100644
--- a/utils/test_aggregate_power.py
+++ b/utils/test_aggregate_power.py
@@ -1205,6 +1205,24 @@ def test_detect_all_columns_excludes_memory_total():
     assert cols["mem"] == "memory.used [MiB]"
 
 
+def test_detect_all_columns_mem_ignores_clock_and_util_memory():
+    """The real nvidia-smi query has NO used-memory column — only
+    clocks.current.memory (a frequency) and utilization.memory (a percent),
+    both of which contain "mem". Neither is memory *used*, so the mem column
+    must resolve to None rather than mislabeling the memory clock as
+    avg_mem_used_mb. Regression for the r"mem" over-match."""
+    header = [
+        "timestamp", "index", "power.draw [W]", "temperature.gpu",
+        "clocks.current.sm [MHz]", "clocks.current.memory [MHz]",
+        "utilization.gpu [%]", "utilization.memory [%]",
+    ]
+    cols = _detect_all_columns(header)
+    assert cols["mem"] is None, f"mem should be None, got {cols['mem']!r}"
+    # The real used-memory column, when present, is still picked.
+    cols2 = _detect_all_columns(header + ["memory.used [MiB]"])
+    assert cols2["mem"] == "memory.used [MiB]"
+
+
 def test_detect_all_columns_missing_optional_metrics():
     """Only power present — temp/util/mem all None."""
     header = ["timestamp", "index", "power.draw [W]"]

From 1135f67d319a38b4fe31607681b0d9fd8f1fbf1a Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Thu, 28 May 2026 15:02:26 -0700
Subject: [PATCH 11/14] feat(power): joules_per_output_token = per-stage decode
 for disagg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per reviewer: in disagg serving, attribute each token type to only its
stage's GPUs — input tokens to prefill GPUs, output tokens to decode GPUs
(symmetric). joules_per_output_token is now decode_energy / output_tokens
for disagg (was cluster-wide); joules_per_input_token already used prefill
energy / input_tokens. joules_per_total_token stays cluster-wide (overall
efficiency). Single-node / non-disagg / single-stage keep the cluster-wide
output ratio so the field is always populated.

Removes the now-redundant joules_per_output_token_decode key (folded into
joules_per_output_token). Docstring, CLI help, and tests updated; 108 pass.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 utils/aggregate_power.py      | 91 ++++++++++++++++-------------------
 utils/test_aggregate_power.py | 47 +++++++++---------
 utils/test_process_result.py  | 13 ++---
 3 files changed, 70 insertions(+), 81 deletions(-)

diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py
index 0e3c6bc60..5fb08ef44 100644
--- a/utils/aggregate_power.py
+++ b/utils/aggregate_power.py
@@ -9,12 +9,12 @@
 
 Cluster-wide fields (always written when any power data exists):
   - avg_power_w:               mean per-GPU power draw (W) during the load window
-  - joules_per_output_token:   total_system_energy / total_output_tokens
-                               (cluster-wide; always — same math single-node and
-                               multinode disagg, so the metric stays comparable
-                               across topologies in the dashboard)
+  - joules_per_output_token:   energy / total_output_tokens. CLUSTER-WIDE
+                               (total_system_energy) on single-node / non-disagg;
+                               OVERRIDDEN to per-stage decode_energy for disagg
+                               (see below).
   - joules_per_total_token:    total_system_energy / (input + output) tokens
-                               (cluster-wide; always)
+                               (cluster-wide; always — overall efficiency number)
   - avg_temp_c:                mean per-GPU temperature (Celsius), when the
                                CSV exposes a temperature column
   - peak_temp_c:               max instantaneous per-GPU temperature in window
@@ -22,22 +22,18 @@
   - avg_mem_used_mb:           mean per-GPU memory used (MiB/MB)
 
 For disaggregated multinode runs (DISAGG=true) where filenames carry the perfmon
-role/index encoding AND both prefill+decode workers are present, additional flat
-per-stage scalars are emitted alongside (NOT instead of) the cluster-wide keys:
-
-  - prefill_avg_power_w:           per-GPU mean power across prefill workers
-  - decode_avg_power_w:            per-GPU mean power across decode workers
-  - joules_per_input_token:        prefill_energy / total_input_tokens
-                                   (per-stage attribution — prefill processes
-                                   input tokens, so its energy / input gives the
-                                   prefill-side per-token cost)
-  - joules_per_output_token_decode: decode_energy / total_output_tokens
-                                   (per-stage attribution; the _decode suffix is
-                                   load-bearing — keeps the cluster-wide
-                                   joules_per_output_token comparable across
-                                   single-node and disagg deployments and exposes
-                                   decode-only energy as a separate key for users
-                                   who specifically want it.)
+role/index encoding AND both prefill+decode workers are present, the per-token
+energy metrics use PER-STAGE attribution — each token type is divided by only the
+GPUs of the stage that produces it (the standard disagg-serving convention):
+
+  - joules_per_input_token:    prefill_energy / total_input_tokens — input tokens
+                               are processed by the prefill GPUs only.
+  - joules_per_output_token:   decode_energy / total_output_tokens — output tokens
+                               are produced by the decode GPUs only. (For
+                               single-node / non-disagg this stays the cluster-wide
+                               total_system_energy / output_tokens.)
+  - prefill_avg_power_w:       per-GPU mean power across prefill workers
+  - decode_avg_power_w:        per-GPU mean power across decode workers
 
 Per-worker breakdown (multinode only — single-node has no role concept), emitted
 under the `workers` key to match InferenceX-app's BenchmarkRow.workers shape:
@@ -607,7 +603,6 @@ def patch_agg_result(
     joules_per_output_token: float,
     joules_per_total_token: float,
     joules_per_input_token: float | None = None,
-    joules_per_output_token_decode: float | None = None,
     prefill_avg_power_w: float | None = None,
     decode_avg_power_w: float | None = None,
     avg_temp_c: float | None = None,
@@ -629,8 +624,6 @@ def patch_agg_result(
     data["joules_per_total_token"] = round(joules_per_total_token, 6)
     if joules_per_input_token is not None:
         data["joules_per_input_token"] = round(joules_per_input_token, 6)
-    if joules_per_output_token_decode is not None:
-        data["joules_per_output_token_decode"] = round(joules_per_output_token_decode, 6)
     if prefill_avg_power_w is not None:
         data["prefill_avg_power_w"] = round(prefill_avg_power_w, 3)
     if decode_avg_power_w is not None:
@@ -735,40 +728,39 @@ def run(
     # parse, so aggregate_power_by_worker returns None and the field is omitted.
     workers = aggregate_power_by_worker(paths, start, end)
 
-    # Cluster-wide energy + per-token attribution. We ALWAYS report
-    # joules_per_output_token / joules_per_total_token as cluster-wide ratios
-    # (total_system_energy / token_count), regardless of disagg. This keeps the
-    # metric comparable across single-node, multinode-agg, and multinode-disagg
-    # topologies in the dashboard. Per-stage attribution lives in separate
-    # *_decode / joules_per_input_token keys (only emitted when disagg AND both
-    # stages present).
+    # Per-token energy attribution.
+    #   - joules_per_total_token stays CLUSTER-WIDE on every topology
+    #     (total_system_energy / all tokens) — the overall efficiency number.
+    #   - For disagg with BOTH stages present, joules_per_output_token and
+    #     joules_per_input_token use PER-STAGE energy: output tokens are produced
+    #     by the decode GPUs (decode_energy / output), input tokens by the
+    #     prefill GPUs (prefill_energy / input). This is the standard per-stage
+    #     attribution requested for disagg serving.
+    #   - Single-node / non-disagg / single-stage fall back to the cluster-wide
+    #     output ratio so the field is always populated.
     total_system_energy_j = avg_power_w * num_gpus * duration
     total_tokens = total_output + total_input
-    joules_per_output_token = total_system_energy_j / total_output
+    joules_per_output_token = total_system_energy_j / total_output  # cluster fallback
     joules_per_total_token = (
         total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token
     )
 
     joules_per_input_token: float | None = None
-    joules_per_output_token_decode: float | None = None
     prefill_avg_power_w: float | None = None
     decode_avg_power_w: float | None = None
 
     if disagg and workers is not None:
         stage = _disagg_stage_rollup(workers, duration)
         if stage is not None:
-            # Per-stage attribution: prefill workers process input tokens,
-            # decode workers process output tokens. Strictly more accurate
-            # than total-energy ratios when prefill/decode have different
-            # per-GPU power profiles (typical: prefill is compute-bound and
-            # draws more than memory-bound decode). Exposed as additional
-            # flat scalars so the cluster-wide joules_per_output_token stays
-            # comparable across topologies.
+            # Per-stage attribution: decode GPUs produce output tokens, prefill
+            # GPUs process input tokens. Strictly more accurate than total-energy
+            # ratios when prefill/decode have different per-GPU power profiles
+            # (typical: prefill is compute-bound and draws more than memory-bound
+            # decode). joules_per_output_token is OVERRIDDEN to the decode-only
+            # value here (symmetric with the prefill-only joules_per_input_token).
             prefill_avg_power_w = stage["prefill_avg_power_w"]
             decode_avg_power_w = stage["decode_avg_power_w"]
-            joules_per_output_token_decode = (
-                stage["decode_energy_j"] / total_output
-            )
+            joules_per_output_token = stage["decode_energy_j"] / total_output
             joules_per_input_token = (
                 stage["prefill_energy_j"] / total_input if total_input > 0 else None
             )
@@ -787,7 +779,6 @@ def run(
             joules_per_output_token,
             joules_per_total_token,
             joules_per_input_token=joules_per_input_token,
-            joules_per_output_token_decode=joules_per_output_token_decode,
             prefill_avg_power_w=prefill_avg_power_w,
             decode_avg_power_w=decode_avg_power_w,
             avg_temp_c=avg_temp_c,
@@ -851,12 +842,12 @@ def main() -> int:
     parser.add_argument(
         "--disagg",
         action="store_true",
-        help="Treat as disaggregated inference: emit prefill_avg_power_w, "
-        "decode_avg_power_w, joules_per_input_token, and "
-        "joules_per_output_token_decode using per-stage energy attribution "
-        "(prefill workers' energy / input tokens, decode workers' energy / "
-        "output tokens). Requires CSV filenames to carry the perfmon role/index "
-        "encoding.",
+        help="Treat as disaggregated inference: emit prefill_avg_power_w / "
+        "decode_avg_power_w, and use PER-STAGE energy attribution for "
+        "joules_per_input_token (prefill energy / input tokens) and "
+        "joules_per_output_token (decode energy / output tokens). "
+        "joules_per_total_token stays cluster-wide. Requires CSV filenames to "
+        "carry the perfmon role/index encoding.",
     )
     args = parser.parse_args()
 
diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py
index bccf1a98e..578981d29 100644
--- a/utils/test_aggregate_power.py
+++ b/utils/test_aggregate_power.py
@@ -876,16 +876,16 @@ def test_run_disagg_emits_workers_and_per_stage_joules(tmp_path: Path):
     # Cluster-wide avg = (8*600 + 8*400) / 16 = 500W.
     assert patched["avg_power_w"] == pytest.approx(500.0)
 
-    # Cluster-wide joules (total_system_energy / token_count) — same math as
-    # single-node so the metric stays comparable across topologies.
-    assert patched["joules_per_output_token"] == pytest.approx(80_000 / 1000)   # 80.0
+    # joules_per_total_token stays cluster-wide (all energy / all tokens).
     assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000)    # ≈ 8.889
 
-    # Per-stage scalars (new): prefill_avg, decode_avg, J/input, J/output_decode.
+    # Per-stage attribution: input divided by prefill energy, output by decode
+    # energy (the disagg convention).
     assert patched["prefill_avg_power_w"] == pytest.approx(600.0)
     assert patched["decode_avg_power_w"] == pytest.approx(400.0)
-    assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000)   # 6.0
-    assert patched["joules_per_output_token_decode"] == pytest.approx(32_000 / 1000)  # 32.0
+    assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000)   # 6.0 (prefill)
+    assert patched["joules_per_output_token"] == pytest.approx(32_000 / 1000)  # 32.0 (decode)
+    assert "joules_per_output_token_decode" not in patched  # folded into joules_per_output_token
 
     # workers[] (renamed from power_by_worker).
     workers = patched["workers"]
@@ -939,17 +939,17 @@ def test_run_disagg_excludes_frontend_from_per_stage_energy(tmp_path: Path):
     assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
     patched = json.loads(agg.read_text())
 
-    # Per-stage scalars (frontend excluded).
-    # J/input = 24_000 / 8000 = 3.0.
+    # Per-stage attribution excludes the frontend node.
+    # J/input  = prefill 24_000 / 8000 = 3.0.
+    # J/output = decode  16_000 / 1000 = 16.0 (frontend's 4_000 J NOT counted).
     assert patched["joules_per_input_token"] == pytest.approx(3.0)
-    # J/output_decode = 16_000 / 1000 = 16.0.
-    assert patched["joules_per_output_token_decode"] == pytest.approx(16.0)
+    assert patched["joules_per_output_token"] == pytest.approx(16.0)
     assert patched["prefill_avg_power_w"] == pytest.approx(600.0)
     assert patched["decode_avg_power_w"] == pytest.approx(400.0)
 
-    # Cluster-wide J/output still uses TOTAL energy (incl. frontend).
-    # total energy = (600+400+100) × 4 × 10 = 44_000 J → 44.0 J/output_tok.
-    assert patched["joules_per_output_token"] == pytest.approx(44.0)
+    # But the frontend's energy IS counted in the cluster-wide total efficiency:
+    # total energy = (600+400+100) × 4 × 10 = 44_000 J → / 9000 tokens ≈ 4.889.
+    assert patched["joules_per_total_token"] == pytest.approx(44_000 / 9000)
 
     # Frontend still appears in the worker list for observability.
     roles = [w["role"] for w in patched["workers"]]
@@ -1049,12 +1049,12 @@ def test_run_disagg_handles_zero_input_tokens(tmp_path: Path):
     assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
     patched = json.loads(agg.read_text())
     assert "joules_per_input_token" not in patched
-    # Per-stage decode still works — depends only on decode_energy / output.
-    assert patched["joules_per_output_token_decode"] == pytest.approx(16_000 / 1000)
+    # Per-stage output still works — depends only on decode_energy / output.
+    assert patched["joules_per_output_token"] == pytest.approx(16_000 / 1000)  # decode
     assert patched["prefill_avg_power_w"] == pytest.approx(600.0)
     assert patched["decode_avg_power_w"] == pytest.approx(400.0)
-    # Cluster-wide J/output uses TOTAL energy. (600+400) × 4 × 10 = 40_000 J.
-    assert patched["joules_per_output_token"] == pytest.approx(40_000 / 1000)
+    # Cluster-wide total uses TOTAL energy. (600+400) × 4 × 10 = 40_000 J / 1000.
+    assert patched["joules_per_total_token"] == pytest.approx(40_000 / 1000)
 
 
 def test_patch_agg_result_with_workers_and_per_stage(tmp_path: Path):
@@ -1071,7 +1071,6 @@ def test_patch_agg_result_with_workers_and_per_stage(tmp_path: Path):
         joules_per_output_token=40.0,
         joules_per_total_token=4.44,
         joules_per_input_token=3.0,
-        joules_per_output_token_decode=16.0,
         prefill_avg_power_w=600.0,
         decode_avg_power_w=400.0,
         workers=workers,
@@ -1080,7 +1079,6 @@ def test_patch_agg_result_with_workers_and_per_stage(tmp_path: Path):
     assert data["avg_power_w"] == 500.0
     assert data["joules_per_output_token"] == 40.0
     assert data["joules_per_input_token"] == 3.0
-    assert data["joules_per_output_token_decode"] == 16.0
     assert data["prefill_avg_power_w"] == 600.0
     assert data["decode_avg_power_w"] == 400.0
     assert data["workers"] == workers
@@ -1610,16 +1608,15 @@ def test_run_disagg_amd_emits_workers_and_per_stage_joules(tmp_path: Path):
     assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0
     patched = json.loads(agg.read_text())
 
-    # Cluster-wide (vendor-agnostic, same math as single-node / NVIDIA).
+    # Cluster-wide total (vendor-agnostic, same math as single-node / NVIDIA).
     assert patched["avg_power_w"] == pytest.approx(500.0)
-    assert patched["joules_per_output_token"] == pytest.approx(80_000 / 1000)  # 80.0
     assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000)   # ≈ 8.889
 
-    # Per-stage scalars from amd-smi CSVs.
+    # Per-stage attribution from amd-smi CSVs: input=prefill energy, output=decode energy.
     assert patched["prefill_avg_power_w"] == pytest.approx(600.0)
     assert patched["decode_avg_power_w"] == pytest.approx(400.0)
-    assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000)   # 6.0
-    assert patched["joules_per_output_token_decode"] == pytest.approx(32_000 / 1000)  # 32.0
+    assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000)   # 6.0 (prefill)
+    assert patched["joules_per_output_token"] == pytest.approx(32_000 / 1000)  # 32.0 (decode)
 
     # workers[] breakdown.
     workers = patched["workers"]
@@ -1661,6 +1658,6 @@ def test_run_disagg_amd_vllm_topology_one_worker_per_node(tmp_path: Path):
     # 2 prefill workers × 8 GPUs @ 600W → 96_000 J / 8000 input = 12.0.
     assert patched["joules_per_input_token"] == pytest.approx(96_000 / 8000)
     # 2 decode workers × 8 GPUs @ 400W → 64_000 J / 1000 output = 64.0.
-    assert patched["joules_per_output_token_decode"] == pytest.approx(64_000 / 1000)
+    assert patched["joules_per_output_token"] == pytest.approx(64_000 / 1000)
     assert patched["prefill_avg_power_w"] == pytest.approx(600.0)
     assert patched["decode_avg_power_w"] == pytest.approx(400.0)
diff --git a/utils/test_process_result.py b/utils/test_process_result.py
index ad931591b..78f293fd0 100644
--- a/utils/test_process_result.py
+++ b/utils/test_process_result.py
@@ -796,17 +796,18 @@ def test_disagg_multinode_emits_workers_and_per_stage_joules(self, tmp_path, mul
 
         patched = json.loads((tmp_path / "agg_benchmark_result.json").read_text())
 
-        # Per-stage attribution scalars: prefill_energy / input, decode_energy / output.
+        # Per-stage attribution: prefill energy / input tokens, decode energy / output tokens.
         # Prefill: 600 × 4 × 60 = 144_000 J  → / 240_000 = 0.6 J/input_tok.
-        # Decode:  400 × 4 × 60 =  96_000 J  → /  30_000 = 3.2 J/output_tok_decode.
+        # Decode:  400 × 4 × 60 =  96_000 J  → /  30_000 = 3.2 J/output_tok.
         assert patched["joules_per_input_token"] == pytest.approx(0.6, abs=0.01)
-        assert patched["joules_per_output_token_decode"] == pytest.approx(3.2, abs=0.01)
+        assert patched["joules_per_output_token"] == pytest.approx(3.2, abs=0.01)  # decode
+        assert "joules_per_output_token_decode" not in patched
         assert patched["prefill_avg_power_w"] == pytest.approx(600.0, abs=0.5)
         assert patched["decode_avg_power_w"] == pytest.approx(400.0, abs=0.5)
 
-        # Cluster-wide J/output (frontend would be incl. here too if present).
-        # Total energy = (600+400) × 4 × 60 = 240_000 J → / 30_000 = 8.0 J/output_tok.
-        assert patched["joules_per_output_token"] == pytest.approx(8.0, abs=0.05)
+        # Cluster-wide total efficiency still counts ALL energy.
+        # Total energy = (600+400) × 4 × 60 = 240_000 J → / 270_000 ≈ 0.889 J/total_tok.
+        assert patched["joules_per_total_token"] == pytest.approx(240_000 / 270_000, abs=0.01)
 
         # Per-worker breakdown labeled with role.
         workers = patched["workers"]

From 30f1c2131b5481928559c5755ffdba3c06c1b508 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Thu, 28 May 2026 15:13:53 -0700
Subject: [PATCH 12/14] ci(multinode): drop root-owned benchmark_logs before
 checkout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The AMD multinode container runs as root and writes benchmark_logs/. If a
job is cancelled (e.g. concurrency supersede), its cleanup trap never runs,
leaving root-owned dirs. actions/checkout (clean: true) then can't rmdir
them (EACCES) and fails BEFORE the job starts — poison-failing every job
scheduled onto that runner. Add `sudo rm -rf $GITHUB_WORKSPACE/benchmark_logs`
to the shared Slurm-cleanup anchor (runs pre-checkout AND post-run) so a
dirty runner self-heals.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/benchmark-multinode-tmpl.yml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index f901b1ff7..b2eec27e6 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -178,6 +178,13 @@ jobs:
               sleep 5
             done
           fi
+          # Drop root-owned leftovers from a prior (often cancelled) multinode
+          # run. The benchmark container runs as root and writes benchmark_logs/;
+          # if the job was cancelled its cleanup trap never ran, leaving
+          # root-owned dirs that actions/checkout (clean: true) can't rmdir
+          # (EACCES) — which then poison-fails EVERY subsequent job on that
+          # runner. Runs in both pre- and post-run cleanup (shared anchor).
+          sudo rm -rf "${GITHUB_WORKSPACE}/benchmark_logs" 2>/dev/null || true
 
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:

From 488ce46d1f1cd75553749541481bf1d2706eece5 Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Thu, 28 May 2026 15:38:51 -0700
Subject: [PATCH 13/14] fix(launcher): tolerate transient squeue timeouts in
 mi355x job poll
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A full sweep floods slurmctld, so `squeue` intermittently returns
"slurm_load_jobs error: Socket timed out". The old liveness check
(`! squeue ... | grep -q $JOB_ID`) treated that empty/failed output as
"job died" and exited with status 1 — a false failure on a healthy job
(observed on dsr1-fp8-mi355x-sglang-disagg conc 1024x2048).

Add job_alive(): a non-zero squeue exit is treated as "still alive" (don't
false-fail on a scheduler blip); only a SUCCESSFUL squeue that omits the
job — re-checked once to avoid a single-sample race — counts as gone. Used
by both the wait-for-log loop and the completion poll.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 runners/launch_mi355x-amds.sh | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index b893efa84..cc4544962 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -90,11 +90,27 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     # Give slurm time to start the job and create log file
     sleep 10
 
+    # job_alive: returns 0 while $JOB_ID is still in the SLURM queue, non-zero
+    # once it is gone. Resilient to transient slurmctld timeouts
+    # ("slurm_load_jobs error: Socket timed out"), common when a full sweep
+    # floods the controller: a FAILED squeue (non-zero exit) counts as "still
+    # alive"; only a SUCCESSFUL squeue that omits the job means it's gone, and
+    # it is re-checked once after 5s to avoid a single-sample race.
+    job_alive() {
+        local out rc
+        out=$(squeue -u "$USER" --noheader --format='%i' 2>/dev/null); rc=$?
+        [[ $rc -ne 0 ]] && return 0          # scheduler hiccup → assume alive
+        grep -qw "$JOB_ID" <<<"$out" && return 0   # listed on first sample → alive
+        sleep 5                              # pause before the single re-check
+        out=$(squeue -u "$USER" --noheader --format='%i' 2>/dev/null) || return 0   # re-check failed → assume alive
+        grep -qw "$JOB_ID" <<<"$out"         # function status: 0 = still listed, non-zero = gone
+    }
+
     # Wait for log file to appear (also check job is still alive)
     while ! ls "$LOG_FILE" &>/dev/null; do
-        if ! squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then
-            echo "ERROR: Job $JOB_ID failed before creating log file"
-            scontrol show job "$JOB_ID"
+        if ! job_alive; then
+            echo "ERROR: Job $JOB_ID is no longer in the queue and never created a log file"
+            scontrol show job "$JOB_ID" 2>/dev/null || true
             exit 1
         fi
         sleep 5
@@ -102,9 +118,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
 
     set +x
 
-    # Poll for job completion in background
+    # Poll for job completion in background (tolerant of transient squeue
+    # timeouts via job_alive — a scheduler blip must not look like completion).
     (
-        while squeue -u $USER --noheader --format='%i' | grep -q "$JOB_ID"; do
+        while job_alive; do
             sleep 10
         done
     ) &

From 6849229d67ac0a42cf30e68226c57ce0256ac2ce Mon Sep 17 00:00:00 2001
From: Aryan <aryan@gupta-inc.com>
Date: Thu, 28 May 2026 15:54:46 -0700
Subject: [PATCH 14/14] feat(power): capture AMD temp/util/mem (gfx_activity,
 used_vram, hotspot)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The amd-smi monitor only ran `metric -p -c -t -u`, so no VRAM column was
emitted and avg_mem_used_mb never populated on AMD. It also used util/mem/temp
column matchers tuned for NVIDIA/srt-slurm names, which miss amd-smi's
conventions — so avg_util_pct and avg_temp_c silently dropped too.

- benchmark_lib.sh: add `-m` (mem-usage) to the amd-smi command so a
  used_vram column is captured.
- aggregate_power.py column detection:
  - util: also match amd-smi `gfx_activity` (umc/mm_activity excluded).
  - mem: match positively on memory/vram + "used" instead of broad "mem"
    minus a growing exclude list — picks memory.used / mem_used_mb /
    used_vram while rejecting mem_temperature, mem_voltage, total/free_vram,
    the memory clock, and utilization.memory.
  - temp: prefer hotspot/junction over the first temp column, since edge
    temperature reads N/A on data-center AMD parts (MI300/MI355).

NVIDIA and srt-slurm detection is unchanged (verified by existing tests).
Adds AMD-header detection tests; full suite 111 passed.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 benchmarks/benchmark_lib.sh   | 10 ++++---
 utils/aggregate_power.py      | 56 ++++++++++++++++++++++-------------
 utils/test_aggregate_power.py | 40 +++++++++++++++++++++++++
 3 files changed, 82 insertions(+), 24 deletions(-)

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
index 747b445c0..2c0d881a1 100644
--- a/benchmarks/benchmark_lib.sh
+++ b/benchmarks/benchmark_lib.sh
@@ -41,9 +41,11 @@ start_gpu_monitor() {
         GPU_MONITOR_PID=$!
         echo "[GPU Monitor] Started NVIDIA (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)"
     elif command -v amd-smi &>/dev/null; then
-        # amd-smi metric flags: -p power, -c clocks, -t temperature, -u usage,
-        # -w <interval> native watch mode (emits a timestamp column per sample),
-        # --csv. The awk filter keeps the first CSV header line and drops
+        # amd-smi metric flags: -p power, -c clocks, -t temperature, -u usage
+        # (gfx_activity), -m mem-usage (used_vram), -w <interval> native watch
+        # mode (emits a timestamp column per sample), --csv. Without -m there is
+        # no VRAM column, so avg_mem_used_mb would never populate on AMD.
+        # The awk filter keeps the first CSV header line and drops
         # amd-smi's preamble / repeated headers. Header match is case-insensitive
         # (tolower) so a capitalized "Timestamp," header — should amd-smi ever
         # emit one — still passes through; aggregate_power's column detection is
@@ -51,7 +53,7 @@ start_gpu_monitor() {
         # clock, so multinode aggregation assumes cluster clocks are NTP-synced
         # (same assumption as nvidia-smi; aggregate_power windows by absolute
         # epoch from benchmark_serving.py).
-        amd-smi metric -p -c -t -u -w "$interval" --csv 2>/dev/null \
+        amd-smi metric -p -c -t -u -m -w "$interval" --csv 2>/dev/null \
             | awk 'tolower($0) ~ /^timestamp,/{if(!h){print;h=1};next} h{print}' > "$output" &
         GPU_MONITOR_PID=$!
         echo "[GPU Monitor] Started AMD (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)"
diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py
index 5fb08ef44..9efcf2fd0 100644
--- a/utils/aggregate_power.py
+++ b/utils/aggregate_power.py
@@ -57,14 +57,18 @@
   - Power: timestamp + column whose name contains "power" (excluding
     "limit"/"cap"/"max"/"min"). NVIDIA: "power.draw [W]". AMD: "socket_power".
     srt-slurm: "power_w".
-  - Temperature: column name contains "temp". NVIDIA: "temperature.gpu". AMD:
-    "temperature". srt-slurm: "temp_c". Unit: Celsius.
-  - Utilization: column name starts with "utilization" or contains "util".
+  - Temperature: column name contains "temp"; hotspot/junction columns are
+    preferred over the first match because data-center AMD parts report edge
+    temperature as N/A. NVIDIA: "temperature.gpu". AMD amd-smi: "edge_temperature"
+    / "hotspot_temperature" (hotspot picked). srt-slurm: "temp_c". Unit: Celsius.
+  - Utilization: column starts with "utilization" or contains "util", or is
+    amd-smi's "gfx_activity" (umc_activity / mm_activity are not matched).
     NVIDIA: "utilization.gpu". srt-slurm: "util_pct". Unit: percent.
-  - Memory: column name contains "mem" but not "total"/"clock"/"util" — so
-    "memory.total", "clocks.current.memory" (a frequency), and
-    "utilization.memory" (a percent) are all rejected; only memory *used* is
-    picked. NVIDIA: "memory.used [MiB]". srt-slurm: "mem_used_mb". Unit: MiB/MB.
+  - Memory used: column mentions memory/vram AND "used" — picks NVIDIA
+    "memory.used [MiB]", srt-slurm "mem_used_mb", amd-smi "used_vram"; rejects
+    decoys lacking "used" (memory.total / total_vram / free_vram, the memory
+    *clock* "clocks.current.memory", utilization.memory, mem_temperature,
+    mem_voltage). Unit: MiB/MB.
 
 Power is required for aggregation to fire; the other metrics degrade gracefully
 when their columns are absent (those fields are simply omitted from the output).
@@ -90,14 +94,24 @@
 _POWER_COL_RE = re.compile(r"power", re.IGNORECASE)
 _POWER_EXCLUDE_RE = re.compile(r"limit|cap|max|min", re.IGNORECASE)
 _TEMP_COL_RE = re.compile(r"temp", re.IGNORECASE)
-_UTIL_COL_RE = re.compile(r"^utilization|util", re.IGNORECASE)
-_MEM_COL_RE = re.compile(r"mem", re.IGNORECASE)
-# Exclude "total" (memory.total), "clock" (clocks.current.memory — a frequency,
-# not memory used), and "util" (utilization.memory — a percent). nvidia-smi's
-# query emits clocks.current.memory BEFORE any used-memory column, so without
-# these excludes _MEM_COL_RE would grab the memory *clock* (~2500 MHz) as
-# avg_mem_used_mb.
-_MEM_EXCLUDE_RE = re.compile(r"total|clock|util", re.IGNORECASE)
+# Data-center AMD parts (MI300/MI355) report edge temperature as N/A and expose
+# the real die temperature as hotspot/junction; prefer those when present so
+# avg_temp_c isn't computed over an all-N/A edge column. NVIDIA's single
+# "temperature.gpu" and srt-slurm's "temp_c" have neither token and fall through
+# to the first temperature column unchanged.
+_TEMP_PREFER_RE = re.compile(r"hotspot|junction", re.IGNORECASE)
+# Utilization: NVIDIA "utilization.gpu", srt-slurm "util_pct", AMD amd-smi
+# "gfx_activity" (the GPU/graphics-engine busy percent). amd-smi's other usage
+# columns — umc_activity (memory controller), mm_activity (multimedia) — are
+# intentionally NOT matched so gfx_activity is the one picked.
+_UTIL_COL_RE = re.compile(r"^utilization|util|gfx_activity", re.IGNORECASE)
+# Memory *used*: match positively on a column that mentions both memory/vram and
+# "used" rather than broad "mem" + a growing exclude list. This naturally picks
+# NVIDIA "memory.used [MiB]", srt-slurm "mem_used_mb", and amd-smi "used_vram"
+# while rejecting same-prefix decoys that lack "used": memory.total / total_vram /
+# free_vram, clocks.current.memory (a frequency), utilization.memory (a percent),
+# and amd-smi's mem_temperature / mem_voltage.
+_MEM_COL_RE = re.compile(r"(?:mem|vram).*used|used.*(?:mem|vram)", re.IGNORECASE)
 _TIMESTAMP_COL_RE = re.compile(r"time", re.IGNORECASE)
 _GPU_INDEX_COL_RE = re.compile(r"^(index|gpu|gpu_id|gpu_index|card|device)$", re.IGNORECASE)
 _NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?")
@@ -208,12 +222,14 @@ def _detect_all_columns(header: list[str]) -> dict[str, str | None]:
         (c for c in header if _POWER_COL_RE.search(c) and not _POWER_EXCLUDE_RE.search(c)),
         None,
     )
-    temp_col = next((c for c in header if _TEMP_COL_RE.search(c)), None)
-    util_col = next((c for c in header if _UTIL_COL_RE.search(c)), None)
-    mem_col = next(
-        (c for c in header if _MEM_COL_RE.search(c) and not _MEM_EXCLUDE_RE.search(c)),
-        None,
+    temp_cols = [c for c in header if _TEMP_COL_RE.search(c)]
+    # Prefer hotspot/junction (the real die temp on data-center AMD parts) over
+    # the first temperature column (edge on AMD, temperature.gpu on NVIDIA).
+    temp_col = next((c for c in temp_cols if _TEMP_PREFER_RE.search(c)), None) or (
+        temp_cols[0] if temp_cols else None
     )
+    util_col = next((c for c in header if _UTIL_COL_RE.search(c)), None)
+    mem_col = next((c for c in header if _MEM_COL_RE.search(c)), None)
     gpu_col = next((c for c in header if _GPU_INDEX_COL_RE.match(c.strip())), None)
     return {
         "timestamp": timestamp_col,
diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py
index 578981d29..47ea5a452 100644
--- a/utils/test_aggregate_power.py
+++ b/utils/test_aggregate_power.py
@@ -1196,6 +1196,46 @@ def test_detect_all_columns_amd_style():
     assert cols["mem"] is None
 
 
+def test_detect_all_columns_amd_smi_full():
+    """Real amd-smi `metric -p -c -t -u -m --csv` header on a data-center part.
+
+    Exercises: gfx_activity as util (not umc/mm_activity), used_vram as mem (not
+    total/free_vram, mem_clock, mem_voltage, or mem_temperature), and
+    hotspot_temperature preferred over the N/A edge_temperature.
+    """
+    header = [
+        "timestamp", "gpu",
+        "socket_power", "mem_voltage",            # -p (power)
+        "gfx_clock", "mem_clock",                 # -c (clocks)
+        "edge_temperature", "hotspot_temperature", "mem_temperature",  # -t (temperature)
+        "gfx_activity", "umc_activity", "mm_activity",                 # -u (usage)
+        "total_vram", "used_vram", "free_vram",   # -m (mem-usage)
+    ]
+    cols = _detect_all_columns(header)
+    assert cols["power"] == "socket_power"
+    assert cols["util"] == "gfx_activity"
+    assert cols["mem"] == "used_vram"
+    # Hotspot/junction preferred over edge (edge reads N/A on MI300/MI355).
+    assert cols["temp"] == "hotspot_temperature"
+
+
+def test_detect_all_columns_temp_prefers_junction():
+    """junction_temperature wins over a leading edge_temperature column ("junction" is a preferred token, like "hotspot")."""
+    header = ["timestamp", "gpu", "socket_power",
+              "edge_temperature", "junction_temperature"]
+    assert _detect_all_columns(header)["temp"] == "junction_temperature"
+
+
+def test_detect_all_columns_mem_vram_used_variants():
+    """vram_used (reverse word order of used_vram) resolves; total/free_vram alone never do."""
+    assert _detect_all_columns(
+        ["timestamp", "power_w", "total_vram", "vram_used", "free_vram"]
+    )["mem"] == "vram_used"
+    assert _detect_all_columns(
+        ["timestamp", "power_w", "total_vram", "free_vram"]
+    )["mem"] is None
+
+
 def test_detect_all_columns_excludes_memory_total():
     """memory.total must not be picked as the memory column (we want USED memory)."""
     header = ["timestamp", "index", "power.draw [W]", "memory.total [MiB]", "memory.used [MiB]"]