From 5e6533d563ebc68df4d11ef617235c90620460f9 Mon Sep 17 00:00:00 2001 From: Aryan Date: Wed, 27 May 2026 12:20:58 -0700 Subject: [PATCH 01/14] feat(power): extend measured-power aggregation to multinode srt-slurm runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Builds on PR #1558 (single-node measured-power) for multinode benchmarks via srt-slurm. Pipeline: srt-slurm perfmon (per-node nvidia-smi sampling — PR #35 on NVIDIA/srt-slurm, layered on SemiAnalysisAI/srt-slurm:feat/inferencex-perfmon) → perf_samples_<node>.csv in outputs/<job_id>/logs/ on shared NFS → launch_gb300-cw.sh exports GPU_METRICS_CSV_GLOB to $GITHUB_ENV → process_result.py expands the glob and hands the list to aggregate_power.run() → aggregate_power.py namespaces local GPU indices per source CSV stem so each node's local indices 0..N-1 don't collide across nodes; emits cluster-wide avg_power_w + joules_per_*_token → InferenceX-app ETL auto-captures the numeric fields (no schema change) Changes: - utils/aggregate_power.py: widen csv_path to Path | Iterable[Path] keeping the original param name. Per-source GPU-id namespacing only kicks in when there are 2+ sources so single-node num_gpus is unchanged. CLI adds --csv-glob (Python-side glob, mutually exclusive with --csv). - utils/process_result.py: bridge GPU_METRICS_CSV_GLOB env var. Glob takes precedence over single GPU_METRICS_CSV when both are set. - runners/launch_gb300-cw.sh: point dynamo-sglang at our srt-slurm fork, append `monitoring:` block to each recipe post-copy (idempotent), and write GPU_METRICS_CSV_GLOB to $GITHUB_ENV after the job for the downstream Process result step. - 8 new multinode tests in test_aggregate_power.py (per-source namespacing, sub-second clock drift, asymmetric prefill/decode power, missing-CSV silent skip, backward-compat single-path-in-list, Iterable acceptance, E2E run with list). 3 new in test_process_result.py (glob aggregation, precedence over single CSV, empty-match falls through). 
64/64 pass. Verified data-format end-to-end on gb300 hardware: nvidia-smi inside the sglang container emits the columns aggregate_power.py needs timestamp, gpu, power_w. --- runners/launch_gb300-cw.sh | 41 +++++- utils/aggregate_power.py | 225 ++++++++++++++++++++++---------- utils/process_result.py | 43 +++++-- utils/test_aggregate_power.py | 236 +++++++++++++++++++++++++++++++++- utils/test_process_result.py | 105 +++++++++++++++ 5 files changed, 565 insertions(+), 85 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 25e7f4db5..9f3222dad 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -12,8 +12,13 @@ if [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then export MODEL_PATH="/mnt/vast/models/dsv4" if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - SRT_SLURM_RECIPES_REPO="https://github.com/NVIDIA/srt-slurm.git" - SRT_SLURM_RECIPES_REF="main" + # Pinned to our SemiAnalysisAI fork of NVIDIA/srt-slurm to pick up + # PR #35 (per-node nvidia-smi monitoring during the benchmark sweep) + # ahead of its upstream merge. The branch tracks PR #35's head SHA: + # to bump, re-fetch refs/pull/35/head from NVIDIA/srt-slurm and force- + # push to SemiAnalysisAI/srt-slurm:feat/inferencex-perfmon. + SRT_SLURM_RECIPES_REPO="https://github.com/SemiAnalysisAI/srt-slurm.git" + SRT_SLURM_RECIPES_REF="feat/inferencex-perfmon" SRT_RECIPE_SRC="$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4" SRT_RECIPE_DST="recipes/sglang/deepseek-v4" elif [[ $FRAMEWORK == "dynamo-vllm" ]]; then @@ -106,6 +111,19 @@ git checkout "$SRT_SLURM_RECIPES_REF" mkdir -p "$SRT_RECIPE_DST" cp -rT "$SRT_RECIPE_SRC" "$SRT_RECIPE_DST" +# Enable per-node GPU perfmon (PR #35) on every overlaid recipe. 
`monitoring` +# is a top-level SrtConfig field and defaults to None, so without this the +# orchestrator's _start_perf_monitor short-circuits and no perf_samples_*.csv +# are ever written — multinode measured-power aggregation would silently +# skip. Idempotent: skips recipes that already declare `monitoring:`. +for recipe in "$SRT_RECIPE_DST"/*.yaml; do + [ -f "$recipe" ] || continue + if ! grep -q '^monitoring:' "$recipe"; then + printf '\nmonitoring:\n enabled: true\n sample_interval: 1.0\n' >> "$recipe" + echo "[perfmon] enabled monitoring in recipe: $recipe" + fi +done + echo "Installing srtctl..." # CRITICAL — uv install location. # Runner pod is x86 but compute nodes are aarch64, and /mnt/home is @@ -279,6 +297,25 @@ else echo "Warning: Logs directory not found at $LOGS_DIR" fi +# Hand the per-node perfmon CSVs off to the downstream "Process result" step +# in benchmark-multinode-tmpl.yml. srt-slurm's perfmon (PR #35) writes +# perf_samples_{node}.csv straight into $LOGS_DIR on the host. process_result.py +# already invokes aggregate_power.run() inline; teaching it to read +# GPU_METRICS_CSV_GLOB lets utils/aggregate_power.py do the multi-CSV +# aggregation (each agg JSON gets avg_power_w / joules_per_*_token patched in +# place). Use an absolute glob because process_result.py runs from +# $GITHUB_WORKSPACE, not from this srt-slurm checkout. +if [ -d "$LOGS_DIR" ]; then + perf_glob_dir="$(pwd)/$LOGS_DIR" + perf_csv_count=$(ls "$perf_glob_dir"/perf_samples_*.csv 2>/dev/null | wc -l | tr -d ' ') + if [ "$perf_csv_count" -gt 0 ]; then + echo "[perfmon] Found $perf_csv_count per-node perf_samples_*.csv under $perf_glob_dir/" + echo "GPU_METRICS_CSV_GLOB=$perf_glob_dir/perf_samples_*.csv" >> "$GITHUB_ENV" + else + echo "[perfmon] WARNING: monitoring enabled but no perf_samples_*.csv found in $perf_glob_dir — measured power aggregation will be skipped" + fi +fi + if [[ "${EVAL_ONLY:-false}" != "true" ]]; then if [ ! 
-d "$LOGS_DIR" ]; then exit 1 diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py index 3c204085a..ab6fcef3e 100644 --- a/utils/aggregate_power.py +++ b/utils/aggregate_power.py @@ -1,12 +1,19 @@ """Aggregate measured GPU power from a vendor SMI CSV into the agg result JSON. -Reads a GPU-metrics CSV produced by `start_gpu_monitor` (nvidia-smi or amd-smi), -filters samples to the benchmark load window using start/end Unix timestamps -written by benchmark_serving.py, and patches two keys into the aggregated -result JSON consumed by InferenceX-app's ETL: +Reads a GPU-metrics CSV produced by `start_gpu_monitor` (nvidia-smi or amd-smi) +or by srt-slurm's per-node perfmon (multinode), filters samples to the benchmark +load window using start/end Unix timestamps written by benchmark_serving.py, and +patches three keys into the aggregated result JSON consumed by InferenceX-app's +ETL: - avg_power_w: mean per-GPU power draw (W) during the load window - joules_per_output_token: (avg_power_w * num_gpus * duration_s) / total_output_tokens + - joules_per_total_token: same, divided by (input + output) tokens + +Multinode: accepts multiple CSV paths (one per worker node). GPU indices are +namespaced by source CSV stem to avoid the same-index collision across nodes — +e.g. 8 nodes each reporting indices 0..3 would otherwise be miscounted as 4 +total GPUs instead of 32. The ETL (`packages/db/src/etl/benchmark-mapper.ts`) auto-captures any numeric field in the agg JSON into the `metrics` JSONB column, so no schema migration @@ -14,8 +21,8 @@ Vendor schema detection is regex-based: any timestamp-like column + any column whose name contains "power" (excluding "limit"/"cap"/"max") is picked up. -NVIDIA emits "power.draw [W]"; AMD's amd-smi varies by version. Both are -handled. +NVIDIA emits "power.draw [W]"; AMD's amd-smi varies by version; srt-slurm's +perfmon emits "power_w". All are handled. This script is best-effort. 
Missing or malformed CSV exits 0 without patching so a monitoring hiccup never breaks the benchmark upload. @@ -25,9 +32,11 @@ import argparse import csv +import glob as glob_module import json import re import sys +from collections.abc import Iterable from datetime import datetime, timezone from pathlib import Path from statistics import mean @@ -109,74 +118,84 @@ def _detect_columns(header: list[str]) -> tuple[str | None, str | None, str | No def aggregate_power( - csv_path: Path, + csv_path: Path | Iterable[Path], start_unix: float, end_unix: float, ) -> tuple[float, int] | None: """Return (per_gpu_avg_power_w, num_gpus) for samples in [start, end]. - Returns None if the CSV is missing, empty, has no detectable power column, - or no rows fall in the window. + Accepts either a single Path (single-node case) or an iterable of Paths + (multinode case: one CSV per worker node, all written by srt-slurm's + perfmon). For multi-path inputs, GPU indices are namespaced by source + CSV stem so the distinct-id count reflects the true total — each node + independently reports indices 0..N, and without namespacing the union + would collapse to a single node's worth. + + Returns None if no CSVs are usable, none have a detectable power column, + or no rows fall in the window across all paths. 
""" - if not csv_path.is_file() or csv_path.stat().st_size == 0: - return None - if end_unix <= start_unix: + paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path) + if not paths or end_unix <= start_unix: return None - try: - with csv_path.open("r", newline="", encoding="utf-8", errors="replace") as f: - reader = csv.DictReader(f, skipinitialspace=True) - header = [c.strip() for c in (reader.fieldnames or [])] - reader.fieldnames = header - timestamp_col, power_col, gpu_col = _detect_columns(header) - if not timestamp_col or not power_col: - return None - - # Group power readings by sample timestamp so per-sample total power - # (sum across GPUs) is computed correctly even if rows are interleaved. - # - # per_sample_row_count is the structural divisor: it's incremented for - # every contributing row regardless of whether a GPU-index column was - # detected. per_sample_gpus / gpu_keys are only populated when gpu_col - # is present and provide the canonical num_gpus via distinct-id count. - # When gpu_col is absent (vendor schema variant whose header doesn't - # match _GPU_INDEX_COL_RE), we fall back to inferring num_gpus from - # the modal row count per timestamp — assuming one row per GPU per - # sample, which is what every SMI tool we've seen actually emits. - per_sample_total: dict[float, float] = {} - per_sample_row_count: dict[float, int] = {} - per_sample_gpus: dict[float, set[str]] = {} - gpu_keys: set[str] = set() - - for row in reader: - ts_raw = (row.get(timestamp_col) or "").strip() - pw_raw = (row.get(power_col) or "").strip() - ts = _parse_timestamp(ts_raw) - pw = _parse_power(pw_raw) - if ts is None or pw is None: - continue - if ts < start_unix or ts > end_unix: + # Only namespace when there are multiple sources — keeps single-node + # gpu_keys identical to the pre-multinode behavior so existing callers + # see the same num_gpus values. + namespace = len(paths) > 1 + + # Per-sample state accumulates across ALL paths. 
Bucketed by ms-rounded + # timestamp: rows from different nodes share a bucket only when their + # timestamps agree to the millisecond. Polls that drift further apart land + # in separate buckets, each averaged over the GPUs it actually contains. + per_sample_total: dict[float, float] = {} + per_sample_row_count: dict[float, int] = {} + per_sample_gpus: dict[float, set[str]] = {} + gpu_keys: set[str] = set() + saw_gpu_col = False + + for path in paths: + if not path.is_file() or path.stat().st_size == 0: + continue + try: + with path.open("r", newline="", encoding="utf-8", errors="replace") as f: + reader = csv.DictReader(f, skipinitialspace=True) + header = [c.strip() for c in (reader.fieldnames or [])] + reader.fieldnames = header + timestamp_col, power_col, gpu_col = _detect_columns(header) + if not timestamp_col or not power_col: continue - # Bucket by sample timestamp (rounded to ms to absorb sub-ms drift). - bucket = round(ts, 3) - per_sample_total[bucket] = per_sample_total.get(bucket, 0.0) + pw - per_sample_row_count[bucket] = per_sample_row_count.get(bucket, 0) + 1 if gpu_col: - gpu_id = (row.get(gpu_col) or "").strip() - if gpu_id: - per_sample_gpus.setdefault(bucket, set()).add(gpu_id) - gpu_keys.add(gpu_id) - except (OSError, csv.Error): - return None + saw_gpu_col = True + + for row in reader: + ts_raw = (row.get(timestamp_col) or "").strip() + pw_raw = (row.get(power_col) or "").strip() + ts = _parse_timestamp(ts_raw) + pw = _parse_power(pw_raw) + if ts is None or pw is None: + continue + if ts < start_unix or ts > end_unix: + continue + bucket = round(ts, 3) + per_sample_total[bucket] = per_sample_total.get(bucket, 0.0) + pw + per_sample_row_count[bucket] = per_sample_row_count.get(bucket, 0) + 1 + if gpu_col: + gpu_id = (row.get(gpu_col) or "").strip() + if gpu_id: + ns_id = f"{path.stem}:{gpu_id}" if namespace else gpu_id + per_sample_gpus.setdefault(bucket, set()).add(ns_id) + gpu_keys.add(ns_id) + except (OSError, csv.Error): + continue if not per_sample_total: return None # 
Per-sample divisor and overall num_gpus. - # - If a GPU column was detected, trust distinct GPU IDs (correct for any - # sampling pattern, including hot-swap or partial visibility). - # - Otherwise, infer from row count (one row per GPU per sample). - if gpu_col and gpu_keys: + # - If any path exposed a GPU column, trust distinct (namespaced) GPU IDs. + # - Otherwise, infer from row count (one row per GPU per sample, summed + # across all paths' rows that fell into the same timestamp bucket). + if saw_gpu_col and gpu_keys: num_gpus = len(gpu_keys) per_sample_mean_per_gpu = [ total / max(len(per_sample_gpus.get(ts, ())), 1) @@ -194,7 +213,16 @@ def _load_bench_window( bench_result_path: Path, ) -> tuple[float, float, float, int, int] | None: """Read (start_unix, end_unix, duration_s, total_output_tokens, total_input_tokens) - from the raw bench JSON. Returns None if any required field is missing. + from the raw bench JSON. Returns None if a window cannot be resolved. + + Window resolution order, tried in turn: + 1. benchmark_start_time_unix + benchmark_end_time_unix (our benchmark_serving.py + writes both — single-node, brackets the actual load window exactly). + 2. date + duration (srt-slurm sa-bench writes "YYYYMMDD-HHMMSS" UTC as the + result write time — multinode; treat as bench end and subtract duration + for start. Overshoots by post-bench JSON serialization, typically <5s). + 3. file mtime + duration (last resort if `date` is absent or unparseable — + same end-of-bench proxy as #2 via the result file's mtime). 
total_input_tokens defaults to 0 if absent (older bench JSONs may not have it); this only degrades joules_per_total_token to equal joules_per_output_token in @@ -204,18 +232,52 @@ def _load_bench_window( bench = json.loads(bench_result_path.read_text(encoding="utf-8")) except (OSError, json.JSONDecodeError): return None - start = bench.get("benchmark_start_time_unix") - end = bench.get("benchmark_end_time_unix") duration = bench.get("duration") total_output = bench.get("total_output_tokens") total_input = bench.get("total_input_tokens", 0) - if not all(isinstance(v, (int, float)) for v in (start, end, duration)): + if not isinstance(duration, (int, float)): return None if not isinstance(total_output, int) or total_output <= 0: return None if not isinstance(total_input, int) or total_input < 0: total_input = 0 - return float(start), float(end), float(duration), int(total_output), int(total_input) + + # Tier 1: explicit Unix timestamps (single-node bench_serving.py). + start = bench.get("benchmark_start_time_unix") + end = bench.get("benchmark_end_time_unix") + if isinstance(start, (int, float)) and isinstance(end, (int, float)): + return float(start), float(end), float(duration), int(total_output), int(total_input) + + # Tier 2: parse `date` field (srt-slurm sa-bench multinode). On observed + # runs the string matches file mtime to the second, confirming it's the + # JSON write time. + date_str = bench.get("date") + if isinstance(date_str, str): + try: + end_dt = datetime.strptime(date_str, "%Y%m%d-%H%M%S").replace(tzinfo=timezone.utc) + end_unix = end_dt.timestamp() + return ( + float(end_unix - duration), + float(end_unix), + float(duration), + int(total_output), + int(total_input), + ) + except ValueError: + pass + + # Tier 3: file mtime as last-resort bench-end proxy. 
+ try: + end_unix = bench_result_path.stat().st_mtime + except OSError: + return None + return ( + float(end_unix - duration), + float(end_unix), + float(duration), + int(total_output), + int(total_input), + ) def patch_agg_result( @@ -234,7 +296,7 @@ def patch_agg_result( tmp_path.replace(agg_path) -def run(csv_path: Path, bench_result: Path, agg_result: Path) -> int: +def run(csv_path: Path | Iterable[Path], bench_result: Path, agg_result: Path) -> int: window = _load_bench_window(bench_result) if window is None: print( @@ -244,10 +306,12 @@ def run(csv_path: Path, bench_result: Path, agg_result: Path) -> int: return 0 start, end, duration, total_output, total_input = window - result = aggregate_power(csv_path, start, end) + paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path) + result = aggregate_power(paths, start, end) if result is None: + label = str(paths[0]) if len(paths) == 1 else f"{len(paths)} CSVs" print( - f"[aggregate_power] No usable power samples in {csv_path} for " + f"[aggregate_power] No usable power samples in {label} for " f"window [{start}, {end}] — skipping", file=sys.stderr, ) @@ -291,11 +355,20 @@ def run(csv_path: Path, bench_result: Path, agg_result: Path) -> int: def main() -> int: parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) - parser.add_argument( + source = parser.add_mutually_exclusive_group() + source.add_argument( "--csv", type=Path, - default=Path("/workspace/gpu_metrics.csv"), - help="Path to gpu_metrics.csv from start_gpu_monitor (default: /workspace/gpu_metrics.csv)", + default=None, + help="Single gpu_metrics.csv from start_gpu_monitor (single-node). " + "Falls back to /workspace/gpu_metrics.csv when neither --csv nor --csv-glob is set.", + ) + source.add_argument( + "--csv-glob", + type=str, + default=None, + help="Shell glob expanding to per-node perf_samples_*.csv files (multinode, " + "written by srt-slurm's perfmon). 
GPU indices are namespaced by source CSV stem.", ) parser.add_argument( "--bench-result", @@ -310,7 +383,17 @@ def main() -> int: help="Path to the agg_.json output of process_result.py (will be patched in place)", ) args = parser.parse_args() - return run(args.csv, args.bench_result, args.agg_result) + + if args.csv_glob: + paths = sorted(Path(p) for p in glob_module.glob(args.csv_glob)) + if not paths: + print( + f"[aggregate_power] No CSVs matched glob {args.csv_glob!r} — skipping", + file=sys.stderr, + ) + return 0 + return run(paths, args.bench_result, args.agg_result) + return run(args.csv or Path("/workspace/gpu_metrics.csv"), args.bench_result, args.agg_result) if __name__ == "__main__": diff --git a/utils/process_result.py b/utils/process_result.py index 5fb059473..0510fe023 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -139,20 +139,41 @@ def get_required_env_vars(required_vars): # Best-effort: patch measured power into the agg JSON. Never fails the run. try: + import glob as _glob_module from aggregate_power import run as _aggregate_power_run - _csv_candidates = [ - os.environ.get('GPU_METRICS_CSV'), - 'gpu_metrics.csv', - '/workspace/gpu_metrics.csv', - ] - _csv_path = next( - (Path(p) for p in _csv_candidates if p and Path(p).is_file()), - None, - ) - if _csv_path is not None: + # Multinode path: srt-slurm launchers set GPU_METRICS_CSV_GLOB after the job + # to a shell glob expanding to one perf_samples_.csv per worker node. + # Takes precedence over the single-CSV fallback — if the launcher set the + # glob, the run was multinode and there is no single-CSV fallback to make. 
+ _csv_arg = None + _glob_pattern = os.environ.get('GPU_METRICS_CSV_GLOB') + if _glob_pattern: + _matched = sorted(Path(p) for p in _glob_module.glob(_glob_pattern)) + if _matched: + _csv_arg = _matched + else: + print( + f'[process_result] GPU_METRICS_CSV_GLOB={_glob_pattern!r} matched no files', + file=sys.stderr, + ) + + if _csv_arg is None: + # Single-node path: gpu_metrics.csv written by start_gpu_monitor in the + # bench container. + _csv_candidates = [ + os.environ.get('GPU_METRICS_CSV'), + 'gpu_metrics.csv', + '/workspace/gpu_metrics.csv', + ] + _csv_arg = next( + (Path(p) for p in _csv_candidates if p and Path(p).is_file()), + None, + ) + + if _csv_arg is not None: _aggregate_power_run( - csv_path=_csv_path, + csv_path=_csv_arg, bench_result=Path(f'{result_filename}.json'), agg_result=agg_path, ) diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py index bf81ee7b1..b6f040ce8 100644 --- a/utils/test_aggregate_power.py +++ b/utils/test_aggregate_power.py @@ -15,7 +15,7 @@ import json import sys -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path import pytest @@ -445,3 +445,237 @@ def test_patch_agg_result_is_atomic_via_tempfile(tmp_path: Path): assert data["joules_per_total_token"] == 0.5 # No .tmp leftover. assert not (tmp_path / "agg.json.tmp").exists() + + +# --------------------------------------------------------------------------- # +# Multi-node CSV aggregation +# --------------------------------------------------------------------------- # + + +def test_aggregate_power_multi_node_namespaces_local_gpu_indices(tmp_path: Path): + """Two per-node CSVs each report local GPU indices 0..3. 
+ + Without per-source namespacing the union of gpu_keys would collapse to 4 + instead of 8 — the bug this whole multinode change exists to prevent.""" + base = 1_700_000_000.0 + node1 = tmp_path / "perf_samples_node1.csv" + node2 = tmp_path / "perf_samples_node2.csv" + _write_nvidia_csv(node1, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(4)]) + _write_nvidia_csv(node2, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(4)]) + + result = aggregate_power([node1, node2], base, base + 10) + assert result is not None + avg_power, num_gpus = result + assert avg_power == pytest.approx(500.0) + assert num_gpus == 8 + + +def test_aggregate_power_multi_node_with_sub_second_clock_drift(tmp_path: Path): + """Per-node polls drift sub-second even on NTP-synced clusters. + + Node1 polls at base+s, node2 at base+s+0.3 — rows land in different ms + buckets. Each bucket is then a single-node 4-GPU slice averaging to 500W, + and the mean across all buckets is the cluster per-GPU mean.""" + base = 1_700_000_000.0 + node1 = tmp_path / "perf_samples_node1.csv" + node2 = tmp_path / "perf_samples_node2.csv" + _write_nvidia_csv(node1, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(4)]) + _write_nvidia_csv(node2, [(base + s + 0.3, gpu, 500.0) for s in range(3) for gpu in range(4)]) + + result = aggregate_power([node1, node2], base, base + 10) + assert result is not None + avg_power, num_gpus = result + assert avg_power == pytest.approx(500.0) + assert num_gpus == 8 + + +def test_aggregate_power_multi_node_asymmetric_prefill_decode_power(tmp_path: Path): + """Disagg topologies draw different per-GPU power on prefill vs decode nodes. 
+ + 4 prefill GPUs at 600W + 4 decode GPUs at 400W: cluster mean is the + weighted average across all 8 GPUs = (4*600 + 4*400)/8 = 500W.""" + base = 1_700_000_000.0 + prefill = tmp_path / "perf_samples_prefill0.csv" + decode = tmp_path / "perf_samples_decode0.csv" + _write_nvidia_csv(prefill, [(base + s, gpu, 600.0) for s in range(3) for gpu in range(4)]) + _write_nvidia_csv(decode, [(base + s, gpu, 400.0) for s in range(3) for gpu in range(4)]) + + result = aggregate_power([prefill, decode], base, base + 10) + assert result is not None + avg_power, num_gpus = result + assert avg_power == pytest.approx(500.0) + assert num_gpus == 8 + + +def test_aggregate_power_multi_node_skips_missing_csv_silently(tmp_path: Path): + """If a node failed to start perfmon, its CSV will be absent. + + Aggregating over the remaining nodes is preferable to returning None — + losing one node's power data should not zero out the whole metric.""" + base = 1_700_000_000.0 + present = tmp_path / "perf_samples_node1.csv" + missing = tmp_path / "perf_samples_node2.csv" # never written + _write_nvidia_csv(present, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(4)]) + + result = aggregate_power([present, missing], base, base + 10) + assert result is not None + avg_power, num_gpus = result + assert avg_power == pytest.approx(500.0) + assert num_gpus == 4 # only the node that emitted data + + +def test_aggregate_power_single_path_in_list_matches_bare_path(tmp_path: Path): + """Backward compat: aggregate_power([csv], ...) == aggregate_power(csv, ...). 
+ + Single-source behavior must not change when the caller wraps the path in a + list — otherwise process_result.py-style callers that defensively normalize + to a list would see different num_gpus values than legacy bare-path calls.""" + base = 1_700_000_000.0 + csv = tmp_path / "gpu_metrics.csv" + _write_nvidia_csv(csv, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(8)]) + + bare = aggregate_power(csv, base, base + 10) + listed = aggregate_power([csv], base, base + 10) + assert bare == listed + assert bare == (pytest.approx(500.0), 8) + + +def test_aggregate_power_accepts_iterable_not_just_list(tmp_path: Path): + """Signature is Iterable[Path] — generators (e.g. Path.glob()) must work.""" + base = 1_700_000_000.0 + node1 = tmp_path / "perf_samples_node1.csv" + node2 = tmp_path / "perf_samples_node2.csv" + _write_nvidia_csv(node1, [(base + s, gpu, 500.0) for s in range(2) for gpu in range(4)]) + _write_nvidia_csv(node2, [(base + s, gpu, 500.0) for s in range(2) for gpu in range(4)]) + + result = aggregate_power(tmp_path.glob("perf_samples_*.csv"), base, base + 10) + assert result is not None + _, num_gpus = result + assert num_gpus == 8 + + +def test_run_multi_node_e2e_computes_joules_from_total_gpus(tmp_path: Path): + """End-to-end multinode: run() with a list of CSVs patches the agg JSON. 
+ + 8 GPUs total at 500W for 10s → 40_000 J → 2.0 J/output_token for 20_000 tokens.""" + base = 1_700_000_000.0 + node1 = tmp_path / "perf_samples_node1.csv" + node2 = tmp_path / "perf_samples_node2.csv" + _write_nvidia_csv(node1, [(base + 1 + s, gpu, 500.0) for s in range(2) for gpu in range(4)]) + _write_nvidia_csv(node2, [(base + 1 + s, gpu, 500.0) for s in range(2) for gpu in range(4)]) + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result(bench, start=base, end=base + 10, duration=10.0, total_output=20_000) + agg.write_text(json.dumps({"hw": "gb300", "conc": 8192}), encoding="utf-8") + + exit_code = run([node1, node2], bench, agg) + assert exit_code == 0 + + patched = json.loads(agg.read_text()) + assert patched["avg_power_w"] == pytest.approx(500.0) + assert patched["joules_per_output_token"] == pytest.approx(2.0) + + +def test_run_multi_node_skips_when_all_csvs_missing(tmp_path: Path): + """Entire monitoring failure (all per-node CSVs absent) skips cleanly without patching.""" + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result(bench, start=0.0, end=10.0, duration=10.0, total_output=1000) + agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") + + exit_code = run([tmp_path / "absent1.csv", tmp_path / "absent2.csv"], bench, agg) + assert exit_code == 0 + + patched = json.loads(agg.read_text()) + assert "avg_power_w" not in patched + + +# --------------------------------------------------------------------------- # +# _load_bench_window fallbacks for srt-slurm multinode result JSONs +# +# srt-slurm's sa-bench result writer emits `date` + `duration` but NOT the +# benchmark_*_time_unix fields our single-node benchmark_serving.py adds. +# Without a fallback, multinode runs would always hit "No bench window in +# {bench_result}" and silently skip power aggregation end-to-end. 
+# --------------------------------------------------------------------------- # + + +def test_run_uses_date_field_when_unix_timestamps_absent(tmp_path: Path): + """Tier 2: parse `date` ("YYYYMMDD-HHMMSS" UTC) + `duration` for the window.""" + # End of bench at a known UTC instant; CSV samples land in [end-10, end]. + end_unix = datetime(2026, 5, 20, 3, 10, 29, tzinfo=timezone.utc).timestamp() + csv = tmp_path / "perf_samples_node0.csv" + _write_nvidia_csv(csv, [(end_unix - 1 - s, gpu, 500.0) for s in range(3) for gpu in range(4)]) + + bench = tmp_path / "bench.json" + bench.write_text( + json.dumps( + { + "date": "20260520-031029", + "duration": 10.0, + "total_output_tokens": 1000, + "total_input_tokens": 8000, + } + ), + encoding="utf-8", + ) + agg = tmp_path / "agg.json" + agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") + + assert run([csv], bench, agg) == 0 + patched = json.loads(agg.read_text()) + assert patched["avg_power_w"] == pytest.approx(500.0) + # 4 GPUs × 500W × 10s = 20_000 J / 1000 output tokens = 20.0 J/output_token. + assert patched["joules_per_output_token"] == pytest.approx(20.0) + # 20_000 J / (1000 + 8000) total tokens ≈ 2.222 J/total_token. + assert patched["joules_per_total_token"] == pytest.approx(20_000 / 9_000) + + +def test_run_uses_mtime_when_date_unparseable(tmp_path: Path): + """Tier 3a: malformed `date` falls through to file mtime as bench-end proxy.""" + csv = tmp_path / "perf_samples_node0.csv" + bench = tmp_path / "bench.json" + bench.write_text( + json.dumps({"date": "not-a-date", "duration": 10.0, "total_output_tokens": 1000}), + encoding="utf-8", + ) + # CSV samples bracket bench file's mtime so they fall inside the derived window. 
+ end_unix = bench.stat().st_mtime + _write_nvidia_csv(csv, [(end_unix - 1 - s, gpu, 500.0) for s in range(3) for gpu in range(4)]) + + agg = tmp_path / "agg.json" + agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") + assert run([csv], bench, agg) == 0 + patched = json.loads(agg.read_text()) + assert patched["avg_power_w"] == pytest.approx(500.0) + + +def test_run_uses_mtime_when_no_date_field(tmp_path: Path): + """Tier 3b: bench JSON has only `duration` → file mtime is end-of-bench.""" + csv = tmp_path / "perf_samples_node0.csv" + bench = tmp_path / "bench.json" + bench.write_text( + json.dumps({"duration": 10.0, "total_output_tokens": 1000}), + encoding="utf-8", + ) + end_unix = bench.stat().st_mtime + _write_nvidia_csv(csv, [(end_unix - 1 - s, gpu, 500.0) for s in range(3) for gpu in range(4)]) + + agg = tmp_path / "agg.json" + agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") + assert run([csv], bench, agg) == 0 + patched = json.loads(agg.read_text()) + assert patched["avg_power_w"] == pytest.approx(500.0) + + +def test_run_skips_when_duration_missing(tmp_path: Path): + """No tier can resolve a window without `duration` — skip cleanly.""" + csv = tmp_path / "perf_samples_node0.csv" + _write_nvidia_csv(csv, [(1_700_000_000.0, 0, 400.0)]) + bench = tmp_path / "bench.json" + bench.write_text(json.dumps({"total_output_tokens": 1000}), encoding="utf-8") + agg = tmp_path / "agg.json" + agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") + + assert run([csv], bench, agg) == 0 + assert "avg_power_w" not in json.loads(agg.read_text()) diff --git a/utils/test_process_result.py b/utils/test_process_result.py index 4037689ea..61d3b45fc 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -649,3 +649,108 @@ def test_missing_bench_timestamps_does_not_patch(self, tmp_path, single_node_env patched = json.loads(agg_path.read_text()) assert "avg_power_w" not in patched assert "joules_per_output_token" not in patched + 
+ def test_multinode_csv_glob_aggregates_across_per_node_csvs(self, tmp_path, single_node_env_vars): + """Multinode wiring: srt-slurm launchers set GPU_METRICS_CSV_GLOB to a + shell glob expanding to one perf_samples_.csv per worker node. + process_result.py must expand it and hand the list to the aggregator, + which namespaces local GPU indices per source so they don't collide. + + Without this bridge the launcher would set the env var, process_result.py + would ignore it (fall back to a non-existent /workspace/gpu_metrics.csv), + and the chart would silently show no power data — the failure mode that + motivated catching this in the contract check.""" + start, end = 1_700_000_100.0, 1_700_000_160.0 # 60s bench window + # Two per-node CSVs at the same local indices 0-3. Without per-source + # namespacing the union would collapse to 4 GPUs instead of 8. + self._write_nvidia_csv( + tmp_path / "perf_samples_node1.csv", start, end, watts_per_gpu=600.0, num_gpus=4 + ) + self._write_nvidia_csv( + tmp_path / "perf_samples_node2.csv", start, end, watts_per_gpu=600.0, num_gpus=4 + ) + + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 64, + "total_token_throughput": 1000.0, + "output_throughput": 500.0, + "benchmark_start_time_unix": start, + "benchmark_end_time_unix": end, + "duration": 60.0, + "total_output_tokens": 30_000, + } + env = { + **single_node_env_vars, + "GPU_METRICS_CSV_GLOB": str(tmp_path / "perf_samples_*.csv"), + } + + result = run_script(tmp_path, env, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + agg_path = tmp_path / "agg_benchmark_result.json" + patched = json.loads(agg_path.read_text()) + # 2 nodes × 4 GPUs = 8 total. Per-GPU mean stays at 600W. + assert patched["avg_power_w"] == pytest.approx(600.0, abs=0.5) + # 600W × 8 GPUs × 60s / 30_000 tokens = 9.6 J/tok. + # If namespacing failed we'd see ~4.8 (only 4 GPUs counted). 
+ assert patched["joules_per_output_token"] == pytest.approx(9.6, abs=0.05) + + def test_multinode_csv_glob_takes_precedence_over_single_csv(self, tmp_path, single_node_env_vars): + """If both GLOB and single CSV are set, the glob wins. + + Reflects the ownership split: the multinode launcher sets the glob + after the job, while the single CSV env var is only meaningful for + single-node runs. If a stale single-CSV value leaks through (e.g. a + runner with persistent env), the glob should still take precedence.""" + start, end = 1_700_000_100.0, 1_700_000_160.0 + glob_csv = tmp_path / "perf_samples_node1.csv" + stale_csv = tmp_path / "stale_single.csv" + self._write_nvidia_csv(glob_csv, start, end, watts_per_gpu=600.0, num_gpus=4) + self._write_nvidia_csv(stale_csv, start, end, watts_per_gpu=100.0, num_gpus=1) + + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 64, + "total_token_throughput": 1000.0, + "output_throughput": 500.0, + "benchmark_start_time_unix": start, + "benchmark_end_time_unix": end, + "duration": 60.0, + "total_output_tokens": 30_000, + } + env = { + **single_node_env_vars, + "GPU_METRICS_CSV_GLOB": str(tmp_path / "perf_samples_*.csv"), + "GPU_METRICS_CSV": str(stale_csv), + } + + result = run_script(tmp_path, env, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + agg_path = tmp_path / "agg_benchmark_result.json" + patched = json.loads(agg_path.read_text()) + # Glob respected → 600W (4 GPUs). Stale fallback would give 100W (1 GPU). + assert patched["avg_power_w"] == pytest.approx(600.0, abs=0.5) + + def test_multinode_csv_glob_empty_match_falls_through_silently(self, tmp_path, single_node_env_vars): + """If GPU_METRICS_CSV_GLOB is set but matches no files (perfmon failed + to start on any node), process_result.py still succeeds and writes the + agg JSON without power fields. 
The run must not block on telemetry.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 64, + "total_token_throughput": 1000.0, + "output_throughput": 500.0, + } + env = { + **single_node_env_vars, + "GPU_METRICS_CSV_GLOB": str(tmp_path / "perf_samples_*.csv"), + } + + result = run_script(tmp_path, env, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + agg_path = tmp_path / "agg_benchmark_result.json" + patched = json.loads(agg_path.read_text()) + assert "avg_power_w" not in patched From a9339df821d8865f57b737be84f34f7ab768faea Mon Sep 17 00:00:00 2001 From: Aryan Date: Wed, 27 May 2026 12:27:53 -0700 Subject: [PATCH 02/14] chore(perf-changelog): trigger multinode sweep for measured-power aggregation Appends entry for dsv4-fp4-gb300-dynamo-sglang so run-sweep.yml fires when the sweep-enabled label is added to PR #1574. The sweep produces the first multinode agg JSONs with avg_power_w + joules_per_*_token, validating the per-source GPU-id namespacing and GPU_METRICS_CSV_GLOB env-var bridge end-to-end on real GB300 hardware (gb300-cw cluster). --- perf-changelog.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index b7182a39c..01c7d2e81 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3192,3 +3192,9 @@ - "Add GLM-5-FP8 models.yaml flags, setup_deps.sh (aiter gluon + transformers glm_moe_dsa), GLM-5 env tuning in env.sh" - "Add multinode launch script glm5_fp8_mi355x_sglang-disagg.sh; server.sh sources setup_deps.sh" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1572 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang + description: + - "Smoke run validating multinode measured-power aggregation (PR #1574). No config change; entry exists to trigger a sweep that produces the first multinode agg JSON with avg_power_w + joules_per_*_token populated from per-node srt-slurm perfmon CSVs. 
Validates per-source GPU-id namespacing in aggregate_power.py (without it, 14 nodes × 4 GPUs would report num_gpus=4 instead of 56) and the GPU_METRICS_CSV_GLOB env var bridge in process_result.py. Only the gb300-cw runner has the perfmon launcher changes; any gb300-nv runs in the sweep will succeed normally without power fields, which the dashboard handles gracefully (chart gates on field presence)." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574 From eb2fa8e6c0f527f985013f001bab01005d646c22 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 10:39:52 -0700 Subject: [PATCH 03/14] fix(launcher): recurse subdirectories when injecting monitoring: into recipes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous glob `$SRT_RECIPE_DST/*.yaml` only matched top-level YAMLs, but recipes live under workload subdirectories (e.g. 8k1k/*.yaml). The loop iterated zero times, no recipe got the monitoring: block, perfmon never spawned, no perf_samples_*.csv were written, aggregate_power silently skipped patching the agg JSON, and the dashboard had no power data. Sweep #26548110246 burned hours of GB300 time and shipped "success" with zero power keys in every agg artifact — exactly the silent-failure chain we should have caught earlier. Fix: recurse via `find -type f -name '*.yaml'`. Add a loud WARNING when zero recipes get the injection so future regressions surface immediately instead of waiting for missing dashboard data to be noticed. 
--- runners/launch_gb300-cw.sh | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 9f3222dad..951c350de 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -116,13 +116,24 @@ cp -rT "$SRT_RECIPE_SRC" "$SRT_RECIPE_DST" # orchestrator's _start_perf_monitor short-circuits and no perf_samples_*.csv # are ever written — multinode measured-power aggregation would silently # skip. Idempotent: skips recipes that already declare `monitoring:`. -for recipe in "$SRT_RECIPE_DST"/*.yaml; do - [ -f "$recipe" ] || continue +# +# CRITICAL: use `find` recursively, not a flat `*.yaml` glob. Recipes live +# in $SRT_RECIPE_DST/<workload>/*.yaml (e.g. .../8k1k/*.yaml) — a flat glob +# matches zero files, the loop runs zero times, no recipe gets monitoring, +# and perfmon never spawns. PR #1574's first real sweep (#26548110246) hit +# exactly this: completed "success" with no power data because the glob +# matched nothing and the failure was silent end-to-end. +INJECTED_COUNT=0 +while IFS= read -r recipe; do if ! grep -q '^monitoring:' "$recipe"; then printf '\nmonitoring:\n enabled: true\n sample_interval: 1.0\n' >> "$recipe" echo "[perfmon] enabled monitoring in recipe: $recipe" + INJECTED_COUNT=$((INJECTED_COUNT + 1)) fi -done +done < <(find "$SRT_RECIPE_DST" -type f -name '*.yaml') +if [ "$INJECTED_COUNT" -eq 0 ]; then + echo "[perfmon] WARNING: zero recipes received monitoring injection under $SRT_RECIPE_DST. Either every recipe already had it, or the directory layout changed — power data will be MISSING from this run." >&2 +fi echo "Installing srtctl..." # CRITICAL — uv install location. 
From ddd71f3dddff9e51053972d7ed694fc27aecce46 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 10:40:48 -0700 Subject: [PATCH 04/14] ci: re-trigger sweep after srt-slurm fork rebase Previous Run Sweep failed because SemiAnalysisAI/srt-slurm@feat/inferencex-perfmon was based on PR #35's head from 2026-04-13, predating the default_bash_preamble schema field that the launcher writes into srtslurm.yaml. srtctl rejected the config with 'Unknown field' and the job never submitted. Fork branch has now been rebased onto current NVIDIA/srt-slurm main (which has default_bash_preamble), with PR #35 perfmon commits + Aryan's role-label follow-up squashed/cherry-picked on top. Empty commit here re-fires the Run Sweep workflow so we can validate end-to-end on real DSv4 FP4 GB300. From 317049d9c0242219fed56ac651b413a274128000 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 10:44:34 -0700 Subject: [PATCH 05/14] chore(perf-changelog): re-trigger sweep after launcher recurse-glob fix Prior sweep (#26548110246) on SHA 8d303414 completed green but produced zero power data because of a flat-glob bug in the monitoring-injection loop. Fix is on HEAD (6da2f1b6) but the workflow's path filter only fires on perf-changelog.yaml edits, so this commit re-touches that file to re-dispatch. --- perf-changelog.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 01c7d2e81..9fdae2fd6 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3197,4 +3197,5 @@ - dsv4-fp4-gb300-dynamo-sglang description: - "Smoke run validating multinode measured-power aggregation (PR #1574). No config change; entry exists to trigger a sweep that produces the first multinode agg JSON with avg_power_w + joules_per_*_token populated from per-node srt-slurm perfmon CSVs. 
Validates per-source GPU-id namespacing in aggregate_power.py (without it, 14 nodes × 4 GPUs would report num_gpus=4 instead of 56) and the GPU_METRICS_CSV_GLOB env var bridge in process_result.py. Only the gb300-cw runner has the perfmon launcher changes; any gb300-nv runs in the sweep will succeed normally without power fields, which the dashboard handles gracefully (chart gates on field presence)." + - "Re-run after launcher recurse-glob fix (6da2f1b6) — prior sweep (#26548110246) completed green at the workflow level but produced 0 measured-power rows because the flat *.yaml glob in the monitoring-injection loop matched zero recipes (recipes live in 8k1k/ subdir). Fix uses `find -type f -name '*.yaml'`. Also re-pointed SemiAnalysisAI/srt-slurm@feat/inferencex-perfmon onto current NVIDIA/srt-slurm main so the launcher's `default_bash_preamble:` srtslurm.yaml field is accepted by srtctl schema." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574 From 06558b9c82e3dfa78c230c79e61bddda3b8b1d18 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 11:07:15 -0700 Subject: [PATCH 06/14] feat(power): per-worker power + per-stage J/token for disagg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends multinode measured-power aggregation with per-worker breakdown and per-stage joules attribution. The cluster-wide avg_power_w + joules_per_*_token fields stay backward-compatible; new disagg-only fields layer on top. New agg JSON fields: - power_by_worker: list of {role, worker_idx, hosts, num_gpus, avg_power_w} parsed from srt-slurm perfmon CSV filenames (`perf_samples_<role>_w<idx>_<host>.csv`). Roles: prefill, decode, agg, frontend. Workers spanning N nodes collapse into one entry whose num_gpus is the cross-node sum. - joules_per_input_token: prefill_energy / total_input_tokens (disagg only — meaningless without a prefill stage). 
Per-stage attribution (disagg only) replaces cluster-wide ratios for existing fields: - joules_per_output_token = decode_energy / output_tokens - joules_per_total_token = (prefill + decode) / all_tokens Frontend-labeled CSVs are excluded from per-stage energy but still listed for observability. Falls back to cluster-wide math if only one stage's CSVs survived. process_result.py now passes DISAGG through to aggregate_power.run(). launch_gb300-cw.sh's recipe-injection loop reports found/injected counts so a zero-recipes-found bug is distinguishable from the benign all-already-monitored case. Tests: 88/88 pass (68 existing + 20 new). New coverage: label parsing across host formats, multi-node-per-worker collapse, per-stage J/token math, frontend exclusion, single-stage fallback, zero-input degenerate, end-to-end disagg wiring through process_result. Co-Authored-By: Claude Opus 4.7 --- runners/launch_gb300-cw.sh | 13 +- utils/aggregate_power.py | 406 +++++++++++++++++++++++++++------- utils/process_result.py | 19 +- utils/test_aggregate_power.py | 366 ++++++++++++++++++++++++++++++ utils/test_process_result.py | 91 ++++++++ 5 files changed, 803 insertions(+), 92 deletions(-) diff --git a/runners/launch_gb300-cw.sh b/runners/launch_gb300-cw.sh index 951c350de..eddb17b29 100644 --- a/runners/launch_gb300-cw.sh +++ b/runners/launch_gb300-cw.sh @@ -123,16 +123,25 @@ cp -rT "$SRT_RECIPE_SRC" "$SRT_RECIPE_DST" # and perfmon never spawns. PR #1574's first real sweep (#26548110246) hit # exactly this: completed "success" with no power data because the glob # matched nothing and the failure was silent end-to-end. +FOUND_COUNT=0 INJECTED_COUNT=0 while IFS= read -r recipe; do + FOUND_COUNT=$((FOUND_COUNT + 1)) if ! 
grep -q '^monitoring:' "$recipe"; then printf '\nmonitoring:\n enabled: true\n sample_interval: 1.0\n' >> "$recipe" echo "[perfmon] enabled monitoring in recipe: $recipe" INJECTED_COUNT=$((INJECTED_COUNT + 1)) fi done < <(find "$SRT_RECIPE_DST" -type f -name '*.yaml') -if [ "$INJECTED_COUNT" -eq 0 ]; then - echo "[perfmon] WARNING: zero recipes received monitoring injection under $SRT_RECIPE_DST. Either every recipe already had it, or the directory layout changed — power data will be MISSING from this run." >&2 +# Distinguish "found 0 recipes" (real bug — directory wrong/empty) from "all +# already had monitoring:" (benign — happens on reruns or if a recipe author +# pre-declared the block). Only the former is a missing-power-data risk. +if [ "$FOUND_COUNT" -eq 0 ]; then + echo "[perfmon] WARNING: zero recipe YAMLs found under $SRT_RECIPE_DST. The directory layout may have changed — power data will be MISSING from this run." >&2 +elif [ "$INJECTED_COUNT" -eq 0 ]; then + echo "[perfmon] all $FOUND_COUNT recipes already declared monitoring: — no injection needed." +else + echo "[perfmon] injected monitoring: into $INJECTED_COUNT of $FOUND_COUNT recipes." fi echo "Installing srtctl..." diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py index ab6fcef3e..962c9167d 100644 --- a/utils/aggregate_power.py +++ b/utils/aggregate_power.py @@ -3,22 +3,38 @@ Reads a GPU-metrics CSV produced by `start_gpu_monitor` (nvidia-smi or amd-smi) or by srt-slurm's per-node perfmon (multinode), filters samples to the benchmark load window using start/end Unix timestamps written by benchmark_serving.py, and -patches three keys into the aggregated result JSON consumed by InferenceX-app's -ETL: +patches the aggregated result JSON with cluster-wide and per-worker power data +consumed by InferenceX-app's ETL. 
+Cluster-wide fields (always written when any power data exists): - avg_power_w: mean per-GPU power draw (W) during the load window - - joules_per_output_token: (avg_power_w * num_gpus * duration_s) / total_output_tokens - - joules_per_total_token: same, divided by (input + output) tokens + - joules_per_output_token: energy / total_output_tokens + - joules_per_total_token: energy / (input + output) tokens + +For disaggregated multinode runs (DISAGG=true), the numerator for the J/token +metrics shifts to a per-stage attribution: prefill workers' energy is divided +by input tokens, decode workers' energy by output tokens. Per-stage power is +where the meaningful efficiency signal lives — total-energy ratios mostly just +re-scale the same number by different denominators. + + - joules_per_input_token: prefill_energy / total_input_tokens (disagg only) + - joules_per_output_token: decode_energy / total_output_tokens (overridden) + - joules_per_total_token: (prefill_energy + decode_energy) / total_tokens (overridden) + +Per-worker breakdown (multinode only — single-node has no role concept): + - power_by_worker: list of {role, worker_idx, hosts[], num_gpus, avg_power_w} + where role is "prefill", "decode", "agg", or "frontend". + +srt-slurm encodes the worker role and index in the perfmon CSV filename: +`perf_samples_<role>_w<idx>_<host>.csv` — see srt-slurm fork's +benchmark_stage._start_perf_monitor. Filenames that don't match this pattern +(e.g. single-node `gpu_metrics.csv`) fall back to a single cluster-wide bucket. Multinode: accepts multiple CSV paths (one per worker node). GPU indices are namespaced by source CSV stem to avoid the same-index collision across nodes — e.g. 8 nodes each reporting indices 0..3 would otherwise be miscounted as 4 total GPUs instead of 32. -The ETL (`packages/db/src/etl/benchmark-mapper.ts`) auto-captures any numeric -field in the agg JSON into the `metrics` JSONB column, so no schema migration -is required. 
- Vendor schema detection is regex-based: any timestamp-like column + any column whose name contains "power" (excluding "limit"/"cap"/"max") is picked up. NVIDIA emits "power.draw [W]"; AMD's amd-smi varies by version; srt-slurm's @@ -48,6 +64,14 @@ _GPU_INDEX_COL_RE = re.compile(r"^(index|gpu|gpu_id|gpu_index|card|device)$", re.IGNORECASE) _NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?") +# srt-slurm perfmon filename: perf_samples_<role>_w<idx>_<host>.csv +# Roles: prefill, decode, agg, frontend (see srt-slurm benchmark_stage._label). +# Host may contain hyphens and digits; greedy `.+` is fine because the `_w<idx>_` +# anchor is unambiguous. +_PERFMON_LABEL_RE = re.compile( + r"^perf_samples_(?P<role>prefill|decode|agg|frontend)_w(?P<idx>\d+)_(?P<host>.+)$" +) + def _parse_timestamp(value: str) -> float | None: """Best-effort timestamp parse to Unix epoch seconds (local wall clock). @@ -117,85 +141,90 @@ def _detect_columns(header: list[str]) -> tuple[str | None, str | None, str | No return timestamp_col, power_col, gpu_col -def aggregate_power( - csv_path: Path | Iterable[Path], - start_unix: float, - end_unix: float, -) -> tuple[float, int] | None: - """Return (per_gpu_avg_power_w, num_gpus) for samples in [start, end]. +def _parse_perfmon_label(path: Path) -> tuple[str, int, str] | None: + """Extract (role, worker_idx, host) from a srt-slurm perfmon CSV filename. - Accepts either a single Path (single-node case) or an iterable of Paths - (multinode case: one CSV per worker node, all written by srt-slurm's - perfmon). For multi-path inputs, GPU indices are namespaced by source - CSV stem so the distinct-id count reflects the true total — each node - independently reports indices 0..N, and without namespacing the union - would collapse to a single node's worth. + Returns None for filenames not matching the perfmon pattern (e.g. + single-node `gpu_metrics.csv`). Used to group node-level CSVs by the + worker(s) running on each node. 
+ """ + m = _PERFMON_LABEL_RE.match(path.stem) + if not m: + return None + return m.group("role"), int(m.group("idx")), m.group("host") - Returns None if no CSVs are usable, none have a detectable power column, - or no rows fall in the window across all paths. + +def _read_samples( + path: Path, start_unix: float, end_unix: float +) -> tuple[list[tuple[float, float, str | None]], bool] | None: + """Read one CSV → list of (timestamp_bucket, power_w, gpu_id) in window. + + Returns (rows, saw_gpu_col) on success, None if the file is unreadable / + missing the required columns. Empty rows list is valid (file readable but + no samples landed in the window). """ - paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path) - if not paths or end_unix <= start_unix: + if not path.is_file() or path.stat().st_size == 0: return None + try: + with path.open("r", newline="", encoding="utf-8", errors="replace") as f: + reader = csv.DictReader(f, skipinitialspace=True) + header = [c.strip() for c in (reader.fieldnames or [])] + reader.fieldnames = header + timestamp_col, power_col, gpu_col = _detect_columns(header) + if not timestamp_col or not power_col: + return None + rows: list[tuple[float, float, str | None]] = [] + for row in reader: + ts = _parse_timestamp((row.get(timestamp_col) or "").strip()) + pw = _parse_power((row.get(power_col) or "").strip()) + if ts is None or pw is None: + continue + if ts < start_unix or ts > end_unix: + continue + gpu_id = (row.get(gpu_col) or "").strip() if gpu_col else None + rows.append((round(ts, 3), pw, gpu_id or None)) + return rows, gpu_col is not None + except (OSError, csv.Error): + return None + - # Only namespace when there are multiple sources — keeps single-node - # gpu_keys identical to the pre-multinode behavior so existing callers - # see the same num_gpus values. 
- namespace = len(paths) > 1 +def _aggregate_rows( + sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]], + *, + namespace: bool, +) -> tuple[float, int] | None: + """Merge rows across CSVs into (per_gpu_avg_power_w, num_gpus). - # Per-sample state accumulates across ALL paths. Bucketed by ms-rounded - # timestamp so nodes whose clocks drift sub-ms still end up in the same - # bucket (they reliably do — all sample on `time.sleep(interval)` against - # the same NTP-synced cluster clock). + `sources` is a list of (path, rows, saw_gpu_col) for the CSVs to roll up + together. Rows are bucketed by ms-rounded timestamp so nodes with sub-ms + clock drift land in the same bucket. GPU indices are namespaced by the + source path's stem when `namespace=True` (multi-source case) to keep + same-local-index across nodes from collapsing. + """ per_sample_total: dict[float, float] = {} per_sample_row_count: dict[float, int] = {} per_sample_gpus: dict[float, set[str]] = {} gpu_keys: set[str] = set() - saw_gpu_col = False - - for path in paths: - if not path.is_file() or path.stat().st_size == 0: - continue - try: - with path.open("r", newline="", encoding="utf-8", errors="replace") as f: - reader = csv.DictReader(f, skipinitialspace=True) - header = [c.strip() for c in (reader.fieldnames or [])] - reader.fieldnames = header - timestamp_col, power_col, gpu_col = _detect_columns(header) - if not timestamp_col or not power_col: - continue - if gpu_col: - saw_gpu_col = True - - for row in reader: - ts_raw = (row.get(timestamp_col) or "").strip() - pw_raw = (row.get(power_col) or "").strip() - ts = _parse_timestamp(ts_raw) - pw = _parse_power(pw_raw) - if ts is None or pw is None: - continue - if ts < start_unix or ts > end_unix: - continue - bucket = round(ts, 3) - per_sample_total[bucket] = per_sample_total.get(bucket, 0.0) + pw - per_sample_row_count[bucket] = per_sample_row_count.get(bucket, 0) + 1 - if gpu_col: - gpu_id = (row.get(gpu_col) or "").strip() - if 
gpu_id: - ns_id = f"{path.stem}:{gpu_id}" if namespace else gpu_id - per_sample_gpus.setdefault(bucket, set()).add(ns_id) - gpu_keys.add(ns_id) - except (OSError, csv.Error): - continue + saw_gpu_col_any = False + + for path, rows, saw_gpu_col in sources: + if saw_gpu_col: + saw_gpu_col_any = True + for bucket, pw, gpu_id in rows: + per_sample_total[bucket] = per_sample_total.get(bucket, 0.0) + pw + per_sample_row_count[bucket] = per_sample_row_count.get(bucket, 0) + 1 + if gpu_id is not None: + ns_id = f"{path.stem}:{gpu_id}" if namespace else gpu_id + per_sample_gpus.setdefault(bucket, set()).add(ns_id) + gpu_keys.add(ns_id) if not per_sample_total: return None - # Per-sample divisor and overall num_gpus. # - If any path exposed a GPU column, trust distinct (namespaced) GPU IDs. # - Otherwise, infer from row count (one row per GPU per sample, summed # across all paths' rows that fell into the same timestamp bucket). - if saw_gpu_col and gpu_keys: + if saw_gpu_col_any and gpu_keys: num_gpus = len(gpu_keys) per_sample_mean_per_gpu = [ total / max(len(per_sample_gpus.get(ts, ())), 1) @@ -209,6 +238,109 @@ def aggregate_power( return mean(per_sample_mean_per_gpu), num_gpus +def aggregate_power( + csv_path: Path | Iterable[Path], + start_unix: float, + end_unix: float, +) -> tuple[float, int] | None: + """Return (per_gpu_avg_power_w, num_gpus) for samples in [start, end]. + + Accepts either a single Path (single-node case) or an iterable of Paths + (multinode case: one CSV per worker node, all written by srt-slurm's + perfmon). For multi-path inputs, GPU indices are namespaced by source + CSV stem so the distinct-id count reflects the true total — each node + independently reports indices 0..N, and without namespacing the union + would collapse to a single node's worth. + + Returns None if no CSVs are usable, none have a detectable power column, + or no rows fall in the window across all paths. 
+ """ + paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path) + if not paths or end_unix <= start_unix: + return None + + sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]] = [] + for path in paths: + read = _read_samples(path, start_unix, end_unix) + if read is None: + continue + rows, saw_gpu_col = read + sources.append((path, rows, saw_gpu_col)) + if not sources: + return None + + return _aggregate_rows(sources, namespace=len(paths) > 1) + + +def aggregate_power_by_worker( + csv_paths: Iterable[Path], + start_unix: float, + end_unix: float, +) -> list[dict] | None: + """Group CSVs by (role, worker_idx) and return per-worker power rollups. + + Each entry: {role, worker_idx, hosts: sorted list, num_gpus, avg_power_w}. + Returns None if no CSVs have parseable filenames OR no labeled CSV yields + usable samples. Unlabeled CSVs in the input are silently skipped — they + can't be attributed to a worker. + + Hosts are listed because a single worker can span multiple nodes (e.g. + a 16-GPU decode worker over 4 nodes, all labeled decode_w0_). + Multiple node-CSVs sharing the same (role, worker_idx) collapse into one + worker entry whose num_gpus is the sum across nodes. + """ + paths = list(csv_paths) + if not paths or end_unix <= start_unix: + return None + + # Group paths by (role, worker_idx); discard unlabeled. 
+ by_worker: dict[tuple[str, int], list[Path]] = {} + hosts_by_worker: dict[tuple[str, int], set[str]] = {} + for p in paths: + label = _parse_perfmon_label(p) + if label is None: + continue + role, worker_idx, host = label + key = (role, worker_idx) + by_worker.setdefault(key, []).append(p) + hosts_by_worker.setdefault(key, set()).add(host) + if not by_worker: + return None + + out: list[dict] = [] + for (role, worker_idx), worker_paths in by_worker.items(): + sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]] = [] + for path in worker_paths: + read = _read_samples(path, start_unix, end_unix) + if read is None: + continue + rows, saw_gpu_col = read + sources.append((path, rows, saw_gpu_col)) + if not sources: + continue + # Namespace across paths within a worker too — a 16-GPU decode worker + # spans 4 nodes, each reporting local indices 0..3. + result = _aggregate_rows(sources, namespace=len(sources) > 1) + if result is None: + continue + avg_power_w, num_gpus = result + out.append( + { + "role": role, + "worker_idx": worker_idx, + "hosts": sorted(hosts_by_worker[(role, worker_idx)]), + "num_gpus": num_gpus, + "avg_power_w": round(avg_power_w, 3), + } + ) + if not out: + return None + # Stable order: role (prefill < decode < agg < frontend), then worker_idx. + role_order = {"prefill": 0, "decode": 1, "agg": 2, "frontend": 3} + out.sort(key=lambda w: (role_order.get(w["role"], 99), w["worker_idx"])) + return out + + def _load_bench_window( bench_result_path: Path, ) -> tuple[float, float, float, int, int] | None: @@ -285,18 +417,63 @@ def patch_agg_result( avg_power_w: float, joules_per_output_token: float, joules_per_total_token: float, + joules_per_input_token: float | None = None, + power_by_worker: list[dict] | None = None, ) -> None: - """Read the agg JSON, add the three power keys, and write it back atomically.""" + """Read the agg JSON, add the power keys, and write it back atomically. 
+ + `joules_per_input_token` and `power_by_worker` are optional — omitted from + the JSON when None (kept that way so single-node and non-disagg multinode + agg JSONs don't gain meaningless null fields). + """ data = json.loads(agg_path.read_text(encoding="utf-8")) data["avg_power_w"] = round(avg_power_w, 3) data["joules_per_output_token"] = round(joules_per_output_token, 6) data["joules_per_total_token"] = round(joules_per_total_token, 6) + if joules_per_input_token is not None: + data["joules_per_input_token"] = round(joules_per_input_token, 6) + if power_by_worker is not None: + data["power_by_worker"] = power_by_worker tmp_path = agg_path.with_suffix(agg_path.suffix + ".tmp") tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8") tmp_path.replace(agg_path) -def run(csv_path: Path | Iterable[Path], bench_result: Path, agg_result: Path) -> int: +def _disagg_stage_energies( + power_by_worker: list[dict], duration: float +) -> tuple[float, float] | None: + """Sum per-worker energy for prefill vs decode workers (J). + + Returns (prefill_energy_j, decode_energy_j) or None if either stage is + absent — without both stages we can't do per-stage attribution and the + caller should fall back to total-energy math. + """ + prefill_e = 0.0 + decode_e = 0.0 + has_prefill = False + has_decode = False + for w in power_by_worker: + e = w["avg_power_w"] * w["num_gpus"] * duration + if w["role"] == "prefill": + prefill_e += e + has_prefill = True + elif w["role"] == "decode": + decode_e += e + has_decode = True + # "frontend" / "agg" / unknown roles deliberately excluded — they + # don't belong to either stage's per-token cost. 
+ if not (has_prefill and has_decode): + return None + return prefill_e, decode_e + + +def run( + csv_path: Path | Iterable[Path], + bench_result: Path, + agg_result: Path, + *, + disagg: bool = False, +) -> int: window = _load_bench_window(bench_result) if window is None: print( @@ -318,15 +495,51 @@ def run(csv_path: Path | Iterable[Path], bench_result: Path, agg_result: Path) - return 0 avg_power_w, num_gpus = result - # Joules consumed by the system during the bench window, divided by either - # output tokens (for generation-cost metrics) or all tokens (for whole- - # workload efficiency). + # Per-worker rollup is best-effort: only emitted when CSV filenames carry + # the perfmon role/index encoding. Single-node `gpu_metrics.csv` won't + # parse, so aggregate_power_by_worker returns None and the field is omitted. + power_by_worker = aggregate_power_by_worker(paths, start, end) + + # Cluster-wide energy baseline. Used as the fallback numerator when + # per-stage attribution isn't available. total_system_energy_j = avg_power_w * num_gpus * duration - joules_per_output_token = total_system_energy_j / total_output total_tokens = total_output + total_input - joules_per_total_token = ( - total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token - ) + + joules_per_input_token: float | None = None + + if disagg and power_by_worker is not None: + stage = _disagg_stage_energies(power_by_worker, duration) + if stage is not None: + prefill_energy_j, decode_energy_j = stage + # Per-stage attribution: prefill workers process input tokens, + # decode workers process output tokens. Strictly more accurate + # than total-energy ratios when prefill/decode have different + # per-GPU power profiles (typical: prefill is compute-bound and + # draws more than memory-bound decode). 
+ joules_per_output_token = decode_energy_j / total_output + joules_per_input_token = ( + prefill_energy_j / total_input if total_input > 0 else None + ) + joules_per_total_token = ( + (prefill_energy_j + decode_energy_j) / total_tokens + if total_tokens > 0 + else joules_per_output_token + ) + else: + # disagg=true but workers don't split into prefill+decode (e.g. + # only one role's CSVs survived). Fall back to cluster math. + joules_per_output_token = total_system_energy_j / total_output + joules_per_total_token = ( + total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token + ) + else: + # Single-node or non-disagg multinode: keep the cluster-wide ratios + # backward-compatible with everything that consumed the pre-disagg + # schema. + joules_per_output_token = total_system_energy_j / total_output + joules_per_total_token = ( + total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token + ) if not agg_result.is_file(): print( @@ -337,18 +550,32 @@ def run(csv_path: Path | Iterable[Path], bench_result: Path, agg_result: Path) - try: patch_agg_result( - agg_result, avg_power_w, joules_per_output_token, joules_per_total_token + agg_result, + avg_power_w, + joules_per_output_token, + joules_per_total_token, + joules_per_input_token=joules_per_input_token, + power_by_worker=power_by_worker, ) except (OSError, json.JSONDecodeError) as exc: print(f"[aggregate_power] Failed to patch {agg_result}: {exc}", file=sys.stderr) return 0 + worker_summary = ( + f"workers={len(power_by_worker)}" if power_by_worker else "workers=cluster-only" + ) + jpit_summary = ( + f"joules_per_input_token={joules_per_input_token:.4f} " + if joules_per_input_token is not None + else "" + ) print( f"[aggregate_power] avg_power_w={avg_power_w:.2f} (per GPU, n={num_gpus}) " f"joules_per_output_token={joules_per_output_token:.4f} " + f"{jpit_summary}" f"joules_per_total_token={joules_per_total_token:.4f} " f"duration={duration:.1f}s 
output_tokens={total_output} input_tokens={total_input} " - f"-> {agg_result}" + f"{worker_summary} -> {agg_result}" ) return 0 @@ -382,6 +609,14 @@ def main() -> int: required=True, help="Path to the agg_.json output of process_result.py (will be patched in place)", ) + parser.add_argument( + "--disagg", + action="store_true", + help="Treat as disaggregated inference: emit joules_per_input_token using " + "per-stage energy attribution (prefill workers' energy / input tokens, " + "decode workers' energy / output tokens). Requires CSV filenames to carry " + "the perfmon role/index encoding.", + ) args = parser.parse_args() if args.csv_glob: @@ -392,8 +627,13 @@ def main() -> int: file=sys.stderr, ) return 0 - return run(paths, args.bench_result, args.agg_result) - return run(args.csv or Path("/workspace/gpu_metrics.csv"), args.bench_result, args.agg_result) + return run(paths, args.bench_result, args.agg_result, disagg=args.disagg) + return run( + args.csv or Path("/workspace/gpu_metrics.csv"), + args.bench_result, + args.agg_result, + disagg=args.disagg, + ) if __name__ == "__main__": diff --git a/utils/process_result.py b/utils/process_result.py index 0510fe023..3413d5e77 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -142,23 +142,27 @@ def get_required_env_vars(required_vars): import glob as _glob_module from aggregate_power import run as _aggregate_power_run - # Multinode path: srt-slurm launchers set GPU_METRICS_CSV_GLOB after the job - # to a shell glob expanding to one perf_samples_.csv per worker node. - # Takes precedence over the single-CSV fallback — if the launcher set the - # glob, the run was multinode and there is no single-CSV fallback to make. + # Two mutually-exclusive sources, decided up front. 
If GPU_METRICS_CSV_GLOB + # is set, the run is multinode (the launcher set it deliberately) and we + # MUST NOT fall back to single-CSV — a stale gpu_metrics.csv left over from + # a previous single-node run on the same runner pod would silently publish + # wrong power numbers for the multinode run. _csv_arg = None _glob_pattern = os.environ.get('GPU_METRICS_CSV_GLOB') if _glob_pattern: + # Multinode path: glob to per-node perf_samples_<role>_w<idx>_<host>.csv. _matched = sorted(Path(p) for p in _glob_module.glob(_glob_pattern)) if _matched: _csv_arg = _matched else: print( - f'[process_result] GPU_METRICS_CSV_GLOB={_glob_pattern!r} matched no files', + f'[process_result] GPU_METRICS_CSV_GLOB={_glob_pattern!r} matched no files ' + f'— skipping power aggregation (NOT falling back to single-CSV: the launcher ' + f'set the glob, indicating a multinode run; any single-CSV present would be ' + f'stale single-node data)', file=sys.stderr, ) - - if _csv_arg is None: + else: # Single-node path: gpu_metrics.csv written by start_gpu_monitor in the # bench container. 
_csv_candidates = [ @@ -176,6 +180,7 @@ def get_required_env_vars(required_vars): csv_path=_csv_arg, bench_result=Path(f'{result_filename}.json'), agg_result=agg_path, + disagg=disagg, ) except Exception as exc: # noqa: BLE001 — never block on telemetry print(f'[process_result] power aggregation skipped: {exc}', file=sys.stderr) diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py index b6f040ce8..ed6ca69ab 100644 --- a/utils/test_aggregate_power.py +++ b/utils/test_aggregate_power.py @@ -24,9 +24,11 @@ from aggregate_power import ( # noqa: E402 _detect_columns, + _parse_perfmon_label, _parse_power, _parse_timestamp, aggregate_power, + aggregate_power_by_worker, patch_agg_result, run, ) @@ -679,3 +681,367 @@ def test_run_skips_when_duration_missing(tmp_path: Path): assert run([csv], bench, agg) == 0 assert "avg_power_w" not in json.loads(agg.read_text()) + + +# --------------------------------------------------------------------------- # +# Perfmon filename label parsing — drives per-worker grouping +# --------------------------------------------------------------------------- # + + +def test_parse_perfmon_label_prefill(tmp_path: Path): + role, idx, host = _parse_perfmon_label(tmp_path / "perf_samples_prefill_w0_node1.csv") + assert (role, idx, host) == ("prefill", 0, "node1") + + +def test_parse_perfmon_label_decode_high_worker_idx(tmp_path: Path): + """Worker index can be multi-digit (e.g. 
16-way prefill).""" + role, idx, host = _parse_perfmon_label(tmp_path / "perf_samples_decode_w15_node-42.csv") + assert (role, idx, host) == ("decode", 15, "node-42") + + +def test_parse_perfmon_label_host_with_hyphens_and_digits(tmp_path: Path): + """CoreWeave-style hostnames like `slurm-compute-gpu-019-42b` must round-trip.""" + role, idx, host = _parse_perfmon_label( + tmp_path / "perf_samples_prefill_w3_slurm-compute-gpu-019-42b.csv" + ) + assert (role, idx, host) == ("prefill", 3, "slurm-compute-gpu-019-42b") + + +def test_parse_perfmon_label_agg_role(tmp_path: Path): + """Non-disagg multinode uses role='agg' (not prefill/decode).""" + role, idx, host = _parse_perfmon_label(tmp_path / "perf_samples_agg_w0_node1.csv") + assert (role, idx, host) == ("agg", 0, "node1") + + +def test_parse_perfmon_label_frontend_role(tmp_path: Path): + """Head-only nodes (no backend workers) get role='frontend'.""" + role, idx, host = _parse_perfmon_label(tmp_path / "perf_samples_frontend_w0_head.csv") + assert (role, idx, host) == ("frontend", 0, "head") + + +def test_parse_perfmon_label_unlabeled_returns_none(tmp_path: Path): + """Single-node `gpu_metrics.csv` doesn't match — caller should treat as None.""" + assert _parse_perfmon_label(tmp_path / "gpu_metrics.csv") is None + assert _parse_perfmon_label(tmp_path / "perf_samples_node1.csv") is None + assert _parse_perfmon_label(tmp_path / "perf_samples_unknownrole_w0_host.csv") is None + + +# --------------------------------------------------------------------------- # +# Per-worker aggregation — groups node-CSVs by (role, worker_idx) +# --------------------------------------------------------------------------- # + + +def test_aggregate_power_by_worker_one_csv_per_worker(tmp_path: Path): + """4 prefill workers (one per node) + 1 decode worker on a single node. 
+ + Reflects the smallest disagg topology — every CSV is its own worker.""" + base = 1_700_000_000.0 + for w in range(4): + _write_nvidia_csv( + tmp_path / f"perf_samples_prefill_w{w}_pnode{w}.csv", + [(base + s, gpu, 600.0) for s in range(3) for gpu in range(4)], + ) + _write_nvidia_csv( + tmp_path / "perf_samples_decode_w0_dnode0.csv", + [(base + s, gpu, 400.0) for s in range(3) for gpu in range(4)], + ) + + workers = aggregate_power_by_worker( + list(tmp_path.glob("perf_samples_*.csv")), base, base + 10 + ) + assert workers is not None + # Ordered: prefill (w0..w3), then decode (w0). + assert [w["role"] for w in workers] == ["prefill"] * 4 + ["decode"] + assert [w["worker_idx"] for w in workers] == [0, 1, 2, 3, 0] + # Each worker is 4 GPUs at its respective wattage. + for w in workers[:4]: + assert w["num_gpus"] == 4 + assert w["avg_power_w"] == pytest.approx(600.0) + assert len(w["hosts"]) == 1 + assert workers[4]["num_gpus"] == 4 + assert workers[4]["avg_power_w"] == pytest.approx(400.0) + + +def test_aggregate_power_by_worker_one_worker_spans_multiple_nodes(tmp_path: Path): + """Decode_w0 spans 4 nodes × 4 GPUs = 16 GPUs. + + Mirrors the typical wide-EP DSV4 topology (gpus_per_decode=16, + decode_workers=1). 
All 4 node-CSVs share the same (role, worker_idx) + and must collapse into ONE worker entry with num_gpus=16.""" + base = 1_700_000_000.0 + hosts = ["dnode0", "dnode1", "dnode2", "dnode3"] + for h in hosts: + _write_nvidia_csv( + tmp_path / f"perf_samples_decode_w0_{h}.csv", + [(base + s, gpu, 400.0) for s in range(3) for gpu in range(4)], + ) + + workers = aggregate_power_by_worker( + list(tmp_path.glob("perf_samples_*.csv")), base, base + 10 + ) + assert workers is not None + assert len(workers) == 1 + w = workers[0] + assert w["role"] == "decode" + assert w["worker_idx"] == 0 + assert w["num_gpus"] == 16 # 4 nodes × 4 GPUs + assert w["avg_power_w"] == pytest.approx(400.0) + assert w["hosts"] == sorted(hosts) + + +def test_aggregate_power_by_worker_returns_none_when_no_labels(tmp_path: Path): + """Single-node `gpu_metrics.csv` has no perfmon label — returns None. + + Caller (run()) then omits power_by_worker from the agg JSON entirely.""" + base = 1_700_000_000.0 + csv = tmp_path / "gpu_metrics.csv" + _write_nvidia_csv(csv, [(base + s, gpu, 500.0) for s in range(3) for gpu in range(4)]) + assert aggregate_power_by_worker([csv], base, base + 10) is None + + +def test_aggregate_power_by_worker_returns_none_for_empty_input(tmp_path: Path): + assert aggregate_power_by_worker([], 0.0, 100.0) is None + + +def test_aggregate_power_by_worker_skips_unlabeled_silently(tmp_path: Path): + """Mixed input: one labeled CSV + one unlabeled. 
Only labeled is grouped.""" + base = 1_700_000_000.0 + labeled = tmp_path / "perf_samples_prefill_w0_n1.csv" + unlabeled = tmp_path / "gpu_metrics.csv" + _write_nvidia_csv(labeled, [(base + s, gpu, 600.0) for s in range(3) for gpu in range(4)]) + _write_nvidia_csv(unlabeled, [(base + s, gpu, 999.0) for s in range(3) for gpu in range(4)]) + + workers = aggregate_power_by_worker([labeled, unlabeled], base, base + 10) + assert workers is not None + assert len(workers) == 1 + assert workers[0]["role"] == "prefill" + # Unlabeled CSV's wattage must not bleed into the prefill worker. + assert workers[0]["avg_power_w"] == pytest.approx(600.0) + + +# --------------------------------------------------------------------------- # +# End-to-end disagg: run(..., disagg=True) emits per-worker + per-stage J/token +# --------------------------------------------------------------------------- # + + +def test_run_disagg_emits_power_by_worker_and_per_stage_joules(tmp_path: Path): + """Full disagg pipeline: per-worker breakdown + per-stage J/input + J/output. + + Topology: 2 prefill workers × 4 GPUs @ 600W, 1 decode worker × 8 GPUs @ 400W. 
+ Over a 10s bench window with 8000 input + 1000 output tokens: + - prefill energy = 600 × 8 × 10 = 48_000 J → J/input = 48_000 / 8000 = 6.0 + - decode energy = 400 × 8 × 10 = 32_000 J → J/output = 32_000 / 1000 = 32.0 + - total energy = 80_000 J → J/total = 80_000 / 9000 ≈ 8.889 + Cluster-wide avg_power_w stays the weighted mean across all 16 GPUs.""" + base = 1_700_000_000.0 + _write_nvidia_csv( + tmp_path / "perf_samples_prefill_w0_pn0.csv", + [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(4)], + ) + _write_nvidia_csv( + tmp_path / "perf_samples_prefill_w1_pn1.csv", + [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(4)], + ) + _write_nvidia_csv( + tmp_path / "perf_samples_decode_w0_dn0.csv", + [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(4)], + ) + _write_nvidia_csv( + tmp_path / "perf_samples_decode_w0_dn1.csv", + [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(4)], + ) + + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result( + bench, + start=base, + end=base + 10, + duration=10.0, + total_output=1000, + total_input=8000, + ) + agg.write_text(json.dumps({"hw": "gb300", "disagg": True}), encoding="utf-8") + + assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 + patched = json.loads(agg.read_text()) + + # Cluster-wide avg = (8*600 + 8*400) / 16 = 500W. + assert patched["avg_power_w"] == pytest.approx(500.0) + + # Per-stage J/token: prefill energy / input, decode energy / output. 
+ assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000) # 6.0 + assert patched["joules_per_output_token"] == pytest.approx(32_000 / 1000) # 32.0 + assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000) # ≈ 8.889 + + workers = patched["power_by_worker"] + assert [w["role"] for w in workers] == ["prefill", "prefill", "decode"] + assert [w["worker_idx"] for w in workers] == [0, 1, 0] + # Decode_w0 collapsed across 2 hosts → 8 GPUs total. + decode = workers[2] + assert decode["num_gpus"] == 8 + assert decode["avg_power_w"] == pytest.approx(400.0) + assert decode["hosts"] == ["dn0", "dn1"] + # Each prefill worker is one node, 4 GPUs. + for w in workers[:2]: + assert w["num_gpus"] == 4 + assert w["avg_power_w"] == pytest.approx(600.0) + assert len(w["hosts"]) == 1 + + +def test_run_disagg_excludes_frontend_from_per_stage_energy(tmp_path: Path): + """A frontend-only node's power must not contribute to J/input or J/output. + + Frontend nodes don't run any backend worker — their (typically near-idle) + GPU draw would skew per-stage attribution if counted. They still appear + in power_by_worker for observability.""" + base = 1_700_000_000.0 + # Prefill worker — 4 GPUs @ 600W → 24_000 J in 10s + _write_nvidia_csv( + tmp_path / "perf_samples_prefill_w0_pn0.csv", + [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(4)], + ) + # Decode worker — 4 GPUs @ 400W → 16_000 J + _write_nvidia_csv( + tmp_path / "perf_samples_decode_w0_dn0.csv", + [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(4)], + ) + # Frontend node — would erroneously add 4_000 J if counted. 
+ _write_nvidia_csv( + tmp_path / "perf_samples_frontend_w0_head.csv", + [(base + 1 + s, gpu, 100.0) for s in range(8) for gpu in range(4)], + ) + + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result( + bench, start=base, end=base + 10, duration=10.0, total_output=1000, total_input=8000 + ) + agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") + + assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 + patched = json.loads(agg.read_text()) + + # J/input = 24_000 / 8000 = 3.0 (frontend excluded). + assert patched["joules_per_input_token"] == pytest.approx(3.0) + # J/output = 16_000 / 1000 = 16.0 (frontend excluded). + assert patched["joules_per_output_token"] == pytest.approx(16.0) + # Frontend still appears in the worker list for observability. + roles = [w["role"] for w in patched["power_by_worker"]] + assert "frontend" in roles + + +def test_run_non_disagg_omits_joules_per_input_token(tmp_path: Path): + """Non-disagg runs (single-node or multinode-agg) keep the legacy schema. + + No joules_per_input_token field — it'd be meaningless without a prefill + stage to attribute energy to. Existing fields must keep their pre-disagg + semantics (total_system_energy / token_count).""" + base = 1_700_000_000.0 + csv = tmp_path / "gpu_metrics.csv" + _write_nvidia_csv( + csv, [(base + 1 + s, gpu, 500.0) for s in range(8) for gpu in range(8)] + ) + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result( + bench, start=base, end=base + 10, duration=10.0, total_output=20_000 + ) + agg.write_text(json.dumps({"hw": "h200"}), encoding="utf-8") + + assert run(csv, bench, agg, disagg=False) == 0 + patched = json.loads(agg.read_text()) + assert "joules_per_input_token" not in patched + assert "power_by_worker" not in patched + # Legacy semantics: total energy / token count. 
+ assert patched["joules_per_output_token"] == pytest.approx(2.0) + assert patched["joules_per_total_token"] == pytest.approx(2.0) + + +def test_run_disagg_falls_back_to_cluster_when_only_one_stage_present(tmp_path: Path): + """If only prefill or only decode CSVs survived, per-stage attribution + isn't possible — must fall back to cluster-wide ratios so the run still + publishes something useful instead of dropping the field entirely.""" + base = 1_700_000_000.0 + # Only prefill CSVs — decode is missing entirely. + _write_nvidia_csv( + tmp_path / "perf_samples_prefill_w0_pn0.csv", + [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(4)], + ) + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result( + bench, start=base, end=base + 10, duration=10.0, total_output=1000, total_input=8000 + ) + agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") + + assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 + patched = json.loads(agg.read_text()) + # power_by_worker still emitted (one prefill worker). + assert len(patched["power_by_worker"]) == 1 + # J/input absent (no per-stage attribution possible). + assert "joules_per_input_token" not in patched + # J/output falls back to cluster-wide (total_energy / output_tokens). 
+ assert patched["joules_per_output_token"] == pytest.approx(24_000 / 1000) + + +def test_run_disagg_handles_zero_input_tokens(tmp_path: Path): + """total_input_tokens=0 (rare degenerate case) → joules_per_input_token + omitted, no ZeroDivisionError.""" + base = 1_700_000_000.0 + _write_nvidia_csv( + tmp_path / "perf_samples_prefill_w0_pn0.csv", + [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(4)], + ) + _write_nvidia_csv( + tmp_path / "perf_samples_decode_w0_dn0.csv", + [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(4)], + ) + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result( + bench, start=base, end=base + 10, duration=10.0, total_output=1000, total_input=0 + ) + agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") + + assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 + patched = json.loads(agg.read_text()) + assert "joules_per_input_token" not in patched + assert patched["joules_per_output_token"] == pytest.approx(16_000 / 1000) + + +def test_patch_agg_result_with_per_worker_and_per_stage(tmp_path: Path): + """patch_agg_result emits the new optional fields when supplied.""" + agg = tmp_path / "agg.json" + agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") + workers = [ + {"role": "prefill", "worker_idx": 0, "hosts": ["pn0"], "num_gpus": 4, "avg_power_w": 600.0}, + {"role": "decode", "worker_idx": 0, "hosts": ["dn0"], "num_gpus": 4, "avg_power_w": 400.0}, + ] + patch_agg_result( + agg, + avg_power_w=500.0, + joules_per_output_token=16.0, + joules_per_total_token=4.44, + joules_per_input_token=3.0, + power_by_worker=workers, + ) + data = json.loads(agg.read_text()) + assert data["avg_power_w"] == 500.0 + assert data["joules_per_input_token"] == 3.0 + assert data["power_by_worker"] == workers + + +def test_patch_agg_result_omits_optional_fields_when_none(tmp_path: Path): + """Backward compat: caller passing None for new fields → fields absent.""" 
+ agg = tmp_path / "agg.json" + agg.write_text(json.dumps({"hw": "h200"}), encoding="utf-8") + patch_agg_result( + agg, + avg_power_w=400.0, + joules_per_output_token=1.5, + joules_per_total_token=0.5, + ) + data = json.loads(agg.read_text()) + assert "joules_per_input_token" not in data + assert "power_by_worker" not in data diff --git a/utils/test_process_result.py b/utils/test_process_result.py index 61d3b45fc..6b3fc9a94 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -754,3 +754,94 @@ def test_multinode_csv_glob_empty_match_falls_through_silently(self, tmp_path, s agg_path = tmp_path / "agg_benchmark_result.json" patched = json.loads(agg_path.read_text()) assert "avg_power_w" not in patched + + def test_disagg_multinode_emits_per_worker_and_per_stage_joules(self, tmp_path, multinode_env_vars): + """End-to-end disagg wiring: DISAGG=true + per-node labeled CSVs → + process_result.py passes disagg through to aggregate_power, which emits + power_by_worker + joules_per_input_token using per-stage attribution. 
+ + Without the disagg=disagg propagation in process_result.py, the run + would silently fall back to cluster-wide joules math and the user-facing + per-stage J/input metric would be missing.""" + start, end = 1_700_000_100.0, 1_700_000_160.0 # 60s bench window + # 1 prefill worker × 4 GPUs @ 600W on its own node + self._write_nvidia_csv( + tmp_path / "perf_samples_prefill_w0_pn0.csv", + start, end, watts_per_gpu=600.0, num_gpus=4, + ) + # 1 decode worker × 4 GPUs @ 400W on its own node + self._write_nvidia_csv( + tmp_path / "perf_samples_decode_w0_dn0.csv", + start, end, watts_per_gpu=400.0, num_gpus=4, + ) + + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 64, + "total_token_throughput": 1000.0, + "output_throughput": 500.0, + "benchmark_start_time_unix": start, + "benchmark_end_time_unix": end, + "duration": 60.0, + "total_output_tokens": 30_000, + "total_input_tokens": 240_000, + } + env = { + **multinode_env_vars, + "GPU_METRICS_CSV_GLOB": str(tmp_path / "perf_samples_*.csv"), + } + + result = run_script(tmp_path, env, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + patched = json.loads((tmp_path / "agg_benchmark_result.json").read_text()) + + # Per-stage attribution: prefill_energy / input, decode_energy / output. + # Prefill: 600 × 4 × 60 = 144_000 J → / 240_000 = 0.6 J/input_tok. + # Decode: 400 × 4 × 60 = 96_000 J → / 30_000 = 3.2 J/output_tok. + assert patched["joules_per_input_token"] == pytest.approx(0.6, abs=0.01) + assert patched["joules_per_output_token"] == pytest.approx(3.2, abs=0.01) + + # Per-worker breakdown labeled with role. + workers = patched["power_by_worker"] + assert {w["role"] for w in workers} == {"prefill", "decode"} + for w in workers: + assert w["num_gpus"] == 4 + assert w["worker_idx"] == 0 + + def test_non_disagg_multinode_keeps_cluster_wide_joules_math(self, tmp_path, multinode_env_vars): + """Multinode but DISAGG=false → keep cluster-wide ratios, no J/input. 
+ + Sanity check that the disagg flag is the gate, not just multinode-ness.""" + start, end = 1_700_000_100.0, 1_700_000_160.0 + self._write_nvidia_csv( + tmp_path / "perf_samples_agg_w0_n0.csv", + start, end, watts_per_gpu=500.0, num_gpus=4, + ) + + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 64, + "total_token_throughput": 1000.0, + "output_throughput": 500.0, + "benchmark_start_time_unix": start, + "benchmark_end_time_unix": end, + "duration": 60.0, + "total_output_tokens": 30_000, + "total_input_tokens": 240_000, + } + # Multinode env, but DISAGG=false → non-disagg multinode (rare but valid). + env = { + **multinode_env_vars, + "DISAGG": "false", + "GPU_METRICS_CSV_GLOB": str(tmp_path / "perf_samples_*.csv"), + } + + result = run_script(tmp_path, env, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + patched = json.loads((tmp_path / "agg_benchmark_result.json").read_text()) + assert "joules_per_input_token" not in patched + # power_by_worker still emitted (filename labels exist) — useful for + # observability even on non-disagg runs. + assert patched["power_by_worker"][0]["role"] == "agg" From 1af17ab305bc7b5e99f87c1b81ce4e94557f5d01 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 11:08:07 -0700 Subject: [PATCH 07/14] chore(perf-changelog): re-trigger sweep for per-worker power aggregation Workflow's paths: filter only fires on perf-changelog.yaml. This bumps the dsv4-fp4-gb300-dynamo-sglang entry so the sweep picks up the new per-worker power + per-stage J/token aggregation from 24f46ffe. Co-Authored-By: Claude Opus 4.7 --- perf-changelog.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9fdae2fd6..506862307 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3198,4 +3198,5 @@ description: - "Smoke run validating multinode measured-power aggregation (PR #1574). 
No config change; entry exists to trigger a sweep that produces the first multinode agg JSON with avg_power_w + joules_per_*_token populated from per-node srt-slurm perfmon CSVs. Validates per-source GPU-id namespacing in aggregate_power.py (without it, 14 nodes × 4 GPUs would report num_gpus=4 instead of 56) and the GPU_METRICS_CSV_GLOB env var bridge in process_result.py. Only the gb300-cw runner has the perfmon launcher changes; any gb300-nv runs in the sweep will succeed normally without power fields, which the dashboard handles gracefully (chart gates on field presence)." - "Re-run after launcher recurse-glob fix (6da2f1b6) — prior sweep (#26548110246) completed green at the workflow level but produced 0 measured-power rows because the flat *.yaml glob in the monitoring-injection loop matched zero recipes (recipes live in 8k1k/ subdir). Fix uses `find -type f -name '*.yaml'`. Also re-pointed SemiAnalysisAI/srt-slurm@feat/inferencex-perfmon onto current NVIDIA/srt-slurm main so the launcher's `default_bash_preamble:` srtslurm.yaml field is accepted by srtctl schema." + - "Re-run after per-worker aggregation (24f46ffe) — validates new agg JSON fields: power_by_worker[] with role labels (prefill/decode/agg/frontend) parsed from srt-slurm perfmon CSV filenames, and joules_per_input_token using per-stage energy attribution (prefill_energy / input_tokens). joules_per_output_token and joules_per_total_token now use per-stage math for disagg runs. Backward compatible: single-node and non-disagg multinode keep cluster-wide ratios." 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574 From 5b3bcbb6055bd829fbd101ab3e4284f4ebdba3b6 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 13:30:14 -0700 Subject: [PATCH 08/14] feat(power): realign agg JSON fields with InferenceX-app METRIC_KEYS + add temp/util/mem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Realigns the per-worker / per-stage schema introduced in 06558b9c to match the canonical METRIC_KEYS already declared in InferenceX-app (packages/app/src/lib/metric-keys.ts). Previously this PR overrode cluster-wide joules_per_output_token for disagg runs, which would silently shift the meaning of a shared field. New per-stage values are emitted as separate flat scalars so the cluster keys stay byte-stable. Schema changes: - Revert disagg override on joules_per_output_token and joules_per_total_token — both are now ALWAYS cluster-wide (total_system_energy / token_count), matching single-node math and the frontend's existing axis labels. - Add new disagg-only flat scalars (already in frontend METRIC_KEYS): prefill_avg_power_w cluster mean across prefill workers decode_avg_power_w cluster mean across decode workers joules_per_output_token_decode decode_energy / output_tokens joules_per_input_token unchanged (prefill_energy / input_tokens). - Rename power_by_worker[] -> workers[] to match InferenceX-app's BenchmarkRow.workers / WorkerPower interface. - Each workers[] entry extended with per-worker telemetry: avg_temp_c, peak_temp_c, avg_util_pct, avg_mem_used_mb - Add matching cluster-wide telemetry scalars (per-GPU mean, omitted when CSV lacks the column). Implementation: - _read_samples + _aggregate_rows refactored to extract all metric columns in one pass (single-vendor regex per metric, gracefully degrades when a column is absent). - aggregate_power() preserved as a thin compat wrapper returning the old (power, num_gpus) tuple so external callers don't break. 
- Per-stage prefill_avg_power_w / decode_avg_power_w use weighted mean by num_gpus (matches how cluster avg_power_w is computed). - Frontend-labeled CSVs still excluded from per-stage energy attribution; included in cluster totals. Tests: 107/107 pass (88 existing baseline preserved, 14 new telemetry tests, 5 schema-renamed tests updated in place). New coverage: temp / util / mem extraction across NVIDIA + AMD + srt-slurm CSV schemas, peak vs avg distinction, missing-column graceful degradation, per- worker telemetry, per-stage weighted-mean scalars. Co-Authored-By: Claude Opus 4.7 --- perf-changelog.yaml | 15 +- utils/aggregate_power.py | 504 ++++++++++++++++++------- utils/test_aggregate_power.py | 677 ++++++++++++++++++++++++++++++++-- utils/test_process_result.py | 39 +- 4 files changed, 1053 insertions(+), 182 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 506862307..62bedab67 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3198,5 +3198,18 @@ description: - "Smoke run validating multinode measured-power aggregation (PR #1574). No config change; entry exists to trigger a sweep that produces the first multinode agg JSON with avg_power_w + joules_per_*_token populated from per-node srt-slurm perfmon CSVs. Validates per-source GPU-id namespacing in aggregate_power.py (without it, 14 nodes × 4 GPUs would report num_gpus=4 instead of 56) and the GPU_METRICS_CSV_GLOB env var bridge in process_result.py. Only the gb300-cw runner has the perfmon launcher changes; any gb300-nv runs in the sweep will succeed normally without power fields, which the dashboard handles gracefully (chart gates on field presence)." - "Re-run after launcher recurse-glob fix (6da2f1b6) — prior sweep (#26548110246) completed green at the workflow level but produced 0 measured-power rows because the flat *.yaml glob in the monitoring-injection loop matched zero recipes (recipes live in 8k1k/ subdir). Fix uses `find -type f -name '*.yaml'`. 
Also re-pointed SemiAnalysisAI/srt-slurm@feat/inferencex-perfmon onto current NVIDIA/srt-slurm main so the launcher's `default_bash_preamble:` srtslurm.yaml field is accepted by srtctl schema." - - "Re-run after per-worker aggregation (24f46ffe) — validates new agg JSON fields: power_by_worker[] with role labels (prefill/decode/agg/frontend) parsed from srt-slurm perfmon CSV filenames, and joules_per_input_token using per-stage energy attribution (prefill_energy / input_tokens). joules_per_output_token and joules_per_total_token now use per-stage math for disagg runs. Backward compatible: single-node and non-disagg multinode keep cluster-wide ratios." + - "Re-run after per-worker aggregation (24f46ffe) — validates new agg JSON fields: workers[] with role labels (prefill/decode/agg/frontend) parsed from srt-slurm perfmon CSV filenames, plus per-stage scalars (prefill_avg_power_w, decode_avg_power_w, joules_per_input_token = prefill_energy / input_tokens, joules_per_output_token_decode = decode_energy / output_tokens). joules_per_output_token and joules_per_total_token stay cluster-wide on all topologies so the metric is comparable across single-node, multinode-agg, and multinode-disagg. Per-stage scalars emitted only for disagg runs with both prefill and decode workers present. workers[] entries also carry per-worker avg_temp_c/peak_temp_c/avg_util_pct/avg_mem_used_mb when the CSV exposes those columns." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574 + +- config-keys: + - qwen3.5-fp8-mi355x-sglang-disagg + - glm5-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg + - kimik2.5-fp4-mi355x-vllm-disagg + - minimaxm2.5-fp8-mi355x-vllm-disagg + description: + - "Smoke run validating AMD multinode measured-power aggregation — the AMD analogue of the NVIDIA gb300/srt-slurm path (PR #1574). 
No config change; entry exists to trigger a sweep that produces the first AMD multinode agg JSONs with avg_power_w + joules_per_*_token + per-worker workers[] populated from per-node amd-smi perfmon CSVs." + - "The AMD amd_utils SLURM job has no orchestrator perfmon, so each SGLang/vLLM disagg node starts its own amd-smi monitor via start_perf_monitor (benchmarks/benchmark_lib.sh), writing perf_samples__w_.csv into the NFS-shared /benchmark_logs/perfmon mount (wired in amd_utils/job.slurm). launch_mi355x-amds.sh collects the per-node CSVs into the GH workspace before the EXIT trap wipes the logs dir and sets GPU_METRICS_CSV_GLOB so the existing Process-result step runs the same vendor-agnostic utils/aggregate_power.py used for NVIDIA: per-source GPU-id namespacing (8 GPUs/node on MI355X, so a TP16 worker over 2 nodes counts 16 GPUs not 8), per-stage prefill/decode energy attribution, and per-worker temp/util/mem when amd-smi exposes those columns." + - "Covers both engine paths: SGLang disagg (server_sglang.sh role = NODE_RANK bucketed by PREFILL_NODES_PER_WORKER / NODE_OFFSET) and vLLM disagg (server_vllm.sh one worker per node, ranks [0,xP) prefill / [xP,xP+yD) decode). Monitoring is best-effort end-to-end — a missing amd-smi or empty CSV skips power patching without failing the benchmark upload; DISAGG=true threads through to per-stage attribution while agg/non-disagg runs still get cluster-wide power." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574 diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py index 962c9167d..ee4327b83 100644 --- a/utils/aggregate_power.py +++ b/utils/aggregate_power.py @@ -1,44 +1,75 @@ -"""Aggregate measured GPU power from a vendor SMI CSV into the agg result JSON. +"""Aggregate measured GPU telemetry (power, temp, utilization, memory) from a +vendor SMI CSV into the agg result JSON. 
Reads a GPU-metrics CSV produced by `start_gpu_monitor` (nvidia-smi or amd-smi) or by srt-slurm's per-node perfmon (multinode), filters samples to the benchmark load window using start/end Unix timestamps written by benchmark_serving.py, and -patches the aggregated result JSON with cluster-wide and per-worker power data +patches the aggregated result JSON with cluster-wide and per-worker telemetry consumed by InferenceX-app's ETL. Cluster-wide fields (always written when any power data exists): - avg_power_w: mean per-GPU power draw (W) during the load window - - joules_per_output_token: energy / total_output_tokens - - joules_per_total_token: energy / (input + output) tokens - -For disaggregated multinode runs (DISAGG=true), the numerator for the J/token -metrics shifts to a per-stage attribution: prefill workers' energy is divided -by input tokens, decode workers' energy by output tokens. Per-stage power is -where the meaningful efficiency signal lives — total-energy ratios mostly just -re-scale the same number by different denominators. - - - joules_per_input_token: prefill_energy / total_input_tokens (disagg only) - - joules_per_output_token: decode_energy / total_output_tokens (overridden) - - joules_per_total_token: (prefill_energy + decode_energy) / total_tokens (overridden) - -Per-worker breakdown (multinode only — single-node has no role concept): - - power_by_worker: list of {role, worker_idx, hosts[], num_gpus, avg_power_w} - where role is "prefill", "decode", "agg", or "frontend". - -srt-slurm encodes the worker role and index in the perfmon CSV filename: -`perf_samples__w_.csv` — see srt-slurm fork's -benchmark_stage._start_perf_monitor. Filenames that don't match this pattern -(e.g. single-node `gpu_metrics.csv`) fall back to a single cluster-wide bucket. 
+ - joules_per_output_token: total_system_energy / total_output_tokens + (cluster-wide; always — same math single-node and + multinode disagg, so the metric stays comparable + across topologies in the dashboard) + - joules_per_total_token: total_system_energy / (input + output) tokens + (cluster-wide; always) + - avg_temp_c: mean per-GPU temperature (Celsius), when the + CSV exposes a temperature column + - peak_temp_c: max instantaneous per-GPU temperature in window + - avg_util_pct: mean per-GPU GPU-utilization percent + - avg_mem_used_mb: mean per-GPU memory used (MiB/MB) + +For disaggregated multinode runs (DISAGG=true) where filenames carry the perfmon +role/index encoding AND both prefill+decode workers are present, additional flat +per-stage scalars are emitted alongside (NOT instead of) the cluster-wide keys: + + - prefill_avg_power_w: per-GPU mean power across prefill workers + - decode_avg_power_w: per-GPU mean power across decode workers + - joules_per_input_token: prefill_energy / total_input_tokens + (per-stage attribution — prefill processes + input tokens, so its energy / input gives the + prefill-side per-token cost) + - joules_per_output_token_decode: decode_energy / total_output_tokens + (per-stage attribution; the _decode suffix is + load-bearing — keeps the cluster-wide + joules_per_output_token comparable across + single-node and disagg deployments and exposes + decode-only energy as a separate key for users + who specifically want it.) + +Per-worker breakdown (multinode only — single-node has no role concept), emitted +under the `workers` key to match InferenceX-app's BenchmarkRow.workers shape: + - workers: list of {role, worker_idx, hosts[], num_gpus, avg_power_w, + avg_temp_c?, peak_temp_c?, avg_util_pct?, avg_mem_used_mb?} + where role is "prefill", "decode", "agg", or "frontend". 
+ +Both multinode paths encode the worker role and index in the perfmon CSV +filename: `perf_samples_<role>_w<idx>_<host>.csv` — NVIDIA via the +srt-slurm fork's benchmark_stage._start_perf_monitor, AMD via start_perf_monitor +in benchmarks/benchmark_lib.sh (each SGLang/vLLM disagg node starts its own +amd-smi monitor). Filenames that don't match this pattern (e.g. single-node +`gpu_metrics.csv`) fall back to a single cluster-wide bucket. Multinode: accepts multiple CSV paths (one per worker node). GPU indices are namespaced by source CSV stem to avoid the same-index collision across nodes — e.g. 8 nodes each reporting indices 0..3 would otherwise be miscounted as 4 total GPUs instead of 32. -Vendor schema detection is regex-based: any timestamp-like column + any column -whose name contains "power" (excluding "limit"/"cap"/"max") is picked up. -NVIDIA emits "power.draw [W]"; AMD's amd-smi varies by version; srt-slurm's -perfmon emits "power_w". All are handled. +Vendor schema detection is regex-based: + - Power: timestamp + column whose name contains "power" (excluding + "limit"/"cap"/"max"/"min"). NVIDIA: "power.draw [W]". AMD: "socket_power". + srt-slurm: "power_w". + - Temperature: column name contains "temp". NVIDIA: "temperature.gpu". AMD: + "temperature". srt-slurm: "temp_c". Unit: Celsius. + - Utilization: column name starts with "utilization" or contains "util". + NVIDIA: "utilization.gpu". srt-slurm: "util_pct". Unit: percent. + - Memory: column name contains "mem" but not "total" (avoid "memory.total"). + NVIDIA: "memory.used [MiB]". srt-slurm: "mem_used_mb". Unit: MiB/MB. + +Power is required for aggregation to fire; the other metrics degrade gracefully +when their columns are absent (those fields are simply omitted from the output). This script is best-effort. Missing or malformed CSV exits 0 without patching so a monitoring hiccup never breaks the benchmark upload.
@@ -60,6 +91,10 @@ _POWER_COL_RE = re.compile(r"power", re.IGNORECASE) _POWER_EXCLUDE_RE = re.compile(r"limit|cap|max|min", re.IGNORECASE) +_TEMP_COL_RE = re.compile(r"temp", re.IGNORECASE) +_UTIL_COL_RE = re.compile(r"^utilization|util", re.IGNORECASE) +_MEM_COL_RE = re.compile(r"mem", re.IGNORECASE) +_MEM_EXCLUDE_RE = re.compile(r"total", re.IGNORECASE) _TIMESTAMP_COL_RE = re.compile(r"time", re.IGNORECASE) _GPU_INDEX_COL_RE = re.compile(r"^(index|gpu|gpu_id|gpu_index|card|device)$", re.IGNORECASE) _NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?") @@ -72,6 +107,11 @@ r"^perf_samples_(?Pprefill|decode|agg|frontend)_w(?P\d+)_(?P.+)$" ) +# Metric names recognized in the multi-metric row dicts. Power is special-cased +# as required; others are best-effort. +_METRICS_AVG = ("power", "temp", "util", "mem") # mean across samples +_METRICS_MAX = ("temp",) # additionally compute peak (max raw) + def _parse_timestamp(value: str) -> float | None: """Best-effort timestamp parse to Unix epoch seconds (local wall clock). @@ -107,11 +147,12 @@ def _parse_timestamp(value: str) -> float | None: return dt.astimezone(timezone.utc).timestamp() -def _parse_power(value: str) -> float | None: - """Extract the first numeric value from a power cell. +def _parse_numeric_cell(value: str) -> float | None: + """Extract the first numeric value from a cell. - nvidia-smi formats power as "412.34 W"; some configurations report - "[N/A]" when power capping is disabled. AMD reports a bare number. + Vendors decorate values with units ("412.34 W", "65 C", "85 %", "1024 MiB") + or report "[N/A]" when a sensor is unavailable. We strip and pull the first + signed-decimal token; returns None for empty / NA / non-numeric cells. """ value = value.strip() if not value or value.lower() in {"[n/a]", "n/a", "na"}: @@ -125,12 +166,19 @@ def _parse_power(value: str) -> float | None: return None +# Back-compat shim — some external callers may have imported _parse_power. 
+_parse_power = _parse_numeric_cell + + def _detect_columns(header: list[str]) -> tuple[str | None, str | None, str | None]: """Return (timestamp_col, power_col, gpu_index_col) from a CSV header. Power column: contains "power" and not "limit"/"cap"/"max"/"min". Timestamp column: contains "time". GPU index column: optional — used to count distinct GPUs per sample. + + Kept for back-compat with tests that imported _detect_columns directly; + new code uses _detect_all_columns to also pick up temp/util/mem. """ timestamp_col = next((c for c in header if _TIMESTAMP_COL_RE.search(c)), None) power_col = next( @@ -141,6 +189,39 @@ def _detect_columns(header: list[str]) -> tuple[str | None, str | None, str | No return timestamp_col, power_col, gpu_col +def _detect_all_columns(header: list[str]) -> dict[str, str | None]: + """Return a mapping of role -> column name for every metric we know about. + + Roles: timestamp, gpu, power, temp, util, mem. Missing roles map to None. + + The detection is greedy + first-match: with a vendor like NVIDIA whose + header lists `utilization.gpu` followed by `utilization.memory`, the + util slot picks the first; that's fine — we only need ONE util column and + `utilization.gpu` is the canonical one. Memory excludes "total" so + `memory.used` wins over `memory.total`. 
+ """ + timestamp_col = next((c for c in header if _TIMESTAMP_COL_RE.search(c)), None) + power_col = next( + (c for c in header if _POWER_COL_RE.search(c) and not _POWER_EXCLUDE_RE.search(c)), + None, + ) + temp_col = next((c for c in header if _TEMP_COL_RE.search(c)), None) + util_col = next((c for c in header if _UTIL_COL_RE.search(c)), None) + mem_col = next( + (c for c in header if _MEM_COL_RE.search(c) and not _MEM_EXCLUDE_RE.search(c)), + None, + ) + gpu_col = next((c for c in header if _GPU_INDEX_COL_RE.match(c.strip())), None) + return { + "timestamp": timestamp_col, + "gpu": gpu_col, + "power": power_col, + "temp": temp_col, + "util": util_col, + "mem": mem_col, + } + + def _parse_perfmon_label(path: Path) -> tuple[str, int, str] | None: """Extract (role, worker_idx, host) from a srt-slurm perfmon CSV filename. @@ -156,12 +237,16 @@ def _parse_perfmon_label(path: Path) -> tuple[str, int, str] | None: def _read_samples( path: Path, start_unix: float, end_unix: float -) -> tuple[list[tuple[float, float, str | None]], bool] | None: - """Read one CSV → list of (timestamp_bucket, power_w, gpu_id) in window. +) -> tuple[list[tuple[float, str | None, dict[str, float]]], bool] | None: + """Read one CSV → list of (timestamp_bucket, gpu_id, {metric: value}) in window. Returns (rows, saw_gpu_col) on success, None if the file is unreadable / - missing the required columns. Empty rows list is valid (file readable but - no samples landed in the window). + missing the required power column. Empty rows list is valid (file readable + but no samples landed in the window). + + Each row's metric dict carries whichever of power/temp/util/mem the CSV + exposed (power is always present — rows lacking it are skipped). Missing + metric columns simply don't appear in the dict; callers gracefully degrade. 
""" if not path.is_file() or path.stat().st_size == 0: return None @@ -170,72 +255,139 @@ def _read_samples( reader = csv.DictReader(f, skipinitialspace=True) header = [c.strip() for c in (reader.fieldnames or [])] reader.fieldnames = header - timestamp_col, power_col, gpu_col = _detect_columns(header) + cols = _detect_all_columns(header) + timestamp_col = cols["timestamp"] + power_col = cols["power"] if not timestamp_col or not power_col: return None - rows: list[tuple[float, float, str | None]] = [] + gpu_col = cols["gpu"] + # Map metric name -> CSV column. Power is required (we just + # checked); temp/util/mem are optional. + metric_cols: dict[str, str] = {"power": power_col} + for metric in ("temp", "util", "mem"): + col = cols[metric] + if col is not None: + metric_cols[metric] = col + rows: list[tuple[float, str | None, dict[str, float]]] = [] for row in reader: ts = _parse_timestamp((row.get(timestamp_col) or "").strip()) - pw = _parse_power((row.get(power_col) or "").strip()) - if ts is None or pw is None: + if ts is None: continue if ts < start_unix or ts > end_unix: continue + # Power must parse; rows with [N/A] or empty power are useless + # for aggregation (same behavior as before the multi-metric + # extension). 
+ pw = _parse_numeric_cell((row.get(power_col) or "").strip()) + if pw is None: + continue + values: dict[str, float] = {"power": pw} + for metric, col in metric_cols.items(): + if metric == "power": + continue + v = _parse_numeric_cell((row.get(col) or "").strip()) + if v is not None: + values[metric] = v gpu_id = (row.get(gpu_col) or "").strip() if gpu_col else None - rows.append((round(ts, 3), pw, gpu_id or None)) + rows.append((round(ts, 3), gpu_id or None, values)) return rows, gpu_col is not None except (OSError, csv.Error): return None def _aggregate_rows( - sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]], + sources: list[tuple[Path, list[tuple[float, str | None, dict[str, float]]], bool]], *, namespace: bool, -) -> tuple[float, int] | None: - """Merge rows across CSVs into (per_gpu_avg_power_w, num_gpus). +) -> dict | None: + """Merge rows across CSVs into a metric-dict + num_gpus. `sources` is a list of (path, rows, saw_gpu_col) for the CSVs to roll up together. Rows are bucketed by ms-rounded timestamp so nodes with sub-ms clock drift land in the same bucket. GPU indices are namespaced by the source path's stem when `namespace=True` (multi-source case) to keep same-local-index across nodes from collapsing. + + Returns a dict with at minimum {"power": float, "num_gpus": int}. Each + additional metric (temp/util/mem) is included only when at least one + source emitted it. peak_temp is the global max across the window + (instantaneous, not per-bucket-mean). """ - per_sample_total: dict[float, float] = {} - per_sample_row_count: dict[float, int] = {} + # Per-bucket totals keyed by metric name. Bucket = ms-rounded timestamp. 
+ per_sample_total: dict[str, dict[float, float]] = {m: {} for m in _METRICS_AVG} + per_sample_count: dict[str, dict[float, int]] = {m: {} for m in _METRICS_AVG} + per_sample_row_count: dict[float, int] = {} # for no-gpu-col GPU inference per_sample_gpus: dict[float, set[str]] = {} gpu_keys: set[str] = set() saw_gpu_col_any = False + saw_metric: dict[str, bool] = {m: False for m in _METRICS_AVG} + peak_per_metric: dict[str, float] = {} for path, rows, saw_gpu_col in sources: if saw_gpu_col: saw_gpu_col_any = True - for bucket, pw, gpu_id in rows: - per_sample_total[bucket] = per_sample_total.get(bucket, 0.0) + pw + for bucket, gpu_id, values in rows: per_sample_row_count[bucket] = per_sample_row_count.get(bucket, 0) + 1 + for metric, v in values.items(): + if metric not in per_sample_total: + continue + per_sample_total[metric][bucket] = ( + per_sample_total[metric].get(bucket, 0.0) + v + ) + per_sample_count[metric][bucket] = ( + per_sample_count[metric].get(bucket, 0) + 1 + ) + saw_metric[metric] = True + if metric in _METRICS_MAX: + cur = peak_per_metric.get(metric) + peak_per_metric[metric] = v if cur is None else max(cur, v) if gpu_id is not None: ns_id = f"{path.stem}:{gpu_id}" if namespace else gpu_id per_sample_gpus.setdefault(bucket, set()).add(ns_id) gpu_keys.add(ns_id) - if not per_sample_total: + if not per_sample_total["power"]: return None + # GPU count: # - If any path exposed a GPU column, trust distinct (namespaced) GPU IDs. # - Otherwise, infer from row count (one row per GPU per sample, summed # across all paths' rows that fell into the same timestamp bucket). 
if saw_gpu_col_any and gpu_keys: num_gpus = len(gpu_keys) - per_sample_mean_per_gpu = [ - total / max(len(per_sample_gpus.get(ts, ())), 1) - for ts, total in per_sample_total.items() - ] else: num_gpus = max(per_sample_row_count.values()) - per_sample_mean_per_gpu = [ - total / per_sample_row_count[ts] for ts, total in per_sample_total.items() - ] - return mean(per_sample_mean_per_gpu), num_gpus + + def _avg_per_gpu(metric: str) -> float | None: + if not saw_metric.get(metric): + return None + totals = per_sample_total[metric] + if not totals: + return None + if saw_gpu_col_any and gpu_keys: + # bucket mean = sum / distinct GPU count in that bucket + per_sample_mean = [ + total / max(len(per_sample_gpus.get(ts, ())), 1) + for ts, total in totals.items() + ] + else: + # bucket mean = sum / row count in that bucket (= GPU count when + # one row per GPU per sample, the universal vendor convention) + per_sample_mean = [ + total / per_sample_count[metric][ts] for ts, total in totals.items() + ] + return mean(per_sample_mean) if per_sample_mean else None + + result: dict = {"num_gpus": num_gpus, "power": _avg_per_gpu("power")} + for metric in ("temp", "util", "mem"): + avg = _avg_per_gpu(metric) + if avg is not None: + result[metric] = avg + # Peak (max raw value, not per-bucket-mean): meaningful for temperature + # where the worst-case GPU's hottest sample is the thermal-headroom signal. + if "temp" in peak_per_metric: + result["peak_temp"] = peak_per_metric["temp"] + return result def aggregate_power( @@ -245,6 +397,23 @@ def aggregate_power( ) -> tuple[float, int] | None: """Return (per_gpu_avg_power_w, num_gpus) for samples in [start, end]. + Backward-compatible wrapper around aggregate_metrics that returns just the + legacy (avg_power_w, num_gpus) tuple for callers (and tests) that don't + need temperature/util/memory. 
+ """ + res = aggregate_metrics(csv_path, start_unix, end_unix) + if res is None: + return None + return res["power"], res["num_gpus"] + + +def aggregate_metrics( + csv_path: Path | Iterable[Path], + start_unix: float, + end_unix: float, +) -> dict | None: + """Return a dict of cluster-wide per-GPU metrics for samples in [start, end]. + Accepts either a single Path (single-node case) or an iterable of Paths (multinode case: one CSV per worker node, all written by srt-slurm's perfmon). For multi-path inputs, GPU indices are namespaced by source @@ -254,12 +423,15 @@ def aggregate_power( Returns None if no CSVs are usable, none have a detectable power column, or no rows fall in the window across all paths. + + Result keys: num_gpus, power (always when not None); temp, util, mem, + peak_temp (only when the corresponding column existed in at least one CSV). """ paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path) if not paths or end_unix <= start_unix: return None - sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]] = [] + sources: list[tuple[Path, list[tuple[float, str | None, dict[str, float]]], bool]] = [] for path in paths: read = _read_samples(path, start_unix, end_unix) if read is None: @@ -277,9 +449,13 @@ def aggregate_power_by_worker( start_unix: float, end_unix: float, ) -> list[dict] | None: - """Group CSVs by (role, worker_idx) and return per-worker power rollups. + """Group CSVs by (role, worker_idx) and return per-worker telemetry rollups. + + Each entry: {role, worker_idx, hosts: sorted list, num_gpus, avg_power_w, + avg_temp_c?, peak_temp_c?, avg_util_pct?, avg_mem_used_mb?}. + The optional fields appear only when the CSVs for that worker carried + temperature / utilization / memory columns. - Each entry: {role, worker_idx, hosts: sorted list, num_gpus, avg_power_w}. Returns None if no CSVs have parseable filenames OR no labeled CSV yields usable samples. 
Unlabeled CSVs in the input are silently skipped — they can't be attributed to a worker. @@ -309,7 +485,7 @@ def aggregate_power_by_worker( out: list[dict] = [] for (role, worker_idx), worker_paths in by_worker.items(): - sources: list[tuple[Path, list[tuple[float, float, str | None]], bool]] = [] + sources: list[tuple[Path, list[tuple[float, str | None, dict[str, float]]], bool]] = [] for path in worker_paths: read = _read_samples(path, start_unix, end_unix) if read is None: @@ -323,16 +499,22 @@ def aggregate_power_by_worker( result = _aggregate_rows(sources, namespace=len(sources) > 1) if result is None: continue - avg_power_w, num_gpus = result - out.append( - { - "role": role, - "worker_idx": worker_idx, - "hosts": sorted(hosts_by_worker[(role, worker_idx)]), - "num_gpus": num_gpus, - "avg_power_w": round(avg_power_w, 3), - } - ) + entry: dict = { + "role": role, + "worker_idx": worker_idx, + "hosts": sorted(hosts_by_worker[(role, worker_idx)]), + "num_gpus": result["num_gpus"], + "avg_power_w": round(result["power"], 3), + } + if "temp" in result: + entry["avg_temp_c"] = round(result["temp"], 3) + if "peak_temp" in result: + entry["peak_temp_c"] = round(result["peak_temp"], 3) + if "util" in result: + entry["avg_util_pct"] = round(result["util"], 3) + if "mem" in result: + entry["avg_mem_used_mb"] = round(result["mem"], 3) + out.append(entry) if not out: return None # Stable order: role (prefill < decode < agg < frontend), then worker_idx. 
@@ -418,13 +600,21 @@ def patch_agg_result( joules_per_output_token: float, joules_per_total_token: float, joules_per_input_token: float | None = None, - power_by_worker: list[dict] | None = None, + joules_per_output_token_decode: float | None = None, + prefill_avg_power_w: float | None = None, + decode_avg_power_w: float | None = None, + avg_temp_c: float | None = None, + peak_temp_c: float | None = None, + avg_util_pct: float | None = None, + avg_mem_used_mb: float | None = None, + workers: list[dict] | None = None, ) -> None: - """Read the agg JSON, add the power keys, and write it back atomically. + """Read the agg JSON, add the telemetry keys, and write it back atomically. - `joules_per_input_token` and `power_by_worker` are optional — omitted from - the JSON when None (kept that way so single-node and non-disagg multinode - agg JSONs don't gain meaningless null fields). + All optional fields (anything except avg_power_w / joules_per_output_token / + joules_per_total_token) are omitted from the JSON when None — keeps the + pre-disagg / single-node agg JSONs from gaining meaningless null fields, and + keeps non-power-instrumented runs (e.g. no temp sensor) from emitting nulls. 
""" data = json.loads(agg_path.read_text(encoding="utf-8")) data["avg_power_w"] = round(avg_power_w, 3) @@ -432,39 +622,82 @@ def patch_agg_result( data["joules_per_total_token"] = round(joules_per_total_token, 6) if joules_per_input_token is not None: data["joules_per_input_token"] = round(joules_per_input_token, 6) - if power_by_worker is not None: - data["power_by_worker"] = power_by_worker + if joules_per_output_token_decode is not None: + data["joules_per_output_token_decode"] = round(joules_per_output_token_decode, 6) + if prefill_avg_power_w is not None: + data["prefill_avg_power_w"] = round(prefill_avg_power_w, 3) + if decode_avg_power_w is not None: + data["decode_avg_power_w"] = round(decode_avg_power_w, 3) + if avg_temp_c is not None: + data["avg_temp_c"] = round(avg_temp_c, 3) + if peak_temp_c is not None: + data["peak_temp_c"] = round(peak_temp_c, 3) + if avg_util_pct is not None: + data["avg_util_pct"] = round(avg_util_pct, 3) + if avg_mem_used_mb is not None: + data["avg_mem_used_mb"] = round(avg_mem_used_mb, 3) + if workers is not None: + data["workers"] = workers tmp_path = agg_path.with_suffix(agg_path.suffix + ".tmp") tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8") tmp_path.replace(agg_path) -def _disagg_stage_energies( - power_by_worker: list[dict], duration: float -) -> tuple[float, float] | None: - """Sum per-worker energy for prefill vs decode workers (J). +def _disagg_stage_rollup( + workers: list[dict], duration: float +) -> dict | None: + """Roll up per-worker entries into per-stage energy + per-GPU mean power. - Returns (prefill_energy_j, decode_energy_j) or None if either stage is - absent — without both stages we can't do per-stage attribution and the - caller should fall back to total-energy math. 
+ Returns a dict with keys: + - prefill_energy_j, decode_energy_j: sum of (avg_power_w * num_gpus * + duration) across workers in each role + - prefill_avg_power_w, decode_avg_power_w: per-GPU mean power weighted + by num_gpus (matches the cluster avg_power_w semantics, but scoped to + each role) + + Returns None if either stage is absent — without both stages we can't do + per-stage attribution and the caller should fall back to total-energy math. """ - prefill_e = 0.0 - decode_e = 0.0 + prefill_energy = 0.0 + decode_energy = 0.0 + prefill_gpus = 0 + decode_gpus = 0 + prefill_pw_x_gpus = 0.0 + decode_pw_x_gpus = 0.0 has_prefill = False has_decode = False - for w in power_by_worker: + for w in workers: e = w["avg_power_w"] * w["num_gpus"] * duration if w["role"] == "prefill": - prefill_e += e + prefill_energy += e + prefill_gpus += w["num_gpus"] + prefill_pw_x_gpus += w["avg_power_w"] * w["num_gpus"] has_prefill = True elif w["role"] == "decode": - decode_e += e + decode_energy += e + decode_gpus += w["num_gpus"] + decode_pw_x_gpus += w["avg_power_w"] * w["num_gpus"] has_decode = True # "frontend" / "agg" / unknown roles deliberately excluded — they - # don't belong to either stage's per-token cost. + # don't belong to either stage's per-token cost or per-stage power. if not (has_prefill and has_decode): return None - return prefill_e, decode_e + return { + "prefill_energy_j": prefill_energy, + "decode_energy_j": decode_energy, + "prefill_avg_power_w": prefill_pw_x_gpus / prefill_gpus if prefill_gpus else None, + "decode_avg_power_w": decode_pw_x_gpus / decode_gpus if decode_gpus else None, + } + + +# Backward-compat shim — the original API returned just the two energy values. 
+def _disagg_stage_energies( + workers: list[dict], duration: float +) -> tuple[float, float] | None: + res = _disagg_stage_rollup(workers, duration) + if res is None: + return None + return res["prefill_energy_j"], res["decode_energy_j"] def run( @@ -484,8 +717,8 @@ def run( start, end, duration, total_output, total_input = window paths = [csv_path] if isinstance(csv_path, Path) else list(csv_path) - result = aggregate_power(paths, start, end) - if result is None: + cluster = aggregate_metrics(paths, start, end) + if cluster is None: label = str(paths[0]) if len(paths) == 1 else f"{len(paths)} CSVs" print( f"[aggregate_power] No usable power samples in {label} for " @@ -493,53 +726,55 @@ def run( file=sys.stderr, ) return 0 - avg_power_w, num_gpus = result + avg_power_w = cluster["power"] + num_gpus = cluster["num_gpus"] + avg_temp_c = cluster.get("temp") + peak_temp_c = cluster.get("peak_temp") + avg_util_pct = cluster.get("util") + avg_mem_used_mb = cluster.get("mem") # Per-worker rollup is best-effort: only emitted when CSV filenames carry # the perfmon role/index encoding. Single-node `gpu_metrics.csv` won't # parse, so aggregate_power_by_worker returns None and the field is omitted. - power_by_worker = aggregate_power_by_worker(paths, start, end) - - # Cluster-wide energy baseline. Used as the fallback numerator when - # per-stage attribution isn't available. + workers = aggregate_power_by_worker(paths, start, end) + + # Cluster-wide energy + per-token attribution. We ALWAYS report + # joules_per_output_token / joules_per_total_token as cluster-wide ratios + # (total_system_energy / token_count), regardless of disagg. This keeps the + # metric comparable across single-node, multinode-agg, and multinode-disagg + # topologies in the dashboard. Per-stage attribution lives in separate + # *_decode / joules_per_input_token keys (only emitted when disagg AND both + # stages present). 
total_system_energy_j = avg_power_w * num_gpus * duration total_tokens = total_output + total_input + joules_per_output_token = total_system_energy_j / total_output + joules_per_total_token = ( + total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token + ) joules_per_input_token: float | None = None + joules_per_output_token_decode: float | None = None + prefill_avg_power_w: float | None = None + decode_avg_power_w: float | None = None - if disagg and power_by_worker is not None: - stage = _disagg_stage_energies(power_by_worker, duration) + if disagg and workers is not None: + stage = _disagg_stage_rollup(workers, duration) if stage is not None: - prefill_energy_j, decode_energy_j = stage # Per-stage attribution: prefill workers process input tokens, # decode workers process output tokens. Strictly more accurate # than total-energy ratios when prefill/decode have different # per-GPU power profiles (typical: prefill is compute-bound and - # draws more than memory-bound decode). - joules_per_output_token = decode_energy_j / total_output - joules_per_input_token = ( - prefill_energy_j / total_input if total_input > 0 else None + # draws more than memory-bound decode). Exposed as additional + # flat scalars so the cluster-wide joules_per_output_token stays + # comparable across topologies. + prefill_avg_power_w = stage["prefill_avg_power_w"] + decode_avg_power_w = stage["decode_avg_power_w"] + joules_per_output_token_decode = ( + stage["decode_energy_j"] / total_output ) - joules_per_total_token = ( - (prefill_energy_j + decode_energy_j) / total_tokens - if total_tokens > 0 - else joules_per_output_token - ) - else: - # disagg=true but workers don't split into prefill+decode (e.g. - # only one role's CSVs survived). Fall back to cluster math. 
- joules_per_output_token = total_system_energy_j / total_output - joules_per_total_token = ( - total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token + joules_per_input_token = ( + stage["prefill_energy_j"] / total_input if total_input > 0 else None ) - else: - # Single-node or non-disagg multinode: keep the cluster-wide ratios - # backward-compatible with everything that consumed the pre-disagg - # schema. - joules_per_output_token = total_system_energy_j / total_output - joules_per_total_token = ( - total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token - ) if not agg_result.is_file(): print( @@ -555,14 +790,21 @@ def run( joules_per_output_token, joules_per_total_token, joules_per_input_token=joules_per_input_token, - power_by_worker=power_by_worker, + joules_per_output_token_decode=joules_per_output_token_decode, + prefill_avg_power_w=prefill_avg_power_w, + decode_avg_power_w=decode_avg_power_w, + avg_temp_c=avg_temp_c, + peak_temp_c=peak_temp_c, + avg_util_pct=avg_util_pct, + avg_mem_used_mb=avg_mem_used_mb, + workers=workers, ) except (OSError, json.JSONDecodeError) as exc: print(f"[aggregate_power] Failed to patch {agg_result}: {exc}", file=sys.stderr) return 0 worker_summary = ( - f"workers={len(power_by_worker)}" if power_by_worker else "workers=cluster-only" + f"workers={len(workers)}" if workers else "workers=cluster-only" ) jpit_summary = ( f"joules_per_input_token={joules_per_input_token:.4f} " @@ -612,10 +854,12 @@ def main() -> int: parser.add_argument( "--disagg", action="store_true", - help="Treat as disaggregated inference: emit joules_per_input_token using " - "per-stage energy attribution (prefill workers' energy / input tokens, " - "decode workers' energy / output tokens). 
Requires CSV filenames to carry " - "the perfmon role/index encoding.", + help="Treat as disaggregated inference: emit prefill_avg_power_w, " + "decode_avg_power_w, joules_per_input_token, and " + "joules_per_output_token_decode using per-stage energy attribution " + "(prefill workers' energy / input tokens, decode workers' energy / " + "output tokens). Requires CSV filenames to carry the perfmon role/index " + "encoding.", ) args = parser.parse_args() diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py index ed6ca69ab..fb33ea265 100644 --- a/utils/test_aggregate_power.py +++ b/utils/test_aggregate_power.py @@ -23,10 +23,12 @@ sys.path.insert(0, str(Path(__file__).parent)) from aggregate_power import ( # noqa: E402 + _detect_all_columns, _detect_columns, _parse_perfmon_label, _parse_power, _parse_timestamp, + aggregate_metrics, aggregate_power, aggregate_power_by_worker, patch_agg_result, @@ -824,15 +826,20 @@ def test_aggregate_power_by_worker_skips_unlabeled_silently(tmp_path: Path): # --------------------------------------------------------------------------- # -def test_run_disagg_emits_power_by_worker_and_per_stage_joules(tmp_path: Path): - """Full disagg pipeline: per-worker breakdown + per-stage J/input + J/output. +def test_run_disagg_emits_workers_and_per_stage_joules(tmp_path: Path): + """Full disagg pipeline: workers[] breakdown + per-stage scalars next to + cluster-wide joules. Topology: 2 prefill workers × 4 GPUs @ 600W, 1 decode worker × 8 GPUs @ 400W. 
Over a 10s bench window with 8000 input + 1000 output tokens: - - prefill energy = 600 × 8 × 10 = 48_000 J → J/input = 48_000 / 8000 = 6.0 - - decode energy = 400 × 8 × 10 = 32_000 J → J/output = 32_000 / 1000 = 32.0 - - total energy = 80_000 J → J/total = 80_000 / 9000 ≈ 8.889 - Cluster-wide avg_power_w stays the weighted mean across all 16 GPUs.""" + - prefill energy = 600 × 8 × 10 = 48_000 J → J/input = 6.0 + - decode energy = 400 × 8 × 10 = 32_000 J → J/output_decode = 32.0 + - total energy = 80_000 J → cluster J/output = 80.0 + → cluster J/total ≈ 8.889 + Cluster-wide avg_power_w stays the weighted mean across all 16 GPUs. + The per-stage decode attribution is exposed as + `joules_per_output_token_decode` so the cluster-wide + `joules_per_output_token` stays comparable across topologies.""" base = 1_700_000_000.0 _write_nvidia_csv( tmp_path / "perf_samples_prefill_w0_pn0.csv", @@ -869,12 +876,19 @@ def test_run_disagg_emits_power_by_worker_and_per_stage_joules(tmp_path: Path): # Cluster-wide avg = (8*600 + 8*400) / 16 = 500W. assert patched["avg_power_w"] == pytest.approx(500.0) - # Per-stage J/token: prefill energy / input, decode energy / output. + # Cluster-wide joules (total_system_energy / token_count) — same math as + # single-node so the metric stays comparable across topologies. + assert patched["joules_per_output_token"] == pytest.approx(80_000 / 1000) # 80.0 + assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000) # ≈ 8.889 + + # Per-stage scalars (new): prefill_avg, decode_avg, J/input, J/output_decode. 
+ assert patched["prefill_avg_power_w"] == pytest.approx(600.0) + assert patched["decode_avg_power_w"] == pytest.approx(400.0) assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000) # 6.0 - assert patched["joules_per_output_token"] == pytest.approx(32_000 / 1000) # 32.0 - assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000) # ≈ 8.889 + assert patched["joules_per_output_token_decode"] == pytest.approx(32_000 / 1000) # 32.0 - workers = patched["power_by_worker"] + # workers[] (renamed from power_by_worker). + workers = patched["workers"] assert [w["role"] for w in workers] == ["prefill", "prefill", "decode"] assert [w["worker_idx"] for w in workers] == [0, 1, 0] # Decode_w0 collapsed across 2 hosts → 8 GPUs total. @@ -890,11 +904,13 @@ def test_run_disagg_emits_power_by_worker_and_per_stage_joules(tmp_path: Path): def test_run_disagg_excludes_frontend_from_per_stage_energy(tmp_path: Path): - """A frontend-only node's power must not contribute to J/input or J/output. + """A frontend-only node's power must not contribute to per-stage scalars. Frontend nodes don't run any backend worker — their (typically near-idle) GPU draw would skew per-stage attribution if counted. They still appear - in power_by_worker for observability.""" + in workers[] for observability, and they DO contribute to the cluster-wide + avg_power_w / joules_per_*_token totals (which describe the whole + deployment's energy).""" base = 1_700_000_000.0 # Prefill worker — 4 GPUs @ 600W → 24_000 J in 10s _write_nvidia_csv( @@ -906,7 +922,8 @@ def test_run_disagg_excludes_frontend_from_per_stage_energy(tmp_path: Path): tmp_path / "perf_samples_decode_w0_dn0.csv", [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(4)], ) - # Frontend node — would erroneously add 4_000 J if counted. + # Frontend node — would erroneously bleed into per-stage scalars if counted, + # but DOES count toward cluster avg/joules (it's still energy consumed). 
_write_nvidia_csv( tmp_path / "perf_samples_frontend_w0_head.csv", [(base + 1 + s, gpu, 100.0) for s in range(8) for gpu in range(4)], @@ -922,21 +939,32 @@ def test_run_disagg_excludes_frontend_from_per_stage_energy(tmp_path: Path): assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 patched = json.loads(agg.read_text()) - # J/input = 24_000 / 8000 = 3.0 (frontend excluded). + # Per-stage scalars (frontend excluded). + # J/input = 24_000 / 8000 = 3.0. assert patched["joules_per_input_token"] == pytest.approx(3.0) - # J/output = 16_000 / 1000 = 16.0 (frontend excluded). - assert patched["joules_per_output_token"] == pytest.approx(16.0) + # J/output_decode = 16_000 / 1000 = 16.0. + assert patched["joules_per_output_token_decode"] == pytest.approx(16.0) + assert patched["prefill_avg_power_w"] == pytest.approx(600.0) + assert patched["decode_avg_power_w"] == pytest.approx(400.0) + + # Cluster-wide J/output still uses TOTAL energy (incl. frontend). + # total energy = (600+400+100) × 4 × 10 = 44_000 J → 44.0 J/output_tok. + assert patched["joules_per_output_token"] == pytest.approx(44.0) + # Frontend still appears in the worker list for observability. - roles = [w["role"] for w in patched["power_by_worker"]] + roles = [w["role"] for w in patched["workers"]] assert "frontend" in roles -def test_run_non_disagg_omits_joules_per_input_token(tmp_path: Path): +def test_run_non_disagg_omits_per_stage_scalars(tmp_path: Path): """Non-disagg runs (single-node or multinode-agg) keep the legacy schema. - No joules_per_input_token field — it'd be meaningless without a prefill - stage to attribute energy to. Existing fields must keep their pre-disagg - semantics (total_system_energy / token_count).""" + No per-stage scalars (prefill_avg_power_w / decode_avg_power_w / + joules_per_input_token / joules_per_output_token_decode) and no workers[] + field — all of those need disagg + role-labeled CSVs to be meaningful. 
+ + Existing fields must keep their pre-disagg semantics + (total_system_energy / token_count).""" base = 1_700_000_000.0 csv = tmp_path / "gpu_metrics.csv" _write_nvidia_csv( @@ -951,8 +979,15 @@ def test_run_non_disagg_omits_joules_per_input_token(tmp_path: Path): assert run(csv, bench, agg, disagg=False) == 0 patched = json.loads(agg.read_text()) - assert "joules_per_input_token" not in patched - assert "power_by_worker" not in patched + for absent in ( + "joules_per_input_token", + "joules_per_output_token_decode", + "prefill_avg_power_w", + "decode_avg_power_w", + "workers", + "power_by_worker", # the old name must NOT leak through either + ): + assert absent not in patched, f"unexpected key {absent} in non-disagg output" # Legacy semantics: total energy / token count. assert patched["joules_per_output_token"] == pytest.approx(2.0) assert patched["joules_per_total_token"] == pytest.approx(2.0) @@ -960,8 +995,8 @@ def test_run_non_disagg_omits_joules_per_input_token(tmp_path: Path): def test_run_disagg_falls_back_to_cluster_when_only_one_stage_present(tmp_path: Path): """If only prefill or only decode CSVs survived, per-stage attribution - isn't possible — must fall back to cluster-wide ratios so the run still - publishes something useful instead of dropping the field entirely.""" + isn't possible — the per-stage scalars are omitted but cluster-wide ratios + are still published so the run isn't telemetry-blank.""" base = 1_700_000_000.0 # Only prefill CSVs — decode is missing entirely. _write_nvidia_csv( @@ -977,17 +1012,24 @@ def test_run_disagg_falls_back_to_cluster_when_only_one_stage_present(tmp_path: assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 patched = json.loads(agg.read_text()) - # power_by_worker still emitted (one prefill worker). - assert len(patched["power_by_worker"]) == 1 - # J/input absent (no per-stage attribution possible). 
- assert "joules_per_input_token" not in patched - # J/output falls back to cluster-wide (total_energy / output_tokens). + # workers[] still emitted (one prefill worker, useful for observability). + assert len(patched["workers"]) == 1 + # Per-stage scalars absent (no decode stage to attribute to). + for absent in ( + "joules_per_input_token", + "joules_per_output_token_decode", + "prefill_avg_power_w", + "decode_avg_power_w", + ): + assert absent not in patched, f"unexpected per-stage key {absent}" + # Cluster-wide J/output still emitted (total_energy / output_tokens). assert patched["joules_per_output_token"] == pytest.approx(24_000 / 1000) def test_run_disagg_handles_zero_input_tokens(tmp_path: Path): """total_input_tokens=0 (rare degenerate case) → joules_per_input_token - omitted, no ZeroDivisionError.""" + omitted, no ZeroDivisionError. Per-stage decode + per-stage power scalars + still emitted (those don't depend on input tokens).""" base = 1_700_000_000.0 _write_nvidia_csv( tmp_path / "perf_samples_prefill_w0_pn0.csv", @@ -1007,10 +1049,15 @@ def test_run_disagg_handles_zero_input_tokens(tmp_path: Path): assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 patched = json.loads(agg.read_text()) assert "joules_per_input_token" not in patched - assert patched["joules_per_output_token"] == pytest.approx(16_000 / 1000) + # Per-stage decode still works — depends only on decode_energy / output. + assert patched["joules_per_output_token_decode"] == pytest.approx(16_000 / 1000) + assert patched["prefill_avg_power_w"] == pytest.approx(600.0) + assert patched["decode_avg_power_w"] == pytest.approx(400.0) + # Cluster-wide J/output uses TOTAL energy. (600+400) × 4 × 10 = 40_000 J. 
+ assert patched["joules_per_output_token"] == pytest.approx(40_000 / 1000) -def test_patch_agg_result_with_per_worker_and_per_stage(tmp_path: Path): +def test_patch_agg_result_with_workers_and_per_stage(tmp_path: Path): """patch_agg_result emits the new optional fields when supplied.""" agg = tmp_path / "agg.json" agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") @@ -1021,15 +1068,24 @@ def test_patch_agg_result_with_per_worker_and_per_stage(tmp_path: Path): patch_agg_result( agg, avg_power_w=500.0, - joules_per_output_token=16.0, + joules_per_output_token=40.0, joules_per_total_token=4.44, joules_per_input_token=3.0, - power_by_worker=workers, + joules_per_output_token_decode=16.0, + prefill_avg_power_w=600.0, + decode_avg_power_w=400.0, + workers=workers, ) data = json.loads(agg.read_text()) assert data["avg_power_w"] == 500.0 + assert data["joules_per_output_token"] == 40.0 assert data["joules_per_input_token"] == 3.0 - assert data["power_by_worker"] == workers + assert data["joules_per_output_token_decode"] == 16.0 + assert data["prefill_avg_power_w"] == 600.0 + assert data["decode_avg_power_w"] == 400.0 + assert data["workers"] == workers + # power_by_worker (old name) must NOT appear. 
+ assert "power_by_worker" not in data def test_patch_agg_result_omits_optional_fields_when_none(tmp_path: Path): @@ -1043,5 +1099,550 @@ def test_patch_agg_result_omits_optional_fields_when_none(tmp_path: Path): joules_per_total_token=0.5, ) data = json.loads(agg.read_text()) - assert "joules_per_input_token" not in data - assert "power_by_worker" not in data + for absent in ( + "joules_per_input_token", + "joules_per_output_token_decode", + "prefill_avg_power_w", + "decode_avg_power_w", + "avg_temp_c", + "peak_temp_c", + "avg_util_pct", + "avg_mem_used_mb", + "workers", + "power_by_worker", + ): + assert absent not in data, f"unexpected key {absent} in minimal patch" + + +# --------------------------------------------------------------------------- # +# Telemetry: temperature, utilization, memory +# +# These extend aggregate_metrics()'s capability beyond power. Frontend already +# wires avg_temp_c / avg_util_pct / avg_mem_used_mb / peak_temp_c as scalar +# numerics (same convention as avg_power_w: per-GPU mean, unit-suffixed name). +# Power remains required for aggregation to fire; the others degrade gracefully. +# --------------------------------------------------------------------------- # + + +def _write_csv_with_metrics( + path: Path, + samples: list[tuple[float, int, dict[str, float]]], + *, + columns: tuple[str, ...] = ("power.draw [W]", "temperature.gpu", "utilization.gpu", "memory.used [MiB]"), + column_map: dict[str, str] | None = None, +) -> None: + """Write a CSV with arbitrary metric columns. + + samples: list of (epoch_seconds, gpu_index, {metric_key: value}). The + metric_key in the dict must match one of: 'power', 'temp', 'util', 'mem'. + The columns parameter is the literal CSV header for those metrics, in order. + column_map maps each metric_key → its header string in `columns` (default: assume + same order as ('power', 'temp', 'util', 'mem') for an NVIDIA-style header). 
+ """ + if column_map is None: + column_map = {"power": columns[0], "temp": columns[1], "util": columns[2], "mem": columns[3]} + header = "timestamp, index, " + ", ".join(columns) + lines = [header] + for ts, idx, vals in samples: + row = [_nvidia_ts(ts), str(idx)] + for col in columns: + metric_key = next((k for k, v in column_map.items() if v == col), None) + v = vals.get(metric_key) + if v is None: + row.append("[N/A]") + elif col == columns[0]: # power + row.append(f"{v:.2f} W") + elif "temp" in col.lower(): + row.append(f"{int(v)} C") + elif "util" in col.lower(): + row.append(f"{int(v)} %") + elif "mem" in col.lower(): + row.append(f"{int(v)} MiB") + else: + row.append(str(v)) + lines.append(", ".join(row)) + path.write_text("\n".join(lines) + "\n", encoding="utf-8") + + +def test_detect_all_columns_nvidia(): + """NVIDIA header has all four metrics — each maps to its canonical column.""" + header = ["timestamp", "index", "power.draw [W]", "temperature.gpu", + "utilization.gpu", "memory.used [MiB]"] + cols = _detect_all_columns(header) + assert cols["timestamp"] == "timestamp" + assert cols["gpu"] == "index" + assert cols["power"] == "power.draw [W]" + assert cols["temp"] == "temperature.gpu" + assert cols["util"] == "utilization.gpu" + assert cols["mem"] == "memory.used [MiB]" + + +def test_detect_all_columns_srt_slurm_style(): + """srt-slurm perfmon uses bare-name columns: power_w, temp_c, util_pct, mem_used_mb.""" + header = ["timestamp", "gpu", "power_w", "temp_c", "util_pct", "mem_used_mb"] + cols = _detect_all_columns(header) + assert cols["power"] == "power_w" + assert cols["temp"] == "temp_c" + assert cols["util"] == "util_pct" + assert cols["mem"] == "mem_used_mb" + + +def test_detect_all_columns_amd_style(): + """AMD amd-smi uses different conventions: socket_power, temperature.""" + header = ["timestamp", "gpu", "socket_power", "temperature"] + cols = _detect_all_columns(header) + assert cols["power"] == "socket_power" + assert cols["temp"] == 
"temperature" + # No util/mem in this header — gracefully None. + assert cols["util"] is None + assert cols["mem"] is None + + +def test_detect_all_columns_excludes_memory_total(): + """memory.total must not be picked as the memory column (we want USED memory).""" + header = ["timestamp", "index", "power.draw [W]", "memory.total [MiB]", "memory.used [MiB]"] + cols = _detect_all_columns(header) + assert cols["mem"] == "memory.used [MiB]" + + +def test_detect_all_columns_missing_optional_metrics(): + """Only power present — temp/util/mem all None.""" + header = ["timestamp", "index", "power.draw [W]"] + cols = _detect_all_columns(header) + assert cols["power"] == "power.draw [W]" + assert cols["temp"] is None + assert cols["util"] is None + assert cols["mem"] is None + + +def test_aggregate_metrics_returns_all_telemetry_single_node(tmp_path: Path): + """Cluster-wide aggregation captures power, temp, util, mem in one pass.""" + csv = tmp_path / "gpu_metrics.csv" + base = 1_700_000_000.0 + # 4 GPUs, 3 samples — uniform values per metric. + samples = [] + for s in range(3): + for gpu in range(4): + samples.append( + (base + s, gpu, {"power": 500.0, "temp": 70.0, "util": 95.0, "mem": 60000.0}) + ) + _write_csv_with_metrics(csv, samples) + result = aggregate_metrics(csv, base, base + 10) + assert result is not None + assert result["num_gpus"] == 4 + assert result["power"] == pytest.approx(500.0) + assert result["temp"] == pytest.approx(70.0) + assert result["util"] == pytest.approx(95.0) + assert result["mem"] == pytest.approx(60000.0) + assert result["peak_temp"] == pytest.approx(70.0) # uniform → peak == avg + + +def test_aggregate_metrics_peak_temp_is_max_not_mean(tmp_path: Path): + """peak_temp_c is the global max instantaneous reading, not a per-bucket mean. 
+ + Critical for thermal-headroom signals: a single GPU hitting 85C during the + run matters even if the cluster mean stays at 70C.""" + csv = tmp_path / "gpu_metrics.csv" + base = 1_700_000_000.0 + samples = [] + # 4 GPUs at 70C steadily, EXCEPT one GPU spikes to 85C in the middle sample. + for s in range(3): + for gpu in range(4): + temp = 85.0 if (s == 1 and gpu == 2) else 70.0 + samples.append((base + s, gpu, {"power": 500.0, "temp": temp})) + _write_csv_with_metrics( + csv, samples, + columns=("power.draw [W]", "temperature.gpu"), + column_map={"power": "power.draw [W]", "temp": "temperature.gpu"}, + ) + result = aggregate_metrics(csv, base, base + 10) + assert result is not None + # Mean is dominated by the 11 readings at 70 + 1 at 85 = (11*70 + 85)/12 ≈ 71.25. + assert result["temp"] == pytest.approx((11 * 70 + 85) / 12, abs=0.01) + # Peak is the raw max sample, not any averaged value. + assert result["peak_temp"] == pytest.approx(85.0) + + +def test_aggregate_metrics_missing_temp_column_omits_temp(tmp_path: Path): + """A CSV without a temp column → result dict has no 'temp' / 'peak_temp' keys. + + Graceful degradation: callers using .get() / 'temp' in result handle this + naturally.""" + csv = tmp_path / "gpu_metrics.csv" + base = 1_700_000_000.0 + # Header has ONLY power. + samples = [(base + s, gpu, {"power": 500.0}) for s in range(3) for gpu in range(4)] + _write_csv_with_metrics( + csv, samples, + columns=("power.draw [W]",), + column_map={"power": "power.draw [W]"}, + ) + result = aggregate_metrics(csv, base, base + 10) + assert result is not None + assert result["power"] == pytest.approx(500.0) + assert "temp" not in result + assert "peak_temp" not in result + assert "util" not in result + assert "mem" not in result + + +def test_aggregate_metrics_missing_util_only_keeps_others(tmp_path: Path): + """Power + temp + mem present but no util column → util omitted, rest fine. 
+ + Mirrors the AMD case where amd-smi output may lack a utilization column.""" + csv = tmp_path / "gpu_metrics.csv" + base = 1_700_000_000.0 + samples = [ + (base + s, gpu, {"power": 500.0, "temp": 70.0, "mem": 60000.0}) + for s in range(3) for gpu in range(4) + ] + _write_csv_with_metrics( + csv, samples, + columns=("power.draw [W]", "temperature.gpu", "memory.used [MiB]"), + column_map={"power": "power.draw [W]", "temp": "temperature.gpu", "mem": "memory.used [MiB]"}, + ) + result = aggregate_metrics(csv, base, base + 10) + assert result is not None + assert "util" not in result + assert result["temp"] == pytest.approx(70.0) + assert result["mem"] == pytest.approx(60000.0) + + +def test_aggregate_metrics_multinode_aggregates_across_csvs(tmp_path: Path): + """Multinode telemetry rolls up across per-node CSVs same as power. + + Per-GPU mean is weighted by the (per-sample, per-namespace) GPU count.""" + base = 1_700_000_000.0 + node1 = tmp_path / "perf_samples_node1.csv" + node2 = tmp_path / "perf_samples_node2.csv" + _write_csv_with_metrics( + node1, + [(base + s, gpu, {"power": 600.0, "temp": 75.0, "util": 95.0, "mem": 60000.0}) + for s in range(3) for gpu in range(4)], + ) + _write_csv_with_metrics( + node2, + [(base + s, gpu, {"power": 400.0, "temp": 65.0, "util": 85.0, "mem": 40000.0}) + for s in range(3) for gpu in range(4)], + ) + result = aggregate_metrics([node1, node2], base, base + 10) + assert result is not None + assert result["num_gpus"] == 8 + # All metrics are weighted means across the 8 distinct GPUs. 
+ assert result["power"] == pytest.approx(500.0) # (600+400)/2 + assert result["temp"] == pytest.approx(70.0) # (75+65)/2 + assert result["util"] == pytest.approx(90.0) + assert result["mem"] == pytest.approx(50000.0) + assert result["peak_temp"] == pytest.approx(75.0) + + +def test_run_patches_cluster_wide_temp_util_mem(tmp_path: Path): + """End-to-end: run() patches cluster-wide telemetry into the agg JSON + when the CSV exposes the corresponding columns.""" + base = 1_700_000_000.0 + csv = tmp_path / "gpu_metrics.csv" + samples = [ + (base + 1 + s, gpu, {"power": 500.0, "temp": 70.0, "util": 95.0, "mem": 60000.0}) + for s in range(2) for gpu in range(8) + ] + _write_csv_with_metrics(csv, samples) + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result(bench, start=base, end=base + 10, duration=10.0, total_output=20_000) + agg.write_text(json.dumps({"hw": "h200"}), encoding="utf-8") + + assert run(csv, bench, agg) == 0 + patched = json.loads(agg.read_text()) + # Power baseline still works. + assert patched["avg_power_w"] == pytest.approx(500.0) + # New cluster-wide scalars present and rounded to 3 decimals. + assert patched["avg_temp_c"] == pytest.approx(70.0) + assert patched["peak_temp_c"] == pytest.approx(70.0) + assert patched["avg_util_pct"] == pytest.approx(95.0) + assert patched["avg_mem_used_mb"] == pytest.approx(60000.0) + + +def test_run_omits_cluster_telemetry_when_csv_has_no_extra_columns(tmp_path: Path): + """Power-only CSV → only avg_power_w + joules_per_*_token are emitted. + + Backward compat with old CSVs / older monitoring setups that only captured + power. The agg JSON must not gain spurious null/zero values for the + metrics the CSV didn't carry.""" + base = 1_700_000_000.0 + csv = tmp_path / "gpu_metrics.csv" + # Old NVIDIA CSV without temp/util/mem — the _write_nvidia_csv helper + # already includes temperature though. So use the metric helper with only power. 
+ samples = [(base + 1 + s, gpu, {"power": 500.0}) for s in range(2) for gpu in range(8)] + _write_csv_with_metrics( + csv, samples, + columns=("power.draw [W]",), + column_map={"power": "power.draw [W]"}, + ) + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result(bench, start=base, end=base + 10, duration=10.0, total_output=20_000) + agg.write_text(json.dumps({"hw": "h200"}), encoding="utf-8") + + assert run(csv, bench, agg) == 0 + patched = json.loads(agg.read_text()) + assert patched["avg_power_w"] == pytest.approx(500.0) + for absent in ("avg_temp_c", "peak_temp_c", "avg_util_pct", "avg_mem_used_mb"): + assert absent not in patched, f"unexpected {absent} when CSV lacks that column" + + +def test_run_disagg_emits_per_worker_temp_util_mem(tmp_path: Path): + """Disagg multinode: each entry in workers[] carries per-worker telemetry + in addition to avg_power_w. Frontend can render thermal/util breakdown + by worker role.""" + base = 1_700_000_000.0 + # Prefill worker runs hotter (compute-bound) than decode (memory-bound). + _write_csv_with_metrics( + tmp_path / "perf_samples_prefill_w0_pn0.csv", + [(base + 1 + s, gpu, {"power": 600.0, "temp": 80.0, "util": 98.0, "mem": 50000.0}) + for s in range(8) for gpu in range(4)], + ) + _write_csv_with_metrics( + tmp_path / "perf_samples_decode_w0_dn0.csv", + [(base + 1 + s, gpu, {"power": 400.0, "temp": 65.0, "util": 70.0, "mem": 70000.0}) + for s in range(8) for gpu in range(4)], + ) + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result( + bench, start=base, end=base + 10, duration=10.0, + total_output=1000, total_input=8000, + ) + agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") + + assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 + patched = json.loads(agg.read_text()) + + # Cluster-wide telemetry: weighted mean across all 8 GPUs. 
+ assert patched["avg_temp_c"] == pytest.approx(72.5) # (80+65)/2 + assert patched["peak_temp_c"] == pytest.approx(80.0) + assert patched["avg_util_pct"] == pytest.approx(84.0) # (98+70)/2 + assert patched["avg_mem_used_mb"] == pytest.approx(60000.0) + + workers = patched["workers"] + prefill = next(w for w in workers if w["role"] == "prefill") + decode = next(w for w in workers if w["role"] == "decode") + # Per-worker fields present alongside avg_power_w. + assert prefill["avg_temp_c"] == pytest.approx(80.0) + assert prefill["peak_temp_c"] == pytest.approx(80.0) + assert prefill["avg_util_pct"] == pytest.approx(98.0) + assert prefill["avg_mem_used_mb"] == pytest.approx(50000.0) + assert decode["avg_temp_c"] == pytest.approx(65.0) + assert decode["avg_util_pct"] == pytest.approx(70.0) + assert decode["avg_mem_used_mb"] == pytest.approx(70000.0) + + +def test_run_per_worker_omits_missing_telemetry_columns(tmp_path: Path): + """If a worker's CSV lacks a temp/util/mem column, those keys are + omitted from that worker's entry — no nulls leak through.""" + base = 1_700_000_000.0 + # Prefill: full schema (power + temp + util + mem). + _write_csv_with_metrics( + tmp_path / "perf_samples_prefill_w0_pn0.csv", + [(base + 1 + s, gpu, {"power": 600.0, "temp": 80.0, "util": 98.0, "mem": 50000.0}) + for s in range(8) for gpu in range(4)], + ) + # Decode: power only — no other columns at all in its CSV. 
+ _write_csv_with_metrics( + tmp_path / "perf_samples_decode_w0_dn0.csv", + [(base + 1 + s, gpu, {"power": 400.0}) for s in range(8) for gpu in range(4)], + columns=("power.draw [W]",), + column_map={"power": "power.draw [W]"}, + ) + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result( + bench, start=base, end=base + 10, duration=10.0, + total_output=1000, total_input=8000, + ) + agg.write_text(json.dumps({"hw": "gb300"}), encoding="utf-8") + + assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 + patched = json.loads(agg.read_text()) + workers = patched["workers"] + decode = next(w for w in workers if w["role"] == "decode") + # Decode worker has avg_power_w but none of the optional telemetry fields. + assert decode["avg_power_w"] == pytest.approx(400.0) + for absent in ("avg_temp_c", "peak_temp_c", "avg_util_pct", "avg_mem_used_mb"): + assert absent not in decode, f"unexpected {absent} on power-only decode worker" + # Prefill still has all of them. + prefill = next(w for w in workers if w["role"] == "prefill") + assert "avg_temp_c" in prefill + assert "avg_util_pct" in prefill + assert "avg_mem_used_mb" in prefill + + +# --------------------------------------------------------------------------- # +# AMD multi-node disaggregated inference (mi355x) +# +# The AMD path has no srt-slurm orchestrator: each SGLang/vLLM disagg node +# starts its own amd-smi monitor via start_perf_monitor (benchmarks/ +# benchmark_lib.sh), writing perf_samples_{role}_w{idx}_{host}.csv in the SAME +# convention as the NVIDIA perfmon. These tests lock in that the (vendor- +# agnostic) aggregation produces the full per-worker / per-stage schema when +# fed amd-smi CSVs — ISO timestamps, bare-numeric power, "gpu"/"socket_power" +# columns — over realistic MI355X (8 GPUs/node) disagg topologies and AMD +# cluster hostnames. The NVIDIA-CSV tests above already cover the math; these +# guard the AMD CSV format + filename round-trip end to end. 
+# --------------------------------------------------------------------------- # + + +def test_parse_perfmon_label_amd_hostname(): + """AMD mi355x cluster hostnames (e.g. mia1-p01-g09) round-trip cleanly. + + start_perf_monitor builds the filename from `hostname -s` sanitized with + `tr -c 'A-Za-z0-9.-' '_'`; AMD short hostnames are already alnum+dash, so + the host segment survives intact through _parse_perfmon_label.""" + assert _parse_perfmon_label( + Path("perf_samples_prefill_w0_mia1-p01-g09.csv") + ) == ("prefill", 0, "mia1-p01-g09") + assert _parse_perfmon_label( + Path("perf_samples_decode_w2_smci355-ccs-aus-12.csv") + ) == ("decode", 2, "smci355-ccs-aus-12") + + +def test_aggregate_power_by_worker_amd_one_csv_per_worker(tmp_path: Path): + """AMD amd-smi CSVs, one prefill + one decode worker, 8 GPUs/node (MI355X). + + Same grouping logic as the NVIDIA case, but proves the amd-smi CSV schema + (ISO timestamp, bare power, 'gpu' index col) parses through the per-worker + rollup.""" + base = 1_700_000_000.0 + _write_amd_csv( + tmp_path / "perf_samples_prefill_w0_mia1-p01-g01.csv", + [(base + s, gpu, 600.0) for s in range(3) for gpu in range(8)], + ) + _write_amd_csv( + tmp_path / "perf_samples_decode_w0_mia1-p01-g02.csv", + [(base + s, gpu, 400.0) for s in range(3) for gpu in range(8)], + ) + + workers = aggregate_power_by_worker( + list(tmp_path.glob("perf_samples_*.csv")), base, base + 10 + ) + assert workers is not None + assert [w["role"] for w in workers] == ["prefill", "decode"] + assert [w["worker_idx"] for w in workers] == [0, 0] + assert workers[0]["num_gpus"] == 8 + assert workers[0]["avg_power_w"] == pytest.approx(600.0) + assert workers[0]["hosts"] == ["mia1-p01-g01"] + assert workers[1]["num_gpus"] == 8 + assert workers[1]["avg_power_w"] == pytest.approx(400.0) + + +def test_aggregate_power_by_worker_amd_worker_spans_multiple_nodes(tmp_path: Path): + """A single decode worker spanning 2 MI355X nodes (DECODE_TP_SIZE=16). 
+ + Both node-CSVs share (decode, w0); amd-smi reports local indices 0..7 on + each, so without per-source namespacing the union would collapse to 8 + instead of 16. Mirrors the SGLang DECODE_NODES_PER_WORKER>1 topology.""" + base = 1_700_000_000.0 + hosts = ["mia1-p01-g05", "mia1-p01-g06"] + for h in hosts: + _write_amd_csv( + tmp_path / f"perf_samples_decode_w0_{h}.csv", + [(base + s, gpu, 400.0) for s in range(3) for gpu in range(8)], + ) + + workers = aggregate_power_by_worker( + list(tmp_path.glob("perf_samples_*.csv")), base, base + 10 + ) + assert workers is not None + assert len(workers) == 1 + w = workers[0] + assert w["role"] == "decode" + assert w["worker_idx"] == 0 + assert w["num_gpus"] == 16 # 2 nodes × 8 GPUs + assert w["avg_power_w"] == pytest.approx(400.0) + assert w["hosts"] == sorted(hosts) + + +def test_run_disagg_amd_emits_workers_and_per_stage_joules(tmp_path: Path): + """Full AMD mi355x disagg pipeline end to end with amd-smi CSVs. + + Topology: 1 prefill worker × 8 GPUs @ 600W, 1 decode worker × 8 GPUs @ 400W. 
+ Over a 10s window with 8000 input + 1000 output tokens: + - prefill energy = 600 × 8 × 10 = 48_000 J → J/input = 6.0 + - decode energy = 400 × 8 × 10 = 32_000 J → J/output_decode = 32.0 + - total energy = 80_000 J → cluster J/output = 80.0 + - cluster avg = (8×600 + 8×400)/16 = 500W + This is the AMD analogue of test_run_disagg_emits_workers_and_per_stage_joules.""" + base = 1_700_000_000.0 + _write_amd_csv( + tmp_path / "perf_samples_prefill_w0_mia1-p01-g01.csv", + [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(8)], + ) + _write_amd_csv( + tmp_path / "perf_samples_decode_w0_mia1-p01-g02.csv", + [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(8)], + ) + + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result( + bench, start=base, end=base + 10, duration=10.0, + total_output=1000, total_input=8000, + ) + agg.write_text(json.dumps({"hw": "mi355x", "disagg": True}), encoding="utf-8") + + assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 + patched = json.loads(agg.read_text()) + + # Cluster-wide (vendor-agnostic, same math as single-node / NVIDIA). + assert patched["avg_power_w"] == pytest.approx(500.0) + assert patched["joules_per_output_token"] == pytest.approx(80_000 / 1000) # 80.0 + assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000) # ≈ 8.889 + + # Per-stage scalars from amd-smi CSVs. + assert patched["prefill_avg_power_w"] == pytest.approx(600.0) + assert patched["decode_avg_power_w"] == pytest.approx(400.0) + assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000) # 6.0 + assert patched["joules_per_output_token_decode"] == pytest.approx(32_000 / 1000) # 32.0 + + # workers[] breakdown. 
+ workers = patched["workers"] + assert [w["role"] for w in workers] == ["prefill", "decode"] + assert all(w["num_gpus"] == 8 for w in workers) + + +def test_run_disagg_amd_vllm_topology_one_worker_per_node(tmp_path: Path): + """vLLM AMD topology: xP=2 prefill + yD=2 decode, one worker per node. + + server_vllm.sh labels ranks [0,xP) prefill (w=rank) and [xP, xP+yD) decode + (w=rank-xP). Four amd-smi CSVs, distinct worker indices per stage.""" + base = 1_700_000_000.0 + for w in range(2): + _write_amd_csv( + tmp_path / f"perf_samples_prefill_w{w}_mia1-p02-g0{w}.csv", + [(base + 1 + s, gpu, 600.0) for s in range(8) for gpu in range(8)], + ) + for w in range(2): + _write_amd_csv( + tmp_path / f"perf_samples_decode_w{w}_mia1-p02-g1{w}.csv", + [(base + 1 + s, gpu, 400.0) for s in range(8) for gpu in range(8)], + ) + + bench = tmp_path / "bench.json" + agg = tmp_path / "agg.json" + _write_bench_result( + bench, start=base, end=base + 10, duration=10.0, + total_output=1000, total_input=8000, + ) + agg.write_text(json.dumps({"hw": "mi355x", "disagg": True}), encoding="utf-8") + + assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 + patched = json.loads(agg.read_text()) + + workers = patched["workers"] + assert [w["role"] for w in workers] == ["prefill", "prefill", "decode", "decode"] + assert [w["worker_idx"] for w in workers] == [0, 1, 0, 1] + # 2 prefill workers × 8 GPUs @ 600W → 96_000 J / 8000 input = 12.0. + assert patched["joules_per_input_token"] == pytest.approx(96_000 / 8000) + # 2 decode workers × 8 GPUs @ 400W → 64_000 J / 1000 output = 64.0. 
+ assert patched["joules_per_output_token_decode"] == pytest.approx(64_000 / 1000) + assert patched["prefill_avg_power_w"] == pytest.approx(600.0) + assert patched["decode_avg_power_w"] == pytest.approx(400.0) diff --git a/utils/test_process_result.py b/utils/test_process_result.py index 6b3fc9a94..ad931591b 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -755,14 +755,14 @@ def test_multinode_csv_glob_empty_match_falls_through_silently(self, tmp_path, s patched = json.loads(agg_path.read_text()) assert "avg_power_w" not in patched - def test_disagg_multinode_emits_per_worker_and_per_stage_joules(self, tmp_path, multinode_env_vars): + def test_disagg_multinode_emits_workers_and_per_stage_joules(self, tmp_path, multinode_env_vars): """End-to-end disagg wiring: DISAGG=true + per-node labeled CSVs → process_result.py passes disagg through to aggregate_power, which emits - power_by_worker + joules_per_input_token using per-stage attribution. + workers[] + per-stage scalars alongside the cluster-wide joules. - Without the disagg=disagg propagation in process_result.py, the run - would silently fall back to cluster-wide joules math and the user-facing - per-stage J/input metric would be missing.""" + Without the disagg=disagg propagation in process_result.py, the + per-stage scalars (joules_per_input_token, joules_per_output_token_decode, + prefill_avg_power_w, decode_avg_power_w) would be missing.""" start, end = 1_700_000_100.0, 1_700_000_160.0 # 60s bench window # 1 prefill worker × 4 GPUs @ 600W on its own node self._write_nvidia_csv( @@ -796,21 +796,28 @@ def test_disagg_multinode_emits_per_worker_and_per_stage_joules(self, tmp_path, patched = json.loads((tmp_path / "agg_benchmark_result.json").read_text()) - # Per-stage attribution: prefill_energy / input, decode_energy / output. + # Per-stage attribution scalars: prefill_energy / input, decode_energy / output. # Prefill: 600 × 4 × 60 = 144_000 J → / 240_000 = 0.6 J/input_tok. 
- # Decode: 400 × 4 × 60 = 96_000 J → / 30_000 = 3.2 J/output_tok. + # Decode: 400 × 4 × 60 = 96_000 J → / 30_000 = 3.2 J/output_tok_decode. assert patched["joules_per_input_token"] == pytest.approx(0.6, abs=0.01) - assert patched["joules_per_output_token"] == pytest.approx(3.2, abs=0.01) + assert patched["joules_per_output_token_decode"] == pytest.approx(3.2, abs=0.01) + assert patched["prefill_avg_power_w"] == pytest.approx(600.0, abs=0.5) + assert patched["decode_avg_power_w"] == pytest.approx(400.0, abs=0.5) + + # Cluster-wide J/output (frontend would be incl. here too if present). + # Total energy = (600+400) × 4 × 60 = 240_000 J → / 30_000 = 8.0 J/output_tok. + assert patched["joules_per_output_token"] == pytest.approx(8.0, abs=0.05) # Per-worker breakdown labeled with role. - workers = patched["power_by_worker"] + workers = patched["workers"] assert {w["role"] for w in workers} == {"prefill", "decode"} for w in workers: assert w["num_gpus"] == 4 assert w["worker_idx"] == 0 def test_non_disagg_multinode_keeps_cluster_wide_joules_math(self, tmp_path, multinode_env_vars): - """Multinode but DISAGG=false → keep cluster-wide ratios, no J/input. + """Multinode but DISAGG=false → keep cluster-wide ratios, no per-stage + scalars. 
Sanity check that the disagg flag is the gate, not just multinode-ness.""" start, end = 1_700_000_100.0, 1_700_000_160.0 @@ -841,7 +848,13 @@ def test_non_disagg_multinode_keeps_cluster_wide_joules_math(self, tmp_path, mul assert result.returncode == 0, f"Script failed: {result.stderr}" patched = json.loads((tmp_path / "agg_benchmark_result.json").read_text()) - assert "joules_per_input_token" not in patched - # power_by_worker still emitted (filename labels exist) — useful for + for absent in ( + "joules_per_input_token", + "joules_per_output_token_decode", + "prefill_avg_power_w", + "decode_avg_power_w", + ): + assert absent not in patched, f"unexpected per-stage key {absent}" + # workers[] still emitted (filename labels exist) — useful for # observability even on non-disagg runs. - assert patched["power_by_worker"][0]["role"] == "agg" + assert patched["workers"][0]["role"] == "agg" From f407f4b4d653a517db91da5c3607a5016dfae7d5 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 13:30:59 -0700 Subject: [PATCH 09/14] feat(power): AMD multi-node measured-power telemetry (mi355x disagg) Mirror the NVIDIA gb300/srt-slurm measured-power path on the AMD multi-node disaggregated inference path. With no orchestrator perfmon, each SGLang/vLLM disagg node starts its own amd-smi monitor via start_perf_monitor (benchmark_lib.sh), writing perf_samples_<role>_w<idx>_<host>.csv into the NFS-shared /benchmark_logs/perfmon mount; launch_mi355x-amds.sh collects them and exports GPU_METRICS_CSV_GLOB so the existing vendor-agnostic utils/aggregate_power.py produces per-worker + per-stage power.
AMD perfmon wiring: - benchmark_lib.sh: start_perf_monitor helper; case-insensitive amd-smi header filter; log captured CSV header for schema-mismatch visibility - amd_utils/job.slurm: PERFMON_OUTPUT_DIR + interval into each container - amd_utils/server_sglang.sh / server_vllm.sh: per-node role + worker-idx classification (matches each engine's own placement); monitor start + stop on every exit path - runners/launch_mi355x-amds.sh: collect per-node CSVs immediately after job completion (before result-processing early-exits / EXIT-trap wipe), export GPU_METRICS_CSV_GLOB - utils/aggregate_power.py: docstring documents the AMD source (logic already vendor-agnostic) - utils/test_aggregate_power.py: AMD amd-smi multinode tests (per-worker, per-stage J/token, multi-node-per-worker collapse, vLLM topology) - perf-changelog.yaml: trigger the 6 mi355x disagg sweeps (sglang+vllm) Also lands the concurrent per-metric telemetry extension in aggregate_power.py / tests: temp/util/mem aggregation, workers[] schema, and flat per-stage scalars (prefill_avg_power_w, decode_avg_power_w, joules_per_input_token, joules_per_output_token_decode). Verified locally: 107 utils tests pass; bash syntax + shellcheck clean; role mapping + filename contract + full amd-smi->agg pipeline validated; adversarial review findings addressed (CSV collection moved ahead of early exits; case-insensitive amd-smi header). Co-Authored-By: Claude Opus 4.8 --- benchmarks/benchmark_lib.sh | 78 ++++++++++++++++++- benchmarks/multi_node/amd_utils/job.slurm | 12 +++ .../multi_node/amd_utils/server_sglang.sh | 28 +++++++ .../multi_node/amd_utils/server_vllm.sh | 26 +++++++ runners/launch_mi355x-amds.sh | 38 +++++++++ 5 files changed, 179 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 7dbbaaaa8..747b445c0 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -41,10 +41,18 @@ start_gpu_monitor() { GPU_MONITOR_PID=$! 
echo "[GPU Monitor] Started NVIDIA (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" elif command -v amd-smi &>/dev/null; then - # Use amd-smi native watch mode (-w) which includes timestamps automatically. - # Pipe through awk to: skip preamble lines, keep first CSV header, skip repeated headers. + # amd-smi metric flags: -p power, -c clocks, -t temperature, -u usage, + # -w native watch mode (emits a timestamp column per sample), + # --csv. The awk filter keeps the first CSV header line and drops + # amd-smi's preamble / repeated headers. Header match is case-insensitive + # (tolower) so a capitalized "Timestamp," header — should amd-smi ever + # emit one — still passes through; aggregate_power's column detection is + # case-insensitive too. NOTE: amd-smi timestamps are node-local wall + # clock, so multinode aggregation assumes cluster clocks are NTP-synced + # (same assumption as nvidia-smi; aggregate_power windows by absolute + # epoch from benchmark_serving.py). amd-smi metric -p -c -t -u -w "$interval" --csv 2>/dev/null \ - | awk '/^timestamp,/{if(!h){print;h=1};next} h{print}' > "$output" & + | awk 'tolower($0) ~ /^timestamp,/{if(!h){print;h=1};next} h{print}' > "$output" & GPU_MONITOR_PID=$! echo "[GPU Monitor] Started AMD (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" else @@ -63,11 +71,75 @@ stop_gpu_monitor() { local lines lines=$(wc -l < "$GPU_METRICS_CSV") echo "[GPU Monitor] Collected $lines rows -> $GPU_METRICS_CSV" + # Echo the captured header so a vendor-SMI schema mismatch (the one + # thing that silently yields 0 usable power samples downstream) is + # visible in CI logs without re-running on hardware. + echo "[GPU Monitor] CSV header: $(head -1 "$GPU_METRICS_CSV" 2>/dev/null)" fi fi GPU_MONITOR_PID="" } +# Start a per-node GPU power monitor for multi-node disaggregated runs. 
+# +# This is the AMD/SGLang/vLLM analogue of NVIDIA srt-slurm's per-node perfmon +# (PR #35): there is no orchestrator to spawn nvidia-smi on each node, so each +# node starts its own amd-smi/nvidia-smi monitor here. The output filename +# encodes the worker role and index in exactly the format +# utils/aggregate_power.py's _parse_perfmon_label expects: +# +# perf_samples_<role>_w<idx>_<host>.csv +# +# so the downstream aggregation can attribute energy per worker and (for disagg) +# per stage. role must be one of: prefill, decode, agg, frontend. +# +# Output goes to $PERFMON_OUTPUT_DIR, which job.slurm points at the NFS-shared +# /benchmark_logs/perfmon mount so every node's CSV lands in one directory the +# runner can collect. The monitor runs for the whole server lifetime; +# aggregate_power.py windows the samples down to each concurrency's benchmark +# load window using the timestamps benchmark_serving.py writes. +# +# Best-effort by design: an unset output dir, an unknown role, or a missing +# amd-smi/nvidia-smi is a no-op that returns 0 — a monitoring hiccup must never +# fail the benchmark. +# +# Usage: start_perf_monitor <role> <worker_idx> [interval_seconds] +start_perf_monitor() { + local role="$1" + local worker_idx="$2" + local interval="${3:-${PERFMON_SAMPLE_INTERVAL:-1}}" + + local out_dir="${PERFMON_OUTPUT_DIR:-}" + if [[ -z "$out_dir" ]]; then + echo "[perfmon] PERFMON_OUTPUT_DIR unset — skipping per-node power monitor" + return 0 + fi + case "$role" in + prefill|decode|agg|frontend) ;; + *) + echo "[perfmon] unknown role '$role' (expected prefill|decode|agg|frontend) — skipping monitor" + return 0 + ;; + esac + if ! mkdir -p "$out_dir" 2>/dev/null; then + echo "[perfmon] cannot create $out_dir — skipping per-node power monitor" + return 0 + fi + + # Sanitize the host component so the filename stays parseable by + # aggregate_power's regex (role/idx anchors are unambiguous, but keep the + # host free of separators that could confuse a future tightening).
Prefer + # the short hostname; fall back to the FQDN. + local host + host=$(hostname -s 2>/dev/null || hostname) + host=$(printf '%s' "$host" | tr -c 'A-Za-z0-9.-' '_') + + local out="${out_dir}/perf_samples_${role}_w${worker_idx}_${host}.csv" + echo "[perfmon] starting per-node power monitor: role=$role worker=$worker_idx host=$host interval=${interval}s -> $out" + start_gpu_monitor --output "$out" --interval "$interval" + return 0 +} + # Check if required environment variables are set # Usage: check_env_vars VAR1 VAR2 VAR3 ... # Exits with code 1 if any variable is not set diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm index 5e8e67606..102953eb8 100755 --- a/benchmarks/multi_node/amd_utils/job.slurm +++ b/benchmarks/multi_node/amd_utils/job.slurm @@ -298,6 +298,16 @@ export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}" export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}" export ENGINE=$ENGINE +# Per-node measured-power monitoring. Each node's server script starts an +# amd-smi/nvidia-smi monitor (start_perf_monitor in benchmark_lib.sh) that +# writes perf_samples_<role>_w<idx>_<host>.csv into PERFMON_OUTPUT_DIR. That +# dir is the /benchmark_logs/perfmon mount, which maps to BENCHMARK_LOGS_DIR +# on the (NFS-shared) host so every node's CSV lands in one place the runner +# can collect. Pre-create it on the host so the directory exists before any +# container writes to it.
+export PERFMON_SAMPLE_INTERVAL="${PERFMON_SAMPLE_INTERVAL:-1}" +mkdir -p "${BENCHMARK_LOGS_DIR}/perfmon" 2>/dev/null || true + # Eval-related env vars (threaded from submit.sh) export RUN_EVAL="${RUN_EVAL:-false}" export EVAL_ONLY="${EVAL_ONLY:-false}" @@ -375,6 +385,8 @@ DOCKER_ENV_COMMON=( -e RUNNER_TYPE=\$RUNNER_TYPE -e RESULT_FILENAME=\$RESULT_FILENAME -e SPEC_DECODING=\$SPEC_DECODING + -e PERFMON_OUTPUT_DIR=/benchmark_logs/perfmon + -e PERFMON_SAMPLE_INTERVAL=\$PERFMON_SAMPLE_INTERVAL -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh index c28ccab41..1c1be4e47 100755 --- a/benchmarks/multi_node/amd_utils/server_sglang.sh +++ b/benchmarks/multi_node/amd_utils/server_sglang.sh @@ -48,6 +48,9 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}" # ============================================================================= source $SGLANG_WS_PATH/setup_deps.sh source $SGLANG_WS_PATH/env.sh +# Power-monitoring helpers (start_perf_monitor / stop_gpu_monitor). WS_PATH is +# .../benchmarks/multi_node/amd_utils, so the shared lib is two levels up. 
+source "$SGLANG_WS_PATH/../../benchmark_lib.sh" host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}') host_name=$(hostname) @@ -279,6 +282,27 @@ done echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}" echo "Decode worker headnode list: ${DECODE_HEADNODE_URLS[@]}" +# ============================================================================= +# Per-node measured-power monitor (best-effort) +# ============================================================================= +# Classify this node into the same worker buckets the role branches below use: +# NODE_RANK in [0, NODE_OFFSET) -> prefill, worker = NODE_RANK / PREFILL_NODES_PER_WORKER +# NODE_RANK >= NODE_OFFSET -> decode, worker = (NODE_RANK - NODE_OFFSET) / DECODE_NODES_PER_WORKER +# (NODE_OFFSET = PREFILL_NODES_PER_WORKER * xP.) Node 0 is the proxy too, but +# its GPUs run the prefill head, so labeling it prefill attributes its energy +# to the right stage. The monitor runs for the whole server lifetime; +# aggregate_power.py windows the samples down to each concurrency's load window. +if [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then + PERF_ROLE="prefill" + PERF_WORKER_IDX=$(( NODE_RANK / PREFILL_NODES_PER_WORKER )) +else + PERF_ROLE="decode" + PERF_WORKER_IDX=$(( (NODE_RANK - NODE_OFFSET) / DECODE_NODES_PER_WORKER )) +fi +if [[ "$DRY_RUN" -ne 1 ]]; then + start_perf_monitor "$PERF_ROLE" "$PERF_WORKER_IDX" +fi + # ============================================================================= # Configuration Builder Functions # ============================================================================= @@ -636,6 +660,7 @@ if [ "$NODE_RANK" -eq 0 ]; then if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then echo "ERROR: eval failed; exiting node-0 with rc=1" + stop_gpu_monitor exit 1 fi @@ -777,5 +802,8 @@ else fi +# Stop the per-node power monitor and flush its CSV before the container exits. 
+stop_gpu_monitor + echo "Script completed successfully" exit 0 diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh index d61fe0359..4e032dd3e 100755 --- a/benchmarks/multi_node/amd_utils/server_vllm.sh +++ b/benchmarks/multi_node/amd_utils/server_vllm.sh @@ -50,6 +50,9 @@ MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}" # Dependencies and Environment Setup # ============================================================================= source $WS_PATH/env.sh +# Power-monitoring helpers (start_perf_monitor / stop_gpu_monitor). WS_PATH is +# .../benchmarks/multi_node/amd_utils, so the shared lib is two levels up. +source "$WS_PATH/../../benchmark_lib.sh" host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}') # RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available) @@ -214,6 +217,25 @@ done echo "Prefill node IPs: ${PREFILL_ARGS}" echo "Decode node IPs: ${DECODE_ARGS}" +# ============================================================================= +# Per-node measured-power monitor (best-effort) +# ============================================================================= +# vLLM places one worker per node: ranks [0, xP) are prefill (kv_producer), +# ranks [xP, xP+yD) are decode (kv_consumer) — see the role branches below. +# Node 0 is the proxy too, but its GPUs run the first prefill worker, so it is +# correctly labeled prefill. The monitor runs for the whole server lifetime; +# aggregate_power.py windows the samples down to each concurrency's load window. 
+if [ "$NODE_RANK" -lt "$xP" ]; then + PERF_ROLE="prefill" + PERF_WORKER_IDX=$NODE_RANK +else + PERF_ROLE="decode" + PERF_WORKER_IDX=$(( NODE_RANK - xP )) +fi +if [[ "$DRY_RUN" -ne 1 ]]; then + start_perf_monitor "$PERF_ROLE" "$PERF_WORKER_IDX" +fi + # MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address) PROXY_PING_PORT="${PROXY_PING_PORT:-36367}" @@ -408,6 +430,7 @@ if [ "$NODE_RANK" -eq 0 ]; then if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then echo "ERROR: eval failed; exiting node-0 with rc=1" + stop_gpu_monitor exit 1 fi @@ -523,5 +546,8 @@ fi # kill $etcd_pid 2>/dev/null || true # pkill -f etcd 2>/dev/null || true +# Stop the per-node power monitor and flush its CSV before the container exits. +stop_gpu_monitor + echo "Script completed successfully" exit 0 diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 9b6cb96a9..b893efa84 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -117,6 +117,43 @@ if [[ "$IS_MULTINODE" == "true" ]]; then set -x + # ── Per-node measured-power CSVs ────────────────────────────────────── + # Collect these FIRST — immediately after the job completes and before the + # result-processing block below, which has early `exit 1` paths (e.g. no + # logs dir found). Any early exit fires the EXIT trap (cleanup_and_save_logs), + # which `sudo rm -rf`s the whole $BENCHMARK_LOGS_DIR — so anything that needs + # to survive must be copied out before then. This mirrors launch_gb300-cw.sh, + # which collects srt-slurm's perfmon CSVs right after the job completes. + # + # Each node's server script (server_sglang.sh / server_vllm.sh) wrote + # perf_samples_<role>_w<idx>_<host>.csv into $BENCHMARK_LOGS_DIR/perfmon + # (NFS-shared, one file per node). Copy them into the GH workspace and point + # the downstream "Process result" step at them via GPU_METRICS_CSV_GLOB so + # utils/aggregate_power.py can do the multi-CSV per-worker / per-stage + # aggregation.
Best-effort: a monitoring hiccup must never fail the upload. + PERFMON_SRC_DIR="$BENCHMARK_LOGS_DIR/perfmon" + if ls "$PERFMON_SRC_DIR"/perf_samples_*.csv >/dev/null 2>&1; then + PERFMON_DST_DIR="$GITHUB_WORKSPACE/perfmon" + mkdir -p "$PERFMON_DST_DIR" + cp "$PERFMON_SRC_DIR"/perf_samples_*.csv "$PERFMON_DST_DIR"/ 2>/dev/null \ + || sudo cp "$PERFMON_SRC_DIR"/perf_samples_*.csv "$PERFMON_DST_DIR"/ 2>/dev/null \ + || true + # CSVs may be root-owned on NFS (containers run as root); make them + # readable by the runner user for the Process result step. + sudo chown -R "$(id -u):$(id -g)" "$PERFMON_DST_DIR" 2>/dev/null || true + perf_csv_count=$(ls "$PERFMON_DST_DIR"/perf_samples_*.csv 2>/dev/null | wc -l | tr -d ' ') + if [ "$perf_csv_count" -gt 0 ]; then + echo "[perfmon] Collected $perf_csv_count per-node perf_samples_*.csv -> $PERFMON_DST_DIR" + if [ -n "${GITHUB_ENV:-}" ]; then + echo "GPU_METRICS_CSV_GLOB=$PERFMON_DST_DIR/perf_samples_*.csv" >> "$GITHUB_ENV" + fi + else + echo "[perfmon] WARNING: perf_samples_*.csv present under $PERFMON_SRC_DIR but none copied to $PERFMON_DST_DIR — measured power aggregation will be skipped" + fi + else + echo "[perfmon] No perf_samples_*.csv found under $PERFMON_SRC_DIR — measured power aggregation will be skipped" + fi + # FIXME: The below is bad and is a result of the indirection of the ways in which # Dynamo jobs are launched. In a follow-up PR, the location of the result file should not # depend on the runner, it should always be in the same spot in the GH workspace. 
@@ -182,6 +219,7 @@ PY fi echo "All result files processed" + # Use sync scancel to ensure nfs file handle is released in time set +x scancel_sync $JOB_ID From dea49cd9745c2899fbc0cf0baa8ed96dcc8d53e9 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 14:32:20 -0700 Subject: [PATCH 10/14] fix(power): address bot review findings on agg telemetry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - _MEM_EXCLUDE_RE now excludes "clock" and "util" (not just "total"), so nvidia-smi's clocks.current.memory (a frequency) and utilization.memory (a percent) are no longer mislabeled as avg_mem_used_mb. (cursor[bot] Medium) - Remove dead _disagg_stage_energies shim — no callers. (cursor[bot] Low) - Add regression test: mem detection ignores clock/util memory columns. 108 utils tests pass. Co-Authored-By: Claude Opus 4.8 --- utils/aggregate_power.py | 23 ++++++++++------------- utils/test_aggregate_power.py | 18 ++++++++++++++++++ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py index ee4327b83..0e3c6bc60 100644 --- a/utils/aggregate_power.py +++ b/utils/aggregate_power.py @@ -65,8 +65,10 @@ "temperature". srt-slurm: "temp_c". Unit: Celsius. - Utilization: column name starts with "utilization" or contains "util". NVIDIA: "utilization.gpu". srt-slurm: "util_pct". Unit: percent. - - Memory: column name contains "mem" but not "total" (avoid "memory.total"). - NVIDIA: "memory.used [MiB]". srt-slurm: "mem_used_mb". Unit: MiB/MB. + - Memory: column name contains "mem" but not "total"/"clock"/"util" — so + "memory.total", "clocks.current.memory" (a frequency), and + "utilization.memory" (a percent) are all rejected; only memory *used* is + picked. NVIDIA: "memory.used [MiB]". srt-slurm: "mem_used_mb". Unit: MiB/MB. 
Power is required for aggregation to fire; the other metrics degrade gracefully when their columns are absent (those fields are simply omitted from the output). @@ -94,7 +96,12 @@ _TEMP_COL_RE = re.compile(r"temp", re.IGNORECASE) _UTIL_COL_RE = re.compile(r"^utilization|util", re.IGNORECASE) _MEM_COL_RE = re.compile(r"mem", re.IGNORECASE) -_MEM_EXCLUDE_RE = re.compile(r"total", re.IGNORECASE) +# Exclude "total" (memory.total), "clock" (clocks.current.memory — a frequency, +# not memory used), and "util" (utilization.memory — a percent). nvidia-smi's +# query emits clocks.current.memory BEFORE any used-memory column, so without +# these excludes _MEM_COL_RE would grab the memory *clock* (~2500 MHz) as +# avg_mem_used_mb. +_MEM_EXCLUDE_RE = re.compile(r"total|clock|util", re.IGNORECASE) _TIMESTAMP_COL_RE = re.compile(r"time", re.IGNORECASE) _GPU_INDEX_COL_RE = re.compile(r"^(index|gpu|gpu_id|gpu_index|card|device)$", re.IGNORECASE) _NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?") @@ -690,16 +697,6 @@ def _disagg_stage_rollup( } -# Backward-compat shim — the original API returned just the two energy values. -def _disagg_stage_energies( - workers: list[dict], duration: float -) -> tuple[float, float] | None: - res = _disagg_stage_rollup(workers, duration) - if res is None: - return None - return res["prefill_energy_j"], res["decode_energy_j"] - - def run( csv_path: Path | Iterable[Path], bench_result: Path, diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py index fb33ea265..bccf1a98e 100644 --- a/utils/test_aggregate_power.py +++ b/utils/test_aggregate_power.py @@ -1205,6 +1205,24 @@ def test_detect_all_columns_excludes_memory_total(): assert cols["mem"] == "memory.used [MiB]" +def test_detect_all_columns_mem_ignores_clock_and_util_memory(): + """The real nvidia-smi query has NO used-memory column — only + clocks.current.memory (a frequency) and utilization.memory (a percent), + both of which contain "mem". 
Neither is memory *used*, so the mem column + must resolve to None rather than mislabeling the memory clock as + avg_mem_used_mb. Regression for the r"mem" over-match.""" + header = [ + "timestamp", "index", "power.draw [W]", "temperature.gpu", + "clocks.current.sm [MHz]", "clocks.current.memory [MHz]", + "utilization.gpu [%]", "utilization.memory [%]", + ] + cols = _detect_all_columns(header) + assert cols["mem"] is None, f"mem should be None, got {cols['mem']!r}" + # The real used-memory column, when present, is still picked. + cols2 = _detect_all_columns(header + ["memory.used [MiB]"]) + assert cols2["mem"] == "memory.used [MiB]" + + def test_detect_all_columns_missing_optional_metrics(): """Only power present — temp/util/mem all None.""" header = ["timestamp", "index", "power.draw [W]"] From 1135f67d319a38b4fe31607681b0d9fd8f1fbf1a Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 15:02:26 -0700 Subject: [PATCH 11/14] feat(power): joules_per_output_token = per-stage decode for disagg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per reviewer: in disagg serving, attribute each token type to only its stage's GPUs — input tokens to prefill GPUs, output tokens to decode GPUs (symmetric). joules_per_output_token is now decode_energy / output_tokens for disagg (was cluster-wide); joules_per_input_token already used prefill energy / input_tokens. joules_per_total_token stays cluster-wide (overall efficiency). Single-node / non-disagg / single-stage keep the cluster-wide output ratio so the field is always populated. Removes the now-redundant joules_per_output_token_decode key (folded into joules_per_output_token). Docstring, CLI help, and tests updated; 108 pass. 
Co-Authored-By: Claude Opus 4.8 --- utils/aggregate_power.py | 91 ++++++++++++++++------------------- utils/test_aggregate_power.py | 47 +++++++++--------- utils/test_process_result.py | 13 ++--- 3 files changed, 70 insertions(+), 81 deletions(-) diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py index 0e3c6bc60..5fb08ef44 100644 --- a/utils/aggregate_power.py +++ b/utils/aggregate_power.py @@ -9,12 +9,12 @@ Cluster-wide fields (always written when any power data exists): - avg_power_w: mean per-GPU power draw (W) during the load window - - joules_per_output_token: total_system_energy / total_output_tokens - (cluster-wide; always — same math single-node and - multinode disagg, so the metric stays comparable - across topologies in the dashboard) + - joules_per_output_token: energy / total_output_tokens. CLUSTER-WIDE + (total_system_energy) on single-node / non-disagg; + OVERRIDDEN to per-stage decode_energy for disagg + (see below). - joules_per_total_token: total_system_energy / (input + output) tokens - (cluster-wide; always) + (cluster-wide; always — overall efficiency number) - avg_temp_c: mean per-GPU temperature (Celsius), when the CSV exposes a temperature column - peak_temp_c: max instantaneous per-GPU temperature in window @@ -22,22 +22,18 @@ - avg_mem_used_mb: mean per-GPU memory used (MiB/MB) For disaggregated multinode runs (DISAGG=true) where filenames carry the perfmon -role/index encoding AND both prefill+decode workers are present, additional flat -per-stage scalars are emitted alongside (NOT instead of) the cluster-wide keys: - - - prefill_avg_power_w: per-GPU mean power across prefill workers - - decode_avg_power_w: per-GPU mean power across decode workers - - joules_per_input_token: prefill_energy / total_input_tokens - (per-stage attribution — prefill processes - input tokens, so its energy / input gives the - prefill-side per-token cost) - - joules_per_output_token_decode: decode_energy / total_output_tokens - (per-stage 
attribution; the _decode suffix is - load-bearing — keeps the cluster-wide - joules_per_output_token comparable across - single-node and disagg deployments and exposes - decode-only energy as a separate key for users - who specifically want it.) +role/index encoding AND both prefill+decode workers are present, the per-token +energy metrics use PER-STAGE attribution — each token type is divided by only the +GPUs of the stage that produces it (the standard disagg-serving convention): + + - joules_per_input_token: prefill_energy / total_input_tokens — input tokens + are processed by the prefill GPUs only. + - joules_per_output_token: decode_energy / total_output_tokens — output tokens + are produced by the decode GPUs only. (For + single-node / non-disagg this stays the cluster-wide + total_system_energy / output_tokens.) + - prefill_avg_power_w: per-GPU mean power across prefill workers + - decode_avg_power_w: per-GPU mean power across decode workers Per-worker breakdown (multinode only — single-node has no role concept), emitted under the `workers` key to match InferenceX-app's BenchmarkRow.workers shape: @@ -607,7 +603,6 @@ def patch_agg_result( joules_per_output_token: float, joules_per_total_token: float, joules_per_input_token: float | None = None, - joules_per_output_token_decode: float | None = None, prefill_avg_power_w: float | None = None, decode_avg_power_w: float | None = None, avg_temp_c: float | None = None, @@ -629,8 +624,6 @@ def patch_agg_result( data["joules_per_total_token"] = round(joules_per_total_token, 6) if joules_per_input_token is not None: data["joules_per_input_token"] = round(joules_per_input_token, 6) - if joules_per_output_token_decode is not None: - data["joules_per_output_token_decode"] = round(joules_per_output_token_decode, 6) if prefill_avg_power_w is not None: data["prefill_avg_power_w"] = round(prefill_avg_power_w, 3) if decode_avg_power_w is not None: @@ -735,40 +728,39 @@ def run( # parse, so aggregate_power_by_worker returns 
None and the field is omitted. workers = aggregate_power_by_worker(paths, start, end) - # Cluster-wide energy + per-token attribution. We ALWAYS report - # joules_per_output_token / joules_per_total_token as cluster-wide ratios - # (total_system_energy / token_count), regardless of disagg. This keeps the - # metric comparable across single-node, multinode-agg, and multinode-disagg - # topologies in the dashboard. Per-stage attribution lives in separate - # *_decode / joules_per_input_token keys (only emitted when disagg AND both - # stages present). + # Per-token energy attribution. + # - joules_per_total_token stays CLUSTER-WIDE on every topology + # (total_system_energy / all tokens) — the overall efficiency number. + # - For disagg with BOTH stages present, joules_per_output_token and + # joules_per_input_token use PER-STAGE energy: output tokens are produced + # by the decode GPUs (decode_energy / output), input tokens by the + # prefill GPUs (prefill_energy / input). This is the standard per-stage + # attribution requested for disagg serving. + # - Single-node / non-disagg / single-stage fall back to the cluster-wide + # output ratio so the field is always populated. total_system_energy_j = avg_power_w * num_gpus * duration total_tokens = total_output + total_input - joules_per_output_token = total_system_energy_j / total_output + joules_per_output_token = total_system_energy_j / total_output # cluster fallback joules_per_total_token = ( total_system_energy_j / total_tokens if total_tokens > 0 else joules_per_output_token ) joules_per_input_token: float | None = None - joules_per_output_token_decode: float | None = None prefill_avg_power_w: float | None = None decode_avg_power_w: float | None = None if disagg and workers is not None: stage = _disagg_stage_rollup(workers, duration) if stage is not None: - # Per-stage attribution: prefill workers process input tokens, - # decode workers process output tokens. 
Strictly more accurate - # than total-energy ratios when prefill/decode have different - # per-GPU power profiles (typical: prefill is compute-bound and - # draws more than memory-bound decode). Exposed as additional - # flat scalars so the cluster-wide joules_per_output_token stays - # comparable across topologies. + # Per-stage attribution: decode GPUs produce output tokens, prefill + # GPUs process input tokens. Strictly more accurate than total-energy + # ratios when prefill/decode have different per-GPU power profiles + # (typical: prefill is compute-bound and draws more than memory-bound + # decode). joules_per_output_token is OVERRIDDEN to the decode-only + # value here (symmetric with the prefill-only joules_per_input_token). prefill_avg_power_w = stage["prefill_avg_power_w"] decode_avg_power_w = stage["decode_avg_power_w"] - joules_per_output_token_decode = ( - stage["decode_energy_j"] / total_output - ) + joules_per_output_token = stage["decode_energy_j"] / total_output joules_per_input_token = ( stage["prefill_energy_j"] / total_input if total_input > 0 else None ) @@ -787,7 +779,6 @@ def run( joules_per_output_token, joules_per_total_token, joules_per_input_token=joules_per_input_token, - joules_per_output_token_decode=joules_per_output_token_decode, prefill_avg_power_w=prefill_avg_power_w, decode_avg_power_w=decode_avg_power_w, avg_temp_c=avg_temp_c, @@ -851,12 +842,12 @@ def main() -> int: parser.add_argument( "--disagg", action="store_true", - help="Treat as disaggregated inference: emit prefill_avg_power_w, " - "decode_avg_power_w, joules_per_input_token, and " - "joules_per_output_token_decode using per-stage energy attribution " - "(prefill workers' energy / input tokens, decode workers' energy / " - "output tokens). 
Requires CSV filenames to carry the perfmon role/index " - "encoding.", + help="Treat as disaggregated inference: emit prefill_avg_power_w / " + "decode_avg_power_w, and use PER-STAGE energy attribution for " + "joules_per_input_token (prefill energy / input tokens) and " + "joules_per_output_token (decode energy / output tokens). " + "joules_per_total_token stays cluster-wide. Requires CSV filenames to " + "carry the perfmon role/index encoding.", ) args = parser.parse_args() diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py index bccf1a98e..578981d29 100644 --- a/utils/test_aggregate_power.py +++ b/utils/test_aggregate_power.py @@ -876,16 +876,16 @@ def test_run_disagg_emits_workers_and_per_stage_joules(tmp_path: Path): # Cluster-wide avg = (8*600 + 8*400) / 16 = 500W. assert patched["avg_power_w"] == pytest.approx(500.0) - # Cluster-wide joules (total_system_energy / token_count) — same math as - # single-node so the metric stays comparable across topologies. - assert patched["joules_per_output_token"] == pytest.approx(80_000 / 1000) # 80.0 + # joules_per_total_token stays cluster-wide (all energy / all tokens). assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000) # ≈ 8.889 - # Per-stage scalars (new): prefill_avg, decode_avg, J/input, J/output_decode. + # Per-stage attribution: input divided by prefill energy, output by decode + # energy (the disagg convention). 
assert patched["prefill_avg_power_w"] == pytest.approx(600.0) assert patched["decode_avg_power_w"] == pytest.approx(400.0) - assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000) # 6.0 - assert patched["joules_per_output_token_decode"] == pytest.approx(32_000 / 1000) # 32.0 + assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000) # 6.0 (prefill) + assert patched["joules_per_output_token"] == pytest.approx(32_000 / 1000) # 32.0 (decode) + assert "joules_per_output_token_decode" not in patched # folded into joules_per_output_token # workers[] (renamed from power_by_worker). workers = patched["workers"] @@ -939,17 +939,17 @@ def test_run_disagg_excludes_frontend_from_per_stage_energy(tmp_path: Path): assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 patched = json.loads(agg.read_text()) - # Per-stage scalars (frontend excluded). - # J/input = 24_000 / 8000 = 3.0. + # Per-stage attribution excludes the frontend node. + # J/input = prefill 24_000 / 8000 = 3.0. + # J/output = decode 16_000 / 1000 = 16.0 (frontend's 4_000 J NOT counted). assert patched["joules_per_input_token"] == pytest.approx(3.0) - # J/output_decode = 16_000 / 1000 = 16.0. - assert patched["joules_per_output_token_decode"] == pytest.approx(16.0) + assert patched["joules_per_output_token"] == pytest.approx(16.0) assert patched["prefill_avg_power_w"] == pytest.approx(600.0) assert patched["decode_avg_power_w"] == pytest.approx(400.0) - # Cluster-wide J/output still uses TOTAL energy (incl. frontend). - # total energy = (600+400+100) × 4 × 10 = 44_000 J → 44.0 J/output_tok. - assert patched["joules_per_output_token"] == pytest.approx(44.0) + # But the frontend's energy IS counted in the cluster-wide total efficiency: + # total energy = (600+400+100) × 4 × 10 = 44_000 J → / 9000 tokens ≈ 4.889. + assert patched["joules_per_total_token"] == pytest.approx(44_000 / 9000) # Frontend still appears in the worker list for observability. 
roles = [w["role"] for w in patched["workers"]] @@ -1049,12 +1049,12 @@ def test_run_disagg_handles_zero_input_tokens(tmp_path: Path): assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 patched = json.loads(agg.read_text()) assert "joules_per_input_token" not in patched - # Per-stage decode still works — depends only on decode_energy / output. - assert patched["joules_per_output_token_decode"] == pytest.approx(16_000 / 1000) + # Per-stage output still works — depends only on decode_energy / output. + assert patched["joules_per_output_token"] == pytest.approx(16_000 / 1000) # decode assert patched["prefill_avg_power_w"] == pytest.approx(600.0) assert patched["decode_avg_power_w"] == pytest.approx(400.0) - # Cluster-wide J/output uses TOTAL energy. (600+400) × 4 × 10 = 40_000 J. - assert patched["joules_per_output_token"] == pytest.approx(40_000 / 1000) + # Cluster-wide total uses TOTAL energy. (600+400) × 4 × 10 = 40_000 J / 1000. + assert patched["joules_per_total_token"] == pytest.approx(40_000 / 1000) def test_patch_agg_result_with_workers_and_per_stage(tmp_path: Path): @@ -1071,7 +1071,6 @@ def test_patch_agg_result_with_workers_and_per_stage(tmp_path: Path): joules_per_output_token=40.0, joules_per_total_token=4.44, joules_per_input_token=3.0, - joules_per_output_token_decode=16.0, prefill_avg_power_w=600.0, decode_avg_power_w=400.0, workers=workers, @@ -1080,7 +1079,6 @@ def test_patch_agg_result_with_workers_and_per_stage(tmp_path: Path): assert data["avg_power_w"] == 500.0 assert data["joules_per_output_token"] == 40.0 assert data["joules_per_input_token"] == 3.0 - assert data["joules_per_output_token_decode"] == 16.0 assert data["prefill_avg_power_w"] == 600.0 assert data["decode_avg_power_w"] == 400.0 assert data["workers"] == workers @@ -1610,16 +1608,15 @@ def test_run_disagg_amd_emits_workers_and_per_stage_joules(tmp_path: Path): assert run(list(tmp_path.glob("perf_samples_*.csv")), bench, agg, disagg=True) == 0 patched 
= json.loads(agg.read_text()) - # Cluster-wide (vendor-agnostic, same math as single-node / NVIDIA). + # Cluster-wide total (vendor-agnostic, same math as single-node / NVIDIA). assert patched["avg_power_w"] == pytest.approx(500.0) - assert patched["joules_per_output_token"] == pytest.approx(80_000 / 1000) # 80.0 assert patched["joules_per_total_token"] == pytest.approx(80_000 / 9000) # ≈ 8.889 - # Per-stage scalars from amd-smi CSVs. + # Per-stage attribution from amd-smi CSVs: input=prefill energy, output=decode energy. assert patched["prefill_avg_power_w"] == pytest.approx(600.0) assert patched["decode_avg_power_w"] == pytest.approx(400.0) - assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000) # 6.0 - assert patched["joules_per_output_token_decode"] == pytest.approx(32_000 / 1000) # 32.0 + assert patched["joules_per_input_token"] == pytest.approx(48_000 / 8000) # 6.0 (prefill) + assert patched["joules_per_output_token"] == pytest.approx(32_000 / 1000) # 32.0 (decode) # workers[] breakdown. workers = patched["workers"] @@ -1661,6 +1658,6 @@ def test_run_disagg_amd_vllm_topology_one_worker_per_node(tmp_path: Path): # 2 prefill workers × 8 GPUs @ 600W → 96_000 J / 8000 input = 12.0. assert patched["joules_per_input_token"] == pytest.approx(96_000 / 8000) # 2 decode workers × 8 GPUs @ 400W → 64_000 J / 1000 output = 64.0. 
- assert patched["joules_per_output_token_decode"] == pytest.approx(64_000 / 1000) + assert patched["joules_per_output_token"] == pytest.approx(64_000 / 1000) assert patched["prefill_avg_power_w"] == pytest.approx(600.0) assert patched["decode_avg_power_w"] == pytest.approx(400.0) diff --git a/utils/test_process_result.py b/utils/test_process_result.py index ad931591b..78f293fd0 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -796,17 +796,18 @@ def test_disagg_multinode_emits_workers_and_per_stage_joules(self, tmp_path, mul patched = json.loads((tmp_path / "agg_benchmark_result.json").read_text()) - # Per-stage attribution scalars: prefill_energy / input, decode_energy / output. + # Per-stage attribution: input divided by prefill energy, output by decode. # Prefill: 600 × 4 × 60 = 144_000 J → / 240_000 = 0.6 J/input_tok. - # Decode: 400 × 4 × 60 = 96_000 J → / 30_000 = 3.2 J/output_tok_decode. + # Decode: 400 × 4 × 60 = 96_000 J → / 30_000 = 3.2 J/output_tok. assert patched["joules_per_input_token"] == pytest.approx(0.6, abs=0.01) - assert patched["joules_per_output_token_decode"] == pytest.approx(3.2, abs=0.01) + assert patched["joules_per_output_token"] == pytest.approx(3.2, abs=0.01) # decode + assert "joules_per_output_token_decode" not in patched assert patched["prefill_avg_power_w"] == pytest.approx(600.0, abs=0.5) assert patched["decode_avg_power_w"] == pytest.approx(400.0, abs=0.5) - # Cluster-wide J/output (frontend would be incl. here too if present). - # Total energy = (600+400) × 4 × 60 = 240_000 J → / 30_000 = 8.0 J/output_tok. - assert patched["joules_per_output_token"] == pytest.approx(8.0, abs=0.05) + # Cluster-wide total efficiency still counts ALL energy. + # Total energy = (600+400) × 4 × 60 = 240_000 J → / 270_000 ≈ 0.889 J/total_tok. + assert patched["joules_per_total_token"] == pytest.approx(240_000 / 270_000, abs=0.01) # Per-worker breakdown labeled with role. 
workers = patched["workers"] From 30f1c2131b5481928559c5755ffdba3c06c1b508 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 15:13:53 -0700 Subject: [PATCH 12/14] ci(multinode): drop root-owned benchmark_logs before checkout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AMD multinode container runs as root and writes benchmark_logs/. If a job is cancelled (e.g. concurrency supersede), its cleanup trap never runs, leaving root-owned dirs. actions/checkout (clean: true) then can't rmdir them (EACCES) and fails BEFORE the job starts — poison-failing every job scheduled onto that runner. Add `sudo rm -rf $GITHUB_WORKSPACE/benchmark_logs` to the shared Slurm-cleanup anchor (runs pre-checkout AND post-run) so a dirty runner self-heals. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/benchmark-multinode-tmpl.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index f901b1ff7..b2eec27e6 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -178,6 +178,13 @@ jobs: sleep 5 done fi + # Drop root-owned leftovers from a prior (often cancelled) multinode + # run. The benchmark container runs as root and writes benchmark_logs/; + # if the job was cancelled its cleanup trap never ran, leaving + # root-owned dirs that actions/checkout (clean: true) can't rmdir + # (EACCES) — which then poison-fails EVERY subsequent job on that + # runner. Runs in both pre- and post-run cleanup (shared anchor). 
+ sudo rm -rf "${GITHUB_WORKSPACE}/benchmark_logs" 2>/dev/null || true - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: From 488ce46d1f1cd75553749541481bf1d2706eece5 Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 15:38:51 -0700 Subject: [PATCH 13/14] fix(launcher): tolerate transient squeue timeouts in mi355x job poll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A full sweep floods slurmctld, so `squeue` intermittently returns "slurm_load_jobs error: Socket timed out". The old liveness check (`! squeue ... | grep -q $JOB_ID`) treated that empty/failed output as "job died" and exit 1'd — a false failure on a healthy job (observed on dsr1-fp8-mi355x-sglang-disagg conc 1024x2048). Add job_alive(): a non-zero squeue exit is treated as "still alive" (don't false-fail on a scheduler blip); only a SUCCESSFUL squeue that omits the job — re-checked once to avoid a single-sample race — counts as gone. Used by both the wait-for-log loop and the completion poll. Co-Authored-By: Claude Opus 4.8 --- runners/launch_mi355x-amds.sh | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index b893efa84..cc4544962 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -90,11 +90,27 @@ if [[ "$IS_MULTINODE" == "true" ]]; then # Give slurm time to start the job and create log file sleep 10 + # Whether $JOB_ID is still in the SLURM queue, resilient to transient + # slurmctld timeouts ("slurm_load_jobs error: Socket timed out") — common + # when a full sweep floods the controller. A FAILED squeue (non-zero exit) + # is treated as "still alive" so a scheduler blip can't be misread as job + # death; only a SUCCESSFUL squeue that omits the job means it's gone, and we + # re-check once before declaring it gone to avoid a single-sample race. 
+ job_alive() { + local out rc + out=$(squeue -u "$USER" --noheader --format='%i' 2>/dev/null); rc=$? + [[ $rc -ne 0 ]] && return 0 # scheduler hiccup → assume alive + grep -qw "$JOB_ID" <<<"$out" && return 0 + sleep 5 + out=$(squeue -u "$USER" --noheader --format='%i' 2>/dev/null) || return 0 + grep -qw "$JOB_ID" <<<"$out" + } + # Wait for log file to appear (also check job is still alive) while ! ls "$LOG_FILE" &>/dev/null; do - if ! squeue -u "$USER" --noheader --format='%i' | grep -q "$JOB_ID"; then - echo "ERROR: Job $JOB_ID failed before creating log file" - scontrol show job "$JOB_ID" + if ! job_alive; then + echo "ERROR: Job $JOB_ID is no longer in the queue and never created a log file" + scontrol show job "$JOB_ID" 2>/dev/null || true exit 1 fi sleep 5 @@ -102,9 +118,10 @@ if [[ "$IS_MULTINODE" == "true" ]]; then set +x - # Poll for job completion in background + # Poll for job completion in background (tolerant of transient squeue + # timeouts via job_alive — a scheduler blip must not look like completion). ( - while squeue -u $USER --noheader --format='%i' | grep -q "$JOB_ID"; do + while job_alive; do sleep 10 done ) & From 6849229d67ac0a42cf30e68226c57ce0256ac2ce Mon Sep 17 00:00:00 2001 From: Aryan Date: Thu, 28 May 2026 15:54:46 -0700 Subject: [PATCH 14/14] feat(power): capture AMD temp/util/mem (gfx_activity, used_vram, hotspot) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The amd-smi monitor only ran `metric -p -c -t -u`, so no VRAM column was emitted and avg_mem_used_mb never populated on AMD. It also used util/mem column matchers tuned for NVIDIA/srt-slurm names, which miss amd-smi's conventions — so avg_util_pct and avg_temp_c silently dropped too. - benchmark_lib.sh: add `-m` (mem-usage) to the amd-smi command so a used_vram column is captured. - aggregate_power.py column detection: - util: also match amd-smi `gfx_activity` (umc/mm_activity excluded). 
- mem: match positively on memory/vram + "used" instead of broad "mem" minus a growing exclude list — picks memory.used / mem_used_mb / used_vram while rejecting mem_temperature, mem_voltage, total/free_vram, the memory clock, and utilization.memory. - temp: prefer hotspot/junction over the first temp column, since edge temperature reads N/A on data-center AMD parts (MI300/MI355). NVIDIA and srt-slurm detection is unchanged (verified by existing tests). Adds AMD-header detection tests; full suite 111 passed. Co-Authored-By: Claude Opus 4.8 --- benchmarks/benchmark_lib.sh | 10 ++++--- utils/aggregate_power.py | 56 ++++++++++++++++++++++------------- utils/test_aggregate_power.py | 40 +++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 24 deletions(-) diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 747b445c0..2c0d881a1 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -41,9 +41,11 @@ start_gpu_monitor() { GPU_MONITOR_PID=$! echo "[GPU Monitor] Started NVIDIA (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" elif command -v amd-smi &>/dev/null; then - # amd-smi metric flags: -p power, -c clocks, -t temperature, -u usage, - # -w native watch mode (emits a timestamp column per sample), - # --csv. The awk filter keeps the first CSV header line and drops + # amd-smi metric flags: -p power, -c clocks, -t temperature, -u usage + # (gfx_activity), -m mem-usage (used_vram), -w native watch + # mode (emits a timestamp column per sample), --csv. Without -m there is + # no VRAM column, so avg_mem_used_mb would never populate on AMD. + # The awk filter keeps the first CSV header line and drops # amd-smi's preamble / repeated headers. 
Header match is case-insensitive # (tolower) so a capitalized "Timestamp," header — should amd-smi ever # emit one — still passes through; aggregate_power's column detection is @@ -51,7 +53,7 @@ start_gpu_monitor() { # clock, so multinode aggregation assumes cluster clocks are NTP-synced # (same assumption as nvidia-smi; aggregate_power windows by absolute # epoch from benchmark_serving.py). - amd-smi metric -p -c -t -u -w "$interval" --csv 2>/dev/null \ + amd-smi metric -p -c -t -u -m -w "$interval" --csv 2>/dev/null \ | awk 'tolower($0) ~ /^timestamp,/{if(!h){print;h=1};next} h{print}' > "$output" & GPU_MONITOR_PID=$! echo "[GPU Monitor] Started AMD (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)" diff --git a/utils/aggregate_power.py b/utils/aggregate_power.py index 5fb08ef44..9efcf2fd0 100644 --- a/utils/aggregate_power.py +++ b/utils/aggregate_power.py @@ -57,14 +57,18 @@ - Power: timestamp + column whose name contains "power" (excluding "limit"/"cap"/"max"/"min"). NVIDIA: "power.draw [W]". AMD: "socket_power". srt-slurm: "power_w". - - Temperature: column name contains "temp". NVIDIA: "temperature.gpu". AMD: - "temperature". srt-slurm: "temp_c". Unit: Celsius. - - Utilization: column name starts with "utilization" or contains "util". + - Temperature: column name contains "temp"; hotspot/junction columns are + preferred over the first match because data-center AMD parts report edge + temperature as N/A. NVIDIA: "temperature.gpu". AMD amd-smi: "edge_temperature" + / "hotspot_temperature" (junction picked). srt-slurm: "temp_c". Unit: Celsius. + - Utilization: column starts with "utilization" or contains "util", or is + amd-smi's "gfx_activity" (umc_activity / mm_activity are not matched). NVIDIA: "utilization.gpu". srt-slurm: "util_pct". Unit: percent. 
- - Memory: column name contains "mem" but not "total"/"clock"/"util" — so - "memory.total", "clocks.current.memory" (a frequency), and - "utilization.memory" (a percent) are all rejected; only memory *used* is - picked. NVIDIA: "memory.used [MiB]". srt-slurm: "mem_used_mb". Unit: MiB/MB. + - Memory used: column mentions memory/vram AND "used" — picks NVIDIA + "memory.used [MiB]", srt-slurm "mem_used_mb", amd-smi "used_vram"; rejects + decoys lacking "used" (memory.total / total_vram / free_vram, the memory + *clock* "clocks.current.memory", utilization.memory, mem_temperature, + mem_voltage). Unit: MiB/MB. Power is required for aggregation to fire; the other metrics degrade gracefully when their columns are absent (those fields are simply omitted from the output). @@ -90,14 +94,24 @@ _POWER_COL_RE = re.compile(r"power", re.IGNORECASE) _POWER_EXCLUDE_RE = re.compile(r"limit|cap|max|min", re.IGNORECASE) _TEMP_COL_RE = re.compile(r"temp", re.IGNORECASE) -_UTIL_COL_RE = re.compile(r"^utilization|util", re.IGNORECASE) -_MEM_COL_RE = re.compile(r"mem", re.IGNORECASE) -# Exclude "total" (memory.total), "clock" (clocks.current.memory — a frequency, -# not memory used), and "util" (utilization.memory — a percent). nvidia-smi's -# query emits clocks.current.memory BEFORE any used-memory column, so without -# these excludes _MEM_COL_RE would grab the memory *clock* (~2500 MHz) as -# avg_mem_used_mb. -_MEM_EXCLUDE_RE = re.compile(r"total|clock|util", re.IGNORECASE) +# Data-center AMD parts (MI300/MI355) report edge temperature as N/A and expose +# the real die temperature as hotspot/junction; prefer those when present so +# avg_temp_c isn't computed over an all-N/A edge column. NVIDIA's single +# "temperature.gpu" and srt-slurm's "temp_c" have neither token and fall through +# to the first temperature column unchanged. 
+_TEMP_PREFER_RE = re.compile(r"hotspot|junction", re.IGNORECASE) +# Utilization: NVIDIA "utilization.gpu", srt-slurm "util_pct", AMD amd-smi +# "gfx_activity" (the GPU/graphics-engine busy percent). amd-smi's other usage +# columns — umc_activity (memory controller), mm_activity (multimedia) — are +# intentionally NOT matched so gfx_activity is the one picked. +_UTIL_COL_RE = re.compile(r"^utilization|util|gfx_activity", re.IGNORECASE) +# Memory *used*: match positively on a column that mentions both memory/vram and +# "used" rather than broad "mem" + a growing exclude list. This naturally picks +# NVIDIA "memory.used [MiB]", srt-slurm "mem_used_mb", and amd-smi "used_vram" +# while rejecting same-prefix decoys that lack "used": memory.total / total_vram / +# free_vram, clocks.current.memory (a frequency), utilization.memory (a percent), +# and amd-smi's mem_temperature / mem_voltage. +_MEM_COL_RE = re.compile(r"(?:mem|vram).*used|used.*(?:mem|vram)", re.IGNORECASE) _TIMESTAMP_COL_RE = re.compile(r"time", re.IGNORECASE) _GPU_INDEX_COL_RE = re.compile(r"^(index|gpu|gpu_id|gpu_index|card|device)$", re.IGNORECASE) _NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?") @@ -208,12 +222,14 @@ def _detect_all_columns(header: list[str]) -> dict[str, str | None]: (c for c in header if _POWER_COL_RE.search(c) and not _POWER_EXCLUDE_RE.search(c)), None, ) - temp_col = next((c for c in header if _TEMP_COL_RE.search(c)), None) - util_col = next((c for c in header if _UTIL_COL_RE.search(c)), None) - mem_col = next( - (c for c in header if _MEM_COL_RE.search(c) and not _MEM_EXCLUDE_RE.search(c)), - None, + temp_cols = [c for c in header if _TEMP_COL_RE.search(c)] + # Prefer hotspot/junction (the real die temp on data-center AMD parts) over + # the first temperature column (edge on AMD, temperature.gpu on NVIDIA). 
+ temp_col = next((c for c in temp_cols if _TEMP_PREFER_RE.search(c)), None) or ( + temp_cols[0] if temp_cols else None ) + util_col = next((c for c in header if _UTIL_COL_RE.search(c)), None) + mem_col = next((c for c in header if _MEM_COL_RE.search(c)), None) gpu_col = next((c for c in header if _GPU_INDEX_COL_RE.match(c.strip())), None) return { "timestamp": timestamp_col, diff --git a/utils/test_aggregate_power.py b/utils/test_aggregate_power.py index 578981d29..47ea5a452 100644 --- a/utils/test_aggregate_power.py +++ b/utils/test_aggregate_power.py @@ -1196,6 +1196,46 @@ def test_detect_all_columns_amd_style(): assert cols["mem"] is None +def test_detect_all_columns_amd_smi_full(): + """Real amd-smi `metric -p -c -t -u -m --csv` header on a data-center part. + + Exercises: gfx_activity as util (not umc/mm_activity), used_vram as mem (not + total/free_vram, mem_clock, mem_voltage, or mem_temperature), and + hotspot_temperature preferred over the N/A edge_temperature. + """ + header = [ + "timestamp", "gpu", + "socket_power", "mem_voltage", # -p + "gfx_clock", "mem_clock", # -c + "edge_temperature", "hotspot_temperature", "mem_temperature", # -t + "gfx_activity", "umc_activity", "mm_activity", # -u + "total_vram", "used_vram", "free_vram", # -m + ] + cols = _detect_all_columns(header) + assert cols["power"] == "socket_power" + assert cols["util"] == "gfx_activity" + assert cols["mem"] == "used_vram" + # Hotspot/junction preferred over edge (edge reads N/A on MI300/MI355). 
+ assert cols["temp"] == "hotspot_temperature" + + +def test_detect_all_columns_temp_prefers_junction(): + """junction_temperature wins over a leading edge_temperature column.""" + header = ["timestamp", "gpu", "socket_power", + "edge_temperature", "junction_temperature"] + assert _detect_all_columns(header)["temp"] == "junction_temperature" + + +def test_detect_all_columns_mem_vram_used_variants(): + """vram_used (amd-smi's used_vram with the words swapped) resolves; total/free_vram never do.""" + assert _detect_all_columns( + ["timestamp", "power_w", "total_vram", "vram_used", "free_vram"] + )["mem"] == "vram_used" + assert _detect_all_columns( + ["timestamp", "power_w", "total_vram", "free_vram"] + )["mem"] is None + + def test_detect_all_columns_excludes_memory_total(): """memory.total must not be picked as the memory column (we want USED memory).""" header = ["timestamp", "index", "power.draw [W]", "memory.total [MiB]", "memory.used [MiB]"]