SemiAnalysisAI · arygupt · May 27, 2026 · May 27, 2026 · May 28, 2026 · May 28, 2026
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -178,6 +178,13 @@ jobs:
               sleep 5
             done
           fi
+          # Drop root-owned leftovers from a prior (often cancelled) multinode
+          # run. The benchmark container runs as root and writes benchmark_logs/;
+          # if the job was cancelled its cleanup trap never ran, leaving
+          # root-owned dirs that actions/checkout (clean: true) can't rmdir
+          # (EACCES) — which then poison-fails EVERY subsequent job on that
+          # runner. Runs in both pre- and post-run cleanup (shared anchor).
+          sudo rm -rf "${GITHUB_WORKSPACE}/benchmark_logs" 2>/dev/null || true
 
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -41,10 +41,20 @@ start_gpu_monitor() {
         GPU_MONITOR_PID=$!
         echo "[GPU Monitor] Started NVIDIA (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)"
     elif command -v amd-smi &>/dev/null; then
-        # Use amd-smi native watch mode (-w) which includes timestamps automatically.
-        # Pipe through awk to: skip preamble lines, keep first CSV header, skip repeated headers.
-        amd-smi metric -p -c -t -u -w "$interval" --csv 2>/dev/null \
-            | awk '/^timestamp,/{if(!h){print;h=1};next} h{print}' > "$output" &
+        # amd-smi metric flags: -p power, -c clocks, -t temperature, -u usage
+        # (gfx_activity), -m mem-usage (used_vram), -w <interval> native watch
+        # mode (emits a timestamp column per sample), --csv. Without -m there is
+        # no VRAM column, so avg_mem_used_mb would never populate on AMD.
+        # The awk filter keeps the first CSV header line and drops
+        # amd-smi's preamble / repeated headers. Header match is case-insensitive
+        # (tolower) so a capitalized "Timestamp," header — should amd-smi ever
+        # emit one — still passes through; aggregate_power's column detection is
+        # case-insensitive too. NOTE: amd-smi timestamps are node-local wall
+        # clock, so multinode aggregation assumes cluster clocks are NTP-synced
+        # (same assumption as nvidia-smi; aggregate_power windows by absolute
+        # epoch from benchmark_serving.py).
+        amd-smi metric -p -c -t -u -m -w "$interval" --csv 2>/dev/null \
+            | awk 'tolower($0) ~ /^timestamp,/{if(!h){print;h=1};next} h{print}' > "$output" &
         GPU_MONITOR_PID=$!
         echo "[GPU Monitor] Started AMD (PID=$GPU_MONITOR_PID, interval=${interval}s, output=$output)"
     else
@@ -63,11 +73,75 @@ stop_gpu_monitor() {
             local lines
             lines=$(wc -l < "$GPU_METRICS_CSV")
             echo "[GPU Monitor] Collected $lines rows -> $GPU_METRICS_CSV"
+            # Echo the captured header so a vendor-SMI schema mismatch (the one
+            # thing that silently yields 0 usable power samples downstream) is
+            # visible in CI logs without re-running on hardware.
+            echo "[GPU Monitor] CSV header: $(head -1 "$GPU_METRICS_CSV" 2>/dev/null)"
         fi
     fi
     GPU_MONITOR_PID=""
 }
 
+# Start a per-node GPU power monitor for multi-node disaggregated runs.
+#
+# This is the AMD/SGLang/vLLM analogue of NVIDIA srt-slurm's per-node perfmon
+# (PR #35): there is no orchestrator to spawn nvidia-smi on each node, so each
+# node starts its own amd-smi/nvidia-smi monitor here. The output filename
+# encodes the worker role and index in exactly the format
+# utils/aggregate_power.py's _parse_perfmon_label expects,
+# perf_samples_<role>_w<worker_idx>_<host>.csv, so the downstream aggregation
+# can attribute energy per worker and (for disagg) per stage. role must be one
+# of: prefill, decode, agg, frontend; worker_idx must be a non-negative integer.
+#
+# Output goes to $PERFMON_OUTPUT_DIR, which job.slurm points at the NFS-shared
+# /benchmark_logs/perfmon mount so every node's CSV lands in one directory the
+# runner can collect. The monitor runs for the whole server lifetime;
+# aggregate_power.py windows the samples down to each concurrency's benchmark
+# load window using the timestamps benchmark_serving.py writes.
+#
+# Best-effort by design: an unset output dir, an unknown role, a non-numeric
+# worker index, a missing amd-smi/nvidia-smi, or a start_gpu_monitor failure
+# all end in "return 0" — a monitoring hiccup must never fail the benchmark.
+#
+# Usage: start_perf_monitor <role> <worker_idx> [interval_seconds]
+start_perf_monitor() {
+    local role="${1:-}"
+    local worker_idx="${2:-}"
+    local interval="${3:-${PERFMON_SAMPLE_INTERVAL:-1}}"
+
+    local out_dir="${PERFMON_OUTPUT_DIR:-}"
+    if [[ -z "$out_dir" ]]; then
+        echo "[perfmon] PERFMON_OUTPUT_DIR unset — skipping per-node power monitor"
+        return 0
+    fi
+    case "$role" in
+        prefill|decode|agg|frontend) ;;
+        *)
+            echo "[perfmon] unknown role '$role' (expected prefill|decode|agg|frontend) — skipping monitor"
+            return 0
+            ;;
+    esac
+    # Reject an empty/non-numeric index: it would break the w<worker_idx> filename.
+    if [[ ! "$worker_idx" =~ ^[0-9]+$ ]]; then
+        echo "[perfmon] invalid worker index '$worker_idx' (expected non-negative integer) — skipping monitor"
+        return 0
+    fi
+    if ! mkdir -p "$out_dir" 2>/dev/null; then
+        echo "[perfmon] cannot create $out_dir — skipping per-node power monitor"
+        return 0
+    fi
+
+    # Prefer the short hostname, then plain `hostname`, then "unknown"; squash any
+    # character outside [A-Za-z0-9.-] to '_' so the filename stays parseable.
+    local host
+    host=$(hostname -s 2>/dev/null || hostname 2>/dev/null) || host=""
+    host=$(printf '%s' "${host:-unknown}" | tr -c 'A-Za-z0-9.-' '_')
+
+    local out="${out_dir}/perf_samples_${role}_w${worker_idx}_${host}.csv"
+    echo "[perfmon] starting per-node power monitor: role=$role worker=$worker_idx host=$host interval=${interval}s -> $out"
+    start_gpu_monitor --output "$out" --interval "$interval" || true
+}
+
 # Check if required environment variables are set
 # Usage: check_env_vars VAR1 VAR2 VAR3 ...
 # Exits with code 1 if any variable is not set

diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
@@ -298,6 +298,16 @@ export BENCHMARK_LOGS_DIR="${BENCHMARK_LOGS_DIR:-$(pwd)/benchmark_logs}"
 export KEEP_CONTAINERS="${KEEP_CONTAINERS:-0}"
 export ENGINE=$ENGINE
 
+# Per-node measured-power monitoring. Each node's server script starts an
+# amd-smi/nvidia-smi monitor (start_perf_monitor in benchmark_lib.sh) that
+# writes perf_samples_<role>_w<idx>_<host>.csv into PERFMON_OUTPUT_DIR. That
+# dir is /benchmark_logs/perfmon inside the container, i.e.
+# ${BENCHMARK_LOGS_DIR}/perfmon on the (NFS-shared) host, so every node's CSV
+# lands in one place the runner can collect. Pre-create it on the host so the
+# directory exists before any container writes to it.
+export PERFMON_SAMPLE_INTERVAL="${PERFMON_SAMPLE_INTERVAL:-1}"
+mkdir -p "${BENCHMARK_LOGS_DIR}/perfmon" 2>/dev/null || true
+
 # Eval-related env vars (threaded from submit.sh)
 export RUN_EVAL="${RUN_EVAL:-false}"
 export EVAL_ONLY="${EVAL_ONLY:-false}"
@@ -375,6 +385,8 @@ DOCKER_ENV_COMMON=(
     -e RUNNER_TYPE=\$RUNNER_TYPE
     -e RESULT_FILENAME=\$RESULT_FILENAME
     -e SPEC_DECODING=\$SPEC_DECODING
+    -e PERFMON_OUTPUT_DIR=/benchmark_logs/perfmon
+    -e PERFMON_SAMPLE_INTERVAL=\$PERFMON_SAMPLE_INTERVAL
     -e PREFILL_TP_SIZE=\$PREFILL_TP_SIZE
     -e PREFILL_ENABLE_EP=\$PREFILL_ENABLE_EP
     -e PREFILL_ENABLE_DP=\$PREFILL_ENABLE_DP

diff --git a/benchmarks/multi_node/amd_utils/server_sglang.sh b/benchmarks/multi_node/amd_utils/server_sglang.sh
@@ -48,6 +48,9 @@ GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
 # =============================================================================
 source $SGLANG_WS_PATH/setup_deps.sh
 source $SGLANG_WS_PATH/env.sh
+# Power-monitoring helpers (start_perf_monitor / stop_gpu_monitor). SGLANG_WS_PATH
+# is .../benchmarks/multi_node/amd_utils, so the shared lib is two levels up.
+source "$SGLANG_WS_PATH/../../benchmark_lib.sh"
 
 host_ip=$(ip route get 1.1.1.1 | awk '/src/ {print $7}')
 host_name=$(hostname)
@@ -279,6 +282,27 @@ done
 echo "Prefill worker headnode list: ${PREFILL_HEADNODE_URLS[@]}"
 echo "Decode  worker headnode list: ${DECODE_HEADNODE_URLS[@]}"
 
+# =============================================================================
+# Per-node measured-power monitor (best-effort)
+# =============================================================================
+# Classify this node into the same worker buckets the role branches below use:
+#   NODE_RANK in [0, NODE_OFFSET)  -> prefill, worker = NODE_RANK / PREFILL_NODES_PER_WORKER
+#   NODE_RANK >= NODE_OFFSET       -> decode,  worker = (NODE_RANK - NODE_OFFSET) / DECODE_NODES_PER_WORKER
+# (NODE_OFFSET = PREFILL_NODES_PER_WORKER * xP.) Node 0 is the proxy too, but
+# its GPUs run the prefill head, so labeling it prefill attributes its energy
+# to the right stage. The monitor runs for the whole server lifetime;
+# aggregate_power.py windows the samples down to each concurrency's load window.
+if [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
+    PERF_ROLE="prefill"
+    PERF_WORKER_IDX=$(( NODE_RANK / PREFILL_NODES_PER_WORKER ))
+else
+    PERF_ROLE="decode"
+    PERF_WORKER_IDX=$(( (NODE_RANK - NODE_OFFSET) / DECODE_NODES_PER_WORKER ))
+fi
+if [[ "$DRY_RUN" -ne 1 ]]; then
+    start_perf_monitor "$PERF_ROLE" "$PERF_WORKER_IDX"
+fi
+
 # =============================================================================
 # Configuration Builder Functions
 # =============================================================================
@@ -636,6 +660,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
 
     if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then
         echo "ERROR: eval failed; exiting node-0 with rc=1"
+        stop_gpu_monitor
         exit 1
     fi
 
@@ -777,5 +802,8 @@ else
 
 fi
 
+# Stop the per-node power monitor and flush its CSV before the container exits.
+stop_gpu_monitor
+
 echo "Script completed successfully"
 exit 0
diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -50,6 +50,9 @@ MODEL_PATH="${MODEL_PATH:-${MODEL_DIR}/${MODEL_NAME}}"
 # Dependencies and Environment Setup
 # =============================================================================
 source $WS_PATH/env.sh
+# Power-monitoring helpers (start_perf_monitor / stop_gpu_monitor). WS_PATH is
+# .../benchmarks/multi_node/amd_utils, so the shared lib is two levels up.
+source "$WS_PATH/../../benchmark_lib.sh"
 
 host_ip=$(ip route get 1.1.1.1 2>/dev/null | awk '/src/ {print $7}')
 # RDMA IP for Nixl KV transfer (prefer 192.168.x.x subnet if available)
@@ -214,6 +217,25 @@ done
 echo "Prefill node IPs: ${PREFILL_ARGS}"
 echo "Decode  node IPs: ${DECODE_ARGS}"
 
+# =============================================================================
+# Per-node measured-power monitor (best-effort)
+# =============================================================================
+# vLLM places one worker per node: ranks [0, xP) are prefill (kv_producer),
+# ranks [xP, xP+yD) are decode (kv_consumer) — see the role branches below.
+# Node 0 is the proxy too, but its GPUs run the first prefill worker, so it is
+# correctly labeled prefill. The monitor runs for the whole server lifetime;
+# aggregate_power.py windows the samples down to each concurrency's load window.
+if [ "$NODE_RANK" -lt "$xP" ]; then
+    PERF_ROLE="prefill"
+    PERF_WORKER_IDX=$NODE_RANK
+else
+    PERF_ROLE="decode"
+    PERF_WORKER_IDX=$(( NODE_RANK - xP ))
+fi
+if [[ "$DRY_RUN" -ne 1 ]]; then
+    start_perf_monitor "$PERF_ROLE" "$PERF_WORKER_IDX"
+fi
+
 # MoRI-IO proxy ZMQ registration port (must match vllm-router --vllm-discovery-address)
 PROXY_PING_PORT="${PROXY_PING_PORT:-36367}"
 
@@ -408,6 +430,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
 
     if [[ "${EVAL_FAILED:-0}" -eq 1 ]]; then
         echo "ERROR: eval failed; exiting node-0 with rc=1"
+        stop_gpu_monitor
         exit 1
     fi
 
@@ -523,5 +546,8 @@ fi
 # kill $etcd_pid 2>/dev/null || true
 # pkill -f etcd 2>/dev/null || true
 
+# Stop the per-node power monitor and flush its CSV before the container exits.
+stop_gpu_monitor
+
 echo "Script completed successfully"
 exit 0
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3193,6 +3193,27 @@
     - "Add multinode launch script glm5_fp8_mi355x_sglang-disagg.sh; server.sh sources setup_deps.sh"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1572
 
+- config-keys:
+    - dsv4-fp4-gb300-dynamo-sglang
+  description:
+    - "Smoke run validating multinode measured-power aggregation (PR #1574). No config change; entry exists to trigger a sweep that produces the first multinode agg JSON with avg_power_w + joules_per_*_token populated from per-node srt-slurm perfmon CSVs. Validates per-source GPU-id namespacing in aggregate_power.py (without it, 14 nodes × 4 GPUs would report num_gpus=4 instead of 56) and the GPU_METRICS_CSV_GLOB env var bridge in process_result.py. Only the gb300-cw runner has the perfmon launcher changes; any gb300-nv runs in the sweep will succeed normally without power fields, which the dashboard handles gracefully (chart gates on field presence)."
+    - "Re-run after launcher recurse-glob fix (6da2f1b6) — prior sweep (#26548110246) completed green at the workflow level but produced 0 measured-power rows because the flat *.yaml glob in the monitoring-injection loop matched zero recipes (recipes live in 8k1k/ subdir). Fix uses `find -type f -name '*.yaml'`. Also re-pointed SemiAnalysisAI/srt-slurm@feat/inferencex-perfmon onto current NVIDIA/srt-slurm main so the launcher's `default_bash_preamble:` srtslurm.yaml field is accepted by srtctl schema."
+    - "Re-run after per-worker aggregation (24f46ffe) — validates new agg JSON fields: workers[] with role labels (prefill/decode/agg/frontend) parsed from srt-slurm perfmon CSV filenames, plus per-stage scalars (prefill_avg_power_w, decode_avg_power_w, joules_per_input_token = prefill_energy / input_tokens, joules_per_output_token_decode = decode_energy / output_tokens). joules_per_output_token and joules_per_total_token stay cluster-wide on all topologies so the metric is comparable across single-node, multinode-agg, and multinode-disagg. Per-stage scalars emitted only for disagg runs with both prefill and decode workers present. workers[] entries also carry per-worker avg_temp_c/peak_temp_c/avg_util_pct/avg_mem_used_mb when the CSV exposes those columns."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574
+
+- config-keys:
+    - qwen3.5-fp8-mi355x-sglang-disagg
+    - glm5-fp8-mi355x-sglang-disagg
+    - dsr1-fp8-mi355x-sglang-disagg
+    - dsr1-fp4-mi355x-sglang-disagg
+    - kimik2.5-fp4-mi355x-vllm-disagg
+    - minimaxm2.5-fp8-mi355x-vllm-disagg
+  description:
+    - "Smoke run validating AMD multinode measured-power aggregation — the AMD analogue of the NVIDIA gb300/srt-slurm path (PR #1574). No config change; entry exists to trigger a sweep that produces the first AMD multinode agg JSONs with avg_power_w + joules_per_*_token + per-worker workers[] populated from per-node amd-smi perfmon CSVs."
+    - "The AMD amd_utils SLURM job has no orchestrator perfmon, so each SGLang/vLLM disagg node starts its own amd-smi monitor via start_perf_monitor (benchmarks/benchmark_lib.sh), writing perf_samples_<role>_w<idx>_<host>.csv into the NFS-shared /benchmark_logs/perfmon mount (wired in amd_utils/job.slurm). launch_mi355x-amds.sh collects the per-node CSVs into the GH workspace before the EXIT trap wipes the logs dir; it also sets GPU_METRICS_CSV_GLOB so the existing Process-result step runs the same vendor-agnostic utils/aggregate_power.py used for NVIDIA: per-source GPU-id namespacing (8 GPUs/node on MI355X, so a TP16 worker over 2 nodes counts 16 GPUs not 8), per-stage prefill/decode energy attribution, and per-worker temp/util/mem when amd-smi exposes those columns."
+    - "Covers both engine paths: SGLang disagg (server_sglang.sh role = NODE_RANK bucketed by PREFILL_NODES_PER_WORKER / NODE_OFFSET) and vLLM disagg (server_vllm.sh one worker per node, ranks [0,xP) prefill / [xP,xP+yD) decode). Monitoring is best-effort end-to-end — a missing amd-smi or empty CSV skips power patching without failing the benchmark upload; DISAGG=true threads through to per-stage attribution while agg/non-disagg runs still get cluster-wide power."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1574
+
 - config-keys:
     - qwen3.5-fp4-mi355x-sglang-disagg
   description: