From 992b90f93782403bab81cab66e071cab4be4a741 Mon Sep 17 00:00:00 2001
From: Aryan <aryaman@semianalysis.com>
Date: Fri, 29 May 2026 15:59:01 -0700
Subject: [PATCH] ci(disagg): fail before writing result file + surface real
 failure class

Two independent disagg-CI hardening fixes (both verified drop-in):

A) benchmark_serving.py: move the request-failure-rate gate ABOVE the
   `if args.save_result:` block. Previously the result JSON was written
   (json.dump + pytorch sidecar) and THEN the gate raised SystemExit, so a
   zero-/sub-threshold run left a schema-valid JSON on disk. Downstream
   collectors (launch_mi355x-amds.sh, benchmark-multinode-tmpl.yml) key on
   file EXISTENCE, not exit code, so a written-then-failed file looks
   successful. Gating first keeps disk state consistent with the exit code.
   Complements PR #1590's process_result zero-guard (that one is downstream;
   this stops the misleading file at the source).

B) launch_mi355x-amds.sh cleanup_and_save_logs: after tailing the slurm .err,
   classify it and emit a single GitHub ::error:: annotation naming the
   failure class (recipe model-not-found / readiness-timeout / fp8_blockwise
   IntraNode combine / MoRI transport / segfault), with a generic fallback.
   Without this the Actions UI shows only "No benchmark result files found".
   Deterministic recipe errors are checked first so a config bug is never
   mislabeled as a transport flake. Guarded on GITHUB_ACTIONS.

Held for a follow-up: bounded flake-only retry + node-quarantine in
benchmark-multinode-tmpl.yml (needs the submit.sh node-allocation path traced
to thread a --exclude, and offline classifier validation against canned
.err fixtures).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 runners/launch_mi355x-amds.sh            | 25 ++++++++++++++++++++++++
 utils/bench_serving/benchmark_serving.py | 25 ++++++++++++++----------
 2 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 9b6cb96a9..52a835599 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -67,6 +67,31 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
             echo "=== Slurm job stderr ==="
             tail -100 "$err_file"
             echo "========================"
+            # Surface the real failure class in the Actions UI. Without this, a
+            # launch failure shows only the generic "No benchmark result files
+            # found" from benchmark-multinode-tmpl.yml. Order matters: check the
+            # deterministic recipe error (model-not-found, #1581) before the
+            # transport-flake patterns (#1584 MoRI/readiness) so a config bug is
+            # never mislabeled as a flake.
+            if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
+                local sig=""
+                if   grep -qiE "Model '.*' not found|FATAL: Model|model .* not found" "$err_file"; then
+                    sig="recipe-error: model not found (deterministic - check MODEL/MODEL_PATH, not MoRI)"
+                elif grep -qiE "ReadTimeout|readiness.*timeout|warmup.*time(d)? ?out|health.*timeout" "$err_file"; then
+                    sig="transport-flake: readiness/warmup timeout (MoRI pd-disagg)"
+                elif grep -qiE "Fp8BlockwiseQuant.*IntraNode|dispatch_combine|combine.*IntraNode" "$err_file"; then
+                    sig="config-error: MoRI fp8_blockwise combine needs IntraNode (disable TBO/SDMA on FP4 prefill, #1584)"
+                elif grep -qiE "MoRI|mori_conn|pd[- ]?disagg" "$err_file"; then
+                    sig="transport-flake: MoRI KV-transport error"
+                elif grep -qiE "segfault|Segmentation fault|signal 11|core dumped|gpucore" "$err_file"; then
+                    sig="transport-flake: server segfault / core dump"
+                fi
+                if [[ -n "$sig" ]]; then
+                    echo "::error title=AMD disagg job ${JOB_ID:-unknown} failed::${sig} (see slurm .err artifact)"
+                else
+                    echo "::error title=AMD disagg job ${JOB_ID:-unknown} failed::Unclassified failure - see last 100 lines of slurm .err above"
+                fi
+            fi
         fi
         sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true
     }
diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py
index 741e44236..32c06e078 100644
--- a/utils/bench_serving/benchmark_serving.py
+++ b/utils/bench_serving/benchmark_serving.py
@@ -879,6 +879,21 @@ def main(args: argparse.Namespace):
             lora_modules=args.lora_modules,
         ))
 
+    # Gate the run BEFORE writing any result file. A sub-threshold (or
+    # zero-completion) run must not leave a schema-valid JSON on disk:
+    # downstream collectors (launch_mi355x-amds.sh, benchmark-multinode-tmpl.yml)
+    # treat file *existence* as success, so a written-then-failed file looks
+    # successful. Raising here keeps disk state consistent with the exit code.
+    max_failure_rate = 0.05
+    completed = benchmark_result["completed"]
+    failure_rate = 1 - completed / args.num_prompts
+    if failure_rate > max_failure_rate:
+        raise SystemExit(
+            f"FAIL: request failure rate {failure_rate:.1%} exceeds "
+            f"{max_failure_rate:.0%} threshold "
+            f"({completed}/{args.num_prompts} completed)"
+        )
+
     # Save config and results to json
     if args.save_result:
         result_json: Dict[str, Any] = {}
@@ -940,16 +955,6 @@ def main(args: argparse.Namespace):
             json.dump(result_json, outfile)
         save_to_pytorch_benchmark_format(args, result_json, file_name)
 
-    max_failure_rate = 0.05
-    completed = benchmark_result["completed"]
-    failure_rate = 1 - completed / args.num_prompts
-    if failure_rate > max_failure_rate:
-        raise SystemExit(
-            f"FAIL: request failure rate {failure_rate:.1%} exceeds "
-            f"{max_failure_rate:.0%} threshold "
-            f"({completed}/{args.num_prompts} completed)"
-        )
-
 
 if __name__ == "__main__":
     parser = FlexibleArgumentParser(