From 992b90f93782403bab81cab66e071cab4be4a741 Mon Sep 17 00:00:00 2001 From: Aryan Date: Fri, 29 May 2026 15:59:01 -0700 Subject: [PATCH] ci(disagg): fail before writing result file + surface real failure class Two independent disagg-CI hardening fixes (both verified drop-in): A) benchmark_serving.py: move the request-failure-rate gate ABOVE the `if args.save_result:` block. Previously the result JSON was written (json.dump + pytorch sidecar) and THEN the gate raised SystemExit, so a zero-/sub-threshold run left a schema-valid JSON on disk. Downstream collectors (launch_mi355x-amds.sh, benchmark-multinode-tmpl.yml) key on file EXISTENCE, not exit code, so a written-then-failed file looks successful. Gating first keeps disk state consistent with the exit code. Complements PR #1590's process_result zero-guard (that one is downstream; this stops the misleading file at the source). B) launch_mi355x-amds.sh cleanup_and_save_logs: after tailing the slurm .err, classify it and emit a single GitHub ::error:: annotation naming the failure class (recipe model-not-found / readiness-timeout / fp8_blockwise IntraNode combine / MoRI transport / segfault), with a generic fallback. Without this the Actions UI shows only "No benchmark result files found". Deterministic recipe errors are checked first so a config bug is never mislabeled as a transport flake. Guarded on GITHUB_ACTIONS. Held for a follow-up: bounded flake-only retry + node-quarantine in benchmark-multinode-tmpl.yml (needs the submit.sh node-allocation path traced to thread a --exclude, and offline classifier validation against canned .err fixtures). Co-Authored-By: Claude Opus 4.8 --- runners/launch_mi355x-amds.sh | 25 ++++++++++++++++++++++++ utils/bench_serving/benchmark_serving.py | 25 ++++++++++++++---------- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 9b6cb96a9..52a835599 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -67,6 +67,31 @@ if [[ "$IS_MULTINODE" == "true" ]]; then echo "=== Slurm job stderr ===" tail -100 "$err_file" echo "========================" + # Surface the real failure class in the Actions UI. Without this, a + # launch failure shows only the generic "No benchmark result files + # found" from benchmark-multinode-tmpl.yml. Order matters: check the + # deterministic recipe error (model-not-found, #1581) before the + # transport-flake patterns (#1584 MoRI/readiness) so a config bug is + # never mislabeled as a flake. + if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + local sig="" + if grep -qiE "Model '.*' not found|FATAL: Model|model .* not found" "$err_file"; then + sig="recipe-error: model not found (deterministic - check MODEL/MODEL_PATH, not MoRI)" + elif grep -qiE "ReadTimeout|readiness.*timeout|warmup.*time(d)? ?out|health.*timeout" "$err_file"; then + sig="transport-flake: readiness/warmup timeout (MoRI pd-disagg)" + elif grep -qiE "Fp8BlockwiseQuant.*IntraNode|dispatch_combine|combine.*IntraNode" "$err_file"; then + sig="config-error: MoRI fp8_blockwise combine needs IntraNode (disable TBO/SDMA on FP4 prefill, #1584)" + elif grep -qiE "MoRI|mori_conn|pd[- ]?disagg" "$err_file"; then + sig="transport-flake: MoRI KV-transport error" + elif grep -qiE "segfault|Segmentation fault|signal 11|core dumped|gpucore" "$err_file"; then + sig="transport-flake: server segfault / core dump" + fi + if [[ -n "$sig" ]]; then + echo "::error title=AMD disagg job ${JOB_ID:-unknown} failed::${sig} (see slurm .err artifact)" + else + echo "::error title=AMD disagg job ${JOB_ID:-unknown} failed::Unclassified failure - see last 100 lines of slurm .err above" + fi + fi fi sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true } diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py index 741e44236..32c06e078 100644 --- a/utils/bench_serving/benchmark_serving.py +++ b/utils/bench_serving/benchmark_serving.py @@ -879,6 +879,21 @@ def main(args: argparse.Namespace): lora_modules=args.lora_modules, )) + # Gate the run BEFORE writing any result file. A sub-threshold (or + # zero-completion) run must not leave a schema-valid JSON on disk: + # downstream collectors (launch_mi355x-amds.sh, benchmark-multinode-tmpl.yml) + # treat file *existence* as success, so a written-then-failed file looks + # successful. Raising here keeps disk state consistent with the exit code. + max_failure_rate = 0.05 + completed = benchmark_result["completed"] + failure_rate = 1 - completed / args.num_prompts + if failure_rate > max_failure_rate: + raise SystemExit( + f"FAIL: request failure rate {failure_rate:.1%} exceeds " + f"{max_failure_rate:.0%} threshold " + f"({completed}/{args.num_prompts} completed)" + ) + # Save config and results to json if args.save_result: result_json: Dict[str, Any] = {} @@ -940,16 +955,6 @@ def main(args: argparse.Namespace): json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) - max_failure_rate = 0.05 - completed = benchmark_result["completed"] - failure_rate = 1 - completed / args.num_prompts - if failure_rate > max_failure_rate: - raise SystemExit( - f"FAIL: request failure rate {failure_rate:.1%} exceeds " - f"{max_failure_rate:.0%} threshold " - f"({completed}/{args.num_prompts} completed)" - ) - if __name__ == "__main__": parser = FlexibleArgumentParser(