diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 9b6cb96a9..52a835599 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -67,6 +67,31 @@ if [[ "$IS_MULTINODE" == "true" ]]; then echo "=== Slurm job stderr ===" tail -100 "$err_file" echo "========================" + # Surface the real failure class in the Actions UI. Without this, a + # launch failure shows only the generic "No benchmark result files + # found" from benchmark-multinode-tmpl.yml. First match wins, so order + # matters: model-not-found (#1581) is tested before every flake + # pattern. NOTE(review): the fp8_blockwise config-error branch sits + # after the readiness-timeout branch, so a log with both reads as flake. + if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + local sig="" + if grep -qiE "Model '.*' not found|FATAL: Model|model .* not found" "$err_file"; then + sig="recipe-error: model not found (deterministic - check MODEL/MODEL_PATH, not MoRI)" + elif grep -qiE "ReadTimeout|readiness.*timeout|warmup.*time(d)? 
?out|health.*timeout" "$err_file"; then + sig="transport-flake: readiness/warmup timeout (MoRI pd-disagg)" + elif grep -qiE "Fp8BlockwiseQuant.*IntraNode|dispatch_combine|combine.*IntraNode" "$err_file"; then + sig="config-error: MoRI fp8_blockwise combine needs IntraNode (disable TBO/SDMA on FP4 prefill, #1584)" + elif grep -qiE "MoRI|mori_conn|pd[- ]?disagg" "$err_file"; then + sig="transport-flake: MoRI KV-transport error" + elif grep -qiE "segfault|Segmentation fault|signal 11|core dumped|gpucore" "$err_file"; then + sig="transport-flake: server segfault / core dump" + fi + if [[ -n "$sig" ]]; then + echo "::error title=AMD disagg job ${JOB_ID:-unknown} failed::${sig} (see slurm .err artifact)" + else + echo "::error title=AMD disagg job ${JOB_ID:-unknown} failed::Unclassified failure - see last 100 lines of slurm .err above" + fi + fi fi sudo rm -rf "$BENCHMARK_LOGS_DIR" 2>/dev/null || true } diff --git a/utils/bench_serving/benchmark_serving.py b/utils/bench_serving/benchmark_serving.py index 741e44236..32c06e078 100644 --- a/utils/bench_serving/benchmark_serving.py +++ b/utils/bench_serving/benchmark_serving.py @@ -879,6 +879,21 @@ def main(args: argparse.Namespace): lora_modules=args.lora_modules, )) + # Gate the run BEFORE writing any result file. A run over the failure + # threshold (zero completions included) must not leave a schema-valid + # JSON on disk: downstream collectors (launch_mi355x-amds.sh, + # benchmark-multinode-tmpl.yml) treat file *existence* as success. + # NOTE(review): args.num_prompts == 0 divides by zero below - confirm guarded. 
+ max_failure_rate = 0.05 + completed = benchmark_result["completed"] + failure_rate = 1 - completed / args.num_prompts + if failure_rate > max_failure_rate: + raise SystemExit( + f"FAIL: request failure rate {failure_rate:.1%} exceeds " + f"{max_failure_rate:.0%} threshold " + f"({completed}/{args.num_prompts} completed)" + ) + # Save config and results to json if args.save_result: result_json: Dict[str, Any] = {} @@ -940,16 +955,6 @@ def main(args: argparse.Namespace): json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) - max_failure_rate = 0.05 - completed = benchmark_result["completed"] - failure_rate = 1 - completed / args.num_prompts - if failure_rate > max_failure_rate: - raise SystemExit( - f"FAIL: request failure rate {failure_rate:.1%} exceeds " - f"{max_failure_rate:.0%} threshold " - f"({completed}/{args.num_prompts} completed)" - ) - if __name__ == "__main__": parser = FlexibleArgumentParser(