From bd405c142bc9d1c7d8eb4877e8a82ff35bc60852 Mon Sep 17 00:00:00 2001 From: Aryan Date: Fri, 29 May 2026 15:27:51 -0700 Subject: [PATCH] fix(process_result): fail loudly on zero-throughput runs instead of ZeroDivisionError A failed/degenerate benchmark (disagg warmup deadlock, MoRI transport failure, server never reaching ready) still writes a result JSON, but with zeroed throughput/latency. The tpot reciprocal `1000.0 / float(value)` then raises an opaque "ZeroDivisionError: float division by zero", masking the real server-side cause and making every such failure look identical. This is the dominant red signature on mi355x-disagg sweeps (e.g. #1584's DEP8/MTP jobs). Guard up front: if output/total throughput is 0, exit with a clear, actionable message pointing at the multinode_server_logs artifact. Co-Authored-By: Claude Opus 4.8 --- utils/process_result.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/utils/process_result.py b/utils/process_result.py index 5fb059473..8574164c3 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -42,6 +42,23 @@ def get_required_env_vars(required_vars): with open(f'{result_filename}.json') as f: bmk_result = json.load(f) +# A failed/degenerate benchmark (server never came up, disagg warmup deadlock, +# MoRI/KV-transport failure, etc.) still writes a result JSON, but with zeroed +# throughput and latency metrics. Detect that here and fail with the real, +# actionable reason. Otherwise the tpot reciprocal below (1000.0 / tpot) raises +# an opaque "ZeroDivisionError: float division by zero" that masks the true +# server-side cause and makes every such failure look identical. 
+_output_tput = float(bmk_result.get('output_throughput', 0) or 0)
+_total_tput = float(bmk_result.get('total_token_throughput', 0) or 0)
+if not (_output_tput > 0 and _total_tput > 0):  # also rejects NaN, which `<= 0` lets through
+    raise SystemExit(
+        "FAIL: benchmark produced no decode throughput "
+        f"(output_throughput={_output_tput}, total_token_throughput={_total_tput}) "
+        f"in {result_filename}.json — the server almost certainly failed to serve "
+        "(disagg warmup deadlock / MoRI transport failure / server never reached "
+        "ready). Check the multinode_server_logs artifact for the real error."
+    )
+
 data = {
     'hw': hw,
     'conc': int(bmk_result['max_concurrency']),