From db8c8d39ee03efd41e367acc68027ef17ba00361 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Fri, 13 Mar 2026 14:28:05 -0400 Subject: [PATCH 1/4] more succinct benchmark summaries Signed-off-by: Will Manning --- scripts/compare-benchmark-jsons.py | 66 +++++++++++------------------- 1 file changed, 25 insertions(+), 41 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 0c12fe77acb..01e06438692 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -536,11 +536,29 @@ def main() -> None: else format_performance(geo_mean_ratio, improvement_threshold, regression_threshold, "overall") ) - summary_lines = [ - "## Summary", - "", - f"- **Overall**: {overall_performance}", - ] + statistical_analysis = build_statistical_analysis(df3, threshold_pct) + verdict = build_verdict(statistical_analysis) if statistical_analysis is not None else None + + heading = f"## Benchmarks: {benchmark_name}" if benchmark_name else "## Benchmarks" + summary_lines = [heading, ""] + + if verdict is not None: + summary_lines.append(f"**Summary**: {verdict['status']}
") + summary_lines.append(f"**Confidence**: {verdict['confidence']}
") + summary_lines.append(f"**Attributed Vortex impact**: {verdict['impact']}
") + summary_lines.append(f"**Environment shift**: {verdict['environment_shift']}
") + + if statistical_analysis is not None: + residual_noise = format_ratio_change(statistical_analysis["residual_noise_ratio"]) + summary_lines.append(f"**Residual noise**: {residual_noise}
") + + polish = statistical_analysis["median_polish"] + if polish is not None: + summary_lines.append( + f"**Median polish overall**: {format_ratio_change(float(np.exp(polish.overall)))}
" + ) + + summary_lines.append(f"**Overall**: {overall_performance}
") if len(vortex_df) > 0: vortex_performance = format_performance( vortex_geo_mean_ratio, @@ -548,7 +566,7 @@ def main() -> None: regression_threshold, "vortex", ) - summary_lines.append(f"- **Vortex**: {vortex_performance}") + summary_lines.append(f"**Vortex**: {vortex_performance}
") if len(parquet_df) > 0: parquet_performance = format_performance( parquet_geo_mean_ratio, @@ -556,41 +574,7 @@ def main() -> None: regression_threshold, "parquet", ) - summary_lines.append(f"- **Parquet**: {parquet_performance}") - - statistical_analysis = build_statistical_analysis(df3, threshold_pct) - verdict = build_verdict(statistical_analysis) if statistical_analysis is not None else None - if verdict is not None: - summary_lines.extend( - [ - "", - "## Verdict", - "", - f"**{verdict['status']}**", - f"- **Attributed Vortex impact**: {verdict['impact']}", - f"- **Confidence**: {verdict['confidence']}", - f"- **Environment shift**: {verdict['environment_shift']}", - ] - ) - - if statistical_analysis is not None: - systemic_shift = format_ratio_change(statistical_analysis["systemic_shift_ratio"]) - control_sigma = format_ratio_change(float(np.exp(statistical_analysis["systemic_shift_std"]))) - residual_noise = format_ratio_change(statistical_analysis["residual_noise_ratio"]) - summary_lines.extend( - [ - "", - "## Statistical Summary", - "", - f"- **Systemic shift ({CONTROL_FORMAT} controls)**: {systemic_shift}", - f"- **Control sigma**: {control_sigma}", - f"- **Residual noise**: {residual_noise}", - ] - ) - - polish = statistical_analysis["median_polish"] - if polish is not None: - summary_lines.append(f"- **Median polish overall**: {format_ratio_change(float(np.exp(polish.overall)))}") + summary_lines.append(f"**Parquet**: {parquet_performance}") print("\n".join(summary_lines)) print("") From 8095fdd92a75cba62ad6d32cc38a610b1a7989b2 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Fri, 13 Mar 2026 14:46:02 -0400 Subject: [PATCH 2/4] less vertical Signed-off-by: Will Manning --- scripts/compare-benchmark-jsons.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 01e06438692..ddb4f3eefd0 100644 --- a/scripts/compare-benchmark-jsons.py 
+++ b/scripts/compare-benchmark-jsons.py @@ -539,26 +539,25 @@ def main() -> None: statistical_analysis = build_statistical_analysis(df3, threshold_pct) verdict = build_verdict(statistical_analysis) if statistical_analysis is not None else None - heading = f"## Benchmarks: {benchmark_name}" if benchmark_name else "## Benchmarks" - summary_lines = [heading, ""] + summary_fields: list[str] = [] if verdict is not None: - summary_lines.append(f"**Summary**: {verdict['status']}
") - summary_lines.append(f"**Confidence**: {verdict['confidence']}
") - summary_lines.append(f"**Attributed Vortex impact**: {verdict['impact']}
") - summary_lines.append(f"**Environment shift**: {verdict['environment_shift']}
") + summary_fields.append(f"**Summary**: {verdict['status']}") + summary_fields.append(f"**Confidence**: {verdict['confidence']}") + summary_fields.append(f"**Attributed Vortex impact**: {verdict['impact']}") + summary_fields.append(f"**Environment shift**: {verdict['environment_shift']}") if statistical_analysis is not None: residual_noise = format_ratio_change(statistical_analysis["residual_noise_ratio"]) - summary_lines.append(f"**Residual noise**: {residual_noise}
") + summary_fields.append(f"**Residual noise**: {residual_noise}") polish = statistical_analysis["median_polish"] if polish is not None: - summary_lines.append( - f"**Median polish overall**: {format_ratio_change(float(np.exp(polish.overall)))}
" + summary_fields.append( + f"**Median polish overall**: {format_ratio_change(float(np.exp(polish.overall)))}" ) - summary_lines.append(f"**Overall**: {overall_performance}
") + summary_fields.append(f"**Overall**: {overall_performance}") if len(vortex_df) > 0: vortex_performance = format_performance( vortex_geo_mean_ratio, @@ -566,7 +565,7 @@ def main() -> None: regression_threshold, "vortex", ) - summary_lines.append(f"**Vortex**: {vortex_performance}
") + summary_fields.append(f"**Vortex**: {vortex_performance}") if len(parquet_df) > 0: parquet_performance = format_performance( parquet_geo_mean_ratio, @@ -574,9 +573,11 @@ def main() -> None: regression_threshold, "parquet", ) - summary_lines.append(f"**Parquet**: {parquet_performance}") + summary_fields.append(f"**Parquet**: {parquet_performance}") - print("\n".join(summary_lines)) + print("
".join(summary_fields)) + print("") + print("---") print("") if statistical_analysis is not None: From 8a7b9d1726ecc1d1c18a4d4fcfb03936cc45bf53 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Fri, 13 Mar 2026 15:04:12 -0400 Subject: [PATCH 3/4] moar Signed-off-by: Will Manning --- scripts/compare-benchmark-jsons.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index ddb4f3eefd0..0ed4d03fb68 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -542,20 +542,9 @@ def main() -> None: summary_fields: list[str] = [] if verdict is not None: - summary_fields.append(f"**Summary**: {verdict['status']}") - summary_fields.append(f"**Confidence**: {verdict['confidence']}") + summary_fields.append(f"**Verdict**: {verdict['status']}") summary_fields.append(f"**Attributed Vortex impact**: {verdict['impact']}") - summary_fields.append(f"**Environment shift**: {verdict['environment_shift']}") - - if statistical_analysis is not None: - residual_noise = format_ratio_change(statistical_analysis["residual_noise_ratio"]) - summary_fields.append(f"**Residual noise**: {residual_noise}") - - polish = statistical_analysis["median_polish"] - if polish is not None: - summary_fields.append( - f"**Median polish overall**: {format_ratio_change(float(np.exp(polish.overall)))}" - ) + summary_fields.append(f"**Confidence**: {verdict['confidence']}") summary_fields.append(f"**Overall**: {overall_performance}") if len(vortex_df) > 0: @@ -575,6 +564,16 @@ def main() -> None: ) summary_fields.append(f"**Parquet**: {parquet_performance}") + if verdict is not None: + summary_fields.append(f"**Parquet shift (control)**: {verdict['environment_shift']}") + + if statistical_analysis is not None: + polish = statistical_analysis["median_polish"] + if polish is not None: + summary_fields.append( + f"**Overall shift (robust)**: 
{format_ratio_change(float(np.exp(polish.overall)))}" + ) + print("
".join(summary_fields)) print("") print("---") From 0c882e63c37f47745b37514ad741137ab43c4e65 Mon Sep 17 00:00:00 2001 From: Will Manning Date: Fri, 13 Mar 2026 15:47:51 -0400 Subject: [PATCH 4/4] moar Signed-off-by: Will Manning --- scripts/compare-benchmark-jsons.py | 41 +++++------------------------- 1 file changed, 7 insertions(+), 34 deletions(-) diff --git a/scripts/compare-benchmark-jsons.py b/scripts/compare-benchmark-jsons.py index 0ed4d03fb68..44514053fad 100644 --- a/scripts/compare-benchmark-jsons.py +++ b/scripts/compare-benchmark-jsons.py @@ -143,19 +143,6 @@ def ratio_stats( } -def robust_scale(values: pd.Series | np.ndarray) -> float: - """Estimate spread with MAD so outliers do not dominate the noise estimate.""" - - array = np.asarray(values, dtype=float) - array = array[np.isfinite(array)] - if array.size == 0: - return float("nan") - - median = np.median(array) - mad = np.median(np.abs(array - median)) - return float(1.4826 * mad) - - def median_polish(table: pd.DataFrame, max_iterations: int = 10, tolerance: float = 1e-8) -> MedianPolishResult | None: """Estimate row and column effects for the log-ratio matrix.""" @@ -309,10 +296,9 @@ def build_statistical_analysis(df: pd.DataFrame, threshold_pct: int) -> dict[str axis=1, ) - # Median polish gives a robust overall shift plus residual-noise estimate. + # Median polish gives a robust overall shift estimate. 
log_ratio_table = detail_df.pivot(index="query", columns="combo", values="log_ratio") polish = median_polish(log_ratio_table) - residual_noise_log_scale = robust_scale(polish.residuals.to_numpy().ravel()) if polish is not None else float("nan") return { "detail_df": detail_df, @@ -320,9 +306,6 @@ def build_statistical_analysis(df: pd.DataFrame, threshold_pct: int) -> dict[str "systemic_shift_ratio": float(np.exp(systemic_shift_log_ratio)), "systemic_shift_std": systemic_shift_std, "median_polish": polish, - "residual_noise_ratio": float(np.exp(residual_noise_log_scale)) - if np.isfinite(residual_noise_log_scale) - else float("nan"), } @@ -527,14 +510,8 @@ def main() -> None: vortex_df = df3[df3["name"].str.contains("vortex", case=False, na=False)] parquet_df = df3[df3["name"].str.contains("parquet", case=False, na=False)] - geo_mean_ratio = calculate_geo_mean(df3) vortex_geo_mean_ratio = calculate_geo_mean(vortex_df) parquet_geo_mean_ratio = calculate_geo_mean(parquet_df) - overall_performance = ( - "no data" - if pd.isna(geo_mean_ratio) - else format_performance(geo_mean_ratio, improvement_threshold, regression_threshold, "overall") - ) statistical_analysis = build_statistical_analysis(df3, threshold_pct) verdict = build_verdict(statistical_analysis) if statistical_analysis is not None else None @@ -542,11 +519,9 @@ def main() -> None: summary_fields: list[str] = [] if verdict is not None: - summary_fields.append(f"**Verdict**: {verdict['status']}") + summary_fields.append(f"**Verdict**: {verdict['status']} ({verdict['confidence']} confidence)") summary_fields.append(f"**Attributed Vortex impact**: {verdict['impact']}") - summary_fields.append(f"**Confidence**: {verdict['confidence']}") - summary_fields.append(f"**Overall**: {overall_performance}") if len(vortex_df) > 0: vortex_performance = format_performance( vortex_geo_mean_ratio, @@ -554,7 +529,7 @@ def main() -> None: regression_threshold, "vortex", ) - summary_fields.append(f"**Vortex**: 
{vortex_performance}") + summary_fields.append(f"**Vortex (geomean)**: {vortex_performance}") if len(parquet_df) > 0: parquet_performance = format_performance( parquet_geo_mean_ratio, @@ -562,17 +537,15 @@ def main() -> None: regression_threshold, "parquet", ) - summary_fields.append(f"**Parquet**: {parquet_performance}") + summary_fields.append(f"**Parquet (geomean)**: {parquet_performance}") if verdict is not None: - summary_fields.append(f"**Parquet shift (control)**: {verdict['environment_shift']}") - + shifts = f"Parquet (control) {verdict['environment_shift']}" if statistical_analysis is not None: polish = statistical_analysis["median_polish"] if polish is not None: - summary_fields.append( - f"**Overall shift (robust)**: {format_ratio_change(float(np.exp(polish.overall)))}" - ) + shifts += f" · Median polish {format_ratio_change(float(np.exp(polish.overall)))}" + summary_fields.append(f"**Shifts**: {shifts}") print("
".join(summary_fields)) print("")