@@ -195,11 +195,20 @@ int main() {
195195 return 0 ;
196196 }
197197
198- // Project convention: 11 outer rounds, 1M inner iterations
199- // (TASK-058 acceptance criterion). serialize_allow_405 is more
200- // expensive than the other measurements, so 100K inner is enough
201- // to keep the wall time bounded.
202- constexpr std::size_t OUTER = 11 ;
198+ // OUTER=51 matches bench_hook_overhead and bench_route_lookup: with 51
199+ // rounds the median is the 26th sorted value (exact midpoint), so 1-2
200+ // high outlier rounds from OS scheduling noise cannot shift the median.
201+ // With OUTER=11 (the old value) the median was the 6th value and a
202+ // single outlier round could flip a <10 ns gate; on a shared CI runner
203+ // this produced spurious failures independent of real regressions.
204+ // Wall-clock cost: 51 rounds * 1M inner at ~12 ns/call ≈ 612 ms total,
205+ // acceptable for an out-of-band `make bench` run. (test-quality-reviewer-iter1-1)
206+ //
207+ // INNER_405=100K: serialize_allow_405 is more expensive than the other
208+ // measurements (~40 ns), so 100K inner (4 ms of signal per outer round)
209+ // keeps wall time bounded. The 51-outer median is still stable at this
210+ // inner count because we care about median ns/call, not absolute signal.
211+ constexpr std::size_t OUTER = 51 ;
203212 constexpr std::size_t INNER = 1'000'000 ;
204213 constexpr std::size_t INNER_405 = 100'000 ;
205214
@@ -306,14 +315,24 @@ int main() {
306315 // allocations stay inside the arena and zero global-heap allocation
307316 // occurs during the measured window.
308317 //
309- // Prior design flaw: the old loop accumulated all 1M values under a
310- // single key without resetting the arena. The arena was exhausted
311- // after ~819 iterations (~65536 / 80 bytes per pmr::string), so
312- // every subsequent call spilled to the upstream heap -- measuring
318+ // Prior design flaw (TASK-083) : the old loop accumulated all 1M values
319+ // under a single key without resetting the arena. The arena was
320+ // exhausted after ~819 iterations (~65536 / 80 bytes per pmr::string),
321+ // so every subsequent call spilled to the upstream heap -- measuring
313322 // exactly the allocation overhead the task was supposed to eliminate.
314- // The new design (fresh impl per call) removes this flaw and makes
315- // the bench an honest proof of the zero-heap-alloc guarantee.
316- // (performance-reviewer-iter1-1) -----
323+ // The new design (fresh monotonic_buffer_resource per call) removes
324+ // this flaw and makes the bench an honest proof of the zero-heap-alloc
325+ // guarantee.
326+ //
327+ // buf is declared OUTSIDE the lambda (performance-reviewer-iter1-2):
328+ // zero-initialising a 65536-byte stack array inside the inner loop
329+ // would touch 64 KiB of cache traffic per iteration (64 GB/round),
330+ // swamping the measured pmr/string cost. Instead, buf is zeroed ONCE
331+ // before the timed loop and the lambda constructs a fresh
332+ // monotonic_buffer_resource from the same backing storage on every
333+ // call (cheap: just sets two internal pointers), which resets the
334+ // bump-pointer without re-zeroing the memory. The arena writes over
335+ // stale bytes from the previous call, which is correct. -----
317336 {
318337 using httpserver::detail::http_request_impl;
319338 using httpserver::detail::arguments_accumulator;
@@ -325,17 +344,20 @@ int main() {
325344 static const char * kValue =
326345 " a%2Fbcdefghijklmnopqrstuvwxyz_padding_to_force_heap" ;
327346
347+ // Backing buffer zeroed once; reused across all inner iterations
348+ // by constructing a fresh monotonic_buffer_resource each call.
349+ alignas (std::max_align_t ) std::array<std::byte, 65536 > buf5{};
350+ do_not_optimize (buf5); // prevent compiler from eliding the buffer
351+
328352 std::printf (" bench_warm_path (5): build_request_args "
329353 " (%%2F unescape via arena, fresh impl per call)\n " );
330354 med_build_args_pct2f = measure_median_ns (
331355 " build_request_args_pct2f" ,
332356 [&]() {
333- // Each call: fresh arena-backed impl, one insert, destroy.
334- // This mirrors the production per-request lifecycle and
335- // keeps the arena within capacity on every call.
336- alignas (std::max_align_t ) std::array<std::byte, 65536 > buf{};
357+ // Fresh monotonic_buffer_resource resets the bump pointer
358+ // to buf5.data() without touching the bytes -- cheap.
337359 std::pmr::monotonic_buffer_resource arena (
338- buf .data (), buf .size (), std::pmr::new_delete_resource ());
360+ buf5 .data (), buf5 .size (), std::pmr::null_memory_resource ());
339361 impl_alloc_t alloc (&arena);
340362 auto * p = alloc.new_object <http_request_impl>(
341363 nullptr , nullptr , alloc);
@@ -356,7 +378,8 @@ int main() {
356378 // Same fresh-impl-per-call structure as (5) so the timings are
357379 // directly comparable. The median for (5) should land within noise
358380 // of this baseline once TASK-072 lands.
359- // (performance-reviewer-iter1-1) -----
381+ // buf declared outside the lambda for the same reason as (5) above
382+ // (performance-reviewer-iter1-2). -----
360383 {
361384 using httpserver::detail::http_request_impl;
362385 using httpserver::detail::arguments_accumulator;
@@ -366,14 +389,16 @@ int main() {
366389 static const char * kValue =
367390 " abcdefghijklmnopqrstuvwxyz_no_escape_baseline_padding" ;
368391
392+ alignas (std::max_align_t ) std::array<std::byte, 65536 > buf6{};
393+ do_not_optimize (buf6);
394+
369395 std::printf (" bench_warm_path (6): build_request_args "
370396 " (no-escape baseline, fresh impl per call)\n " );
371397 med_build_args_plain = measure_median_ns (
372398 " build_request_args_plain" ,
373399 [&]() {
374- alignas (std::max_align_t ) std::array<std::byte, 65536 > buf{};
375400 std::pmr::monotonic_buffer_resource arena (
376- buf .data (), buf .size (), std::pmr::new_delete_resource ());
401+ buf6 .data (), buf6 .size (), std::pmr::null_memory_resource ());
377402 impl_alloc_t alloc (&arena);
378403 auto * p = alloc.new_object <http_request_impl>(
379404 nullptr , nullptr , alloc);
@@ -394,23 +419,39 @@ int main() {
394419 // Each median is compared against its committed per-platform baseline
395420 // (bench_baseline.hpp). A median more than kAllowedRegressionRatio
396421 // (5%) over baseline fails the bench.
422+ //
423+ // Noise floor: for sub-10 ns measurements (e.g. WARM_SHOULD_SKIP_AUTH_EMPTY_NS
424+ // baseline = 2 ns) a pure 5% ratio produces an allowed window of only 0.1 ns,
425+ // which is below steady_clock resolution on many platforms. We add an absolute
426+ // additive floor so the gate ceiling is at least baseline + kAbsoluteNoiseFloorNs
427+ // regardless of the ratio, matching the pattern used in bench_hook_overhead.
428+ // (test-quality-reviewer-iter1-2, performance-reviewer-iter1-3)
429+ constexpr double kAbsoluteNoiseFloorNs = 5.0 ;
430+
397431 namespace bb = httpserver::bench_baseline;
398432 std::printf (" \n bench_warm_path summary (baselines from "
399- " bench_baseline.hpp, +%.0f%% allowed):\n " ,
400- 100.0 * (bb::kAllowedRegressionRatio - 1.0 ));
433+ " bench_baseline.hpp, +%.0f%% or +%.0f ns floor allowed):\n " ,
434+ 100.0 * (bb::kAllowedRegressionRatio - 1.0 ),
435+ kAbsoluteNoiseFloorNs );
401436
402437 int rc = 0 ;
403438 const auto check = [&](const char * label, double measured,
404439 double baseline) {
405- const double allowed = baseline * bb::kAllowedRegressionRatio ;
440+ // Use the larger of the ratio-based ceiling and the absolute noise
441+ // floor so sub-ns timer quantization cannot false-trip the gate.
442+ const double ratio_ceiling = baseline * bb::kAllowedRegressionRatio ;
443+ const double floor_ceiling = baseline + kAbsoluteNoiseFloorNs ;
444+ const double allowed = std::max (ratio_ceiling, floor_ceiling);
406445 const double pct = 100.0 * (measured / baseline - 1.0 );
407446 std::printf (" %-26s median=%8.3f ns baseline=%8.3f ns %+6.1f%%\n " ,
408447 label, measured, baseline, pct);
409448 if (measured > allowed) {
410- std::printf (" FAIL: %s median %.3f ns exceeds baseline*%.2f = "
411- " %.3f ns (regression %+.1f%%)\n " ,
412- label, measured, bb::kAllowedRegressionRatio ,
413- allowed, pct);
449+ std::printf (" FAIL: %s median %.3f ns exceeds gate ceiling %.3f ns "
450+ " (baseline*%.2f=%.3f, baseline+%.0fns=%.3f, "
451+ " regression %+.1f%%)\n " ,
452+ label, measured, allowed,
453+ bb::kAllowedRegressionRatio , ratio_ceiling,
454+ kAbsoluteNoiseFloorNs , floor_ceiling, pct);
414455 rc = 1 ;
415456 }
416457 };
@@ -427,8 +468,10 @@ int main() {
427468 bb::WARM_BUILD_REQUEST_ARGS_PLAIN_NS );
428469
429470 if (rc == 0 ) {
430- std::printf (" PASS: all warm-path medians within %.0f%% of baseline\n " ,
431- 100.0 * (bb::kAllowedRegressionRatio - 1.0 ));
471+ std::printf (" PASS: all warm-path medians within gate "
472+ " (+%.0f%% or +%.0fns floor over baseline)\n " ,
473+ 100.0 * (bb::kAllowedRegressionRatio - 1.0 ),
474+ kAbsoluteNoiseFloorNs );
432475 }
433476 return rc;
434477}
0 commit comments