From f0e6000f3f250f22997716ddcc1a968ccaff71dc Mon Sep 17 00:00:00 2001 From: Steve Gerbino Date: Fri, 29 May 2026 16:28:46 -0700 Subject: [PATCH] fix(iocp): bound wait reactor WSAPoll timeout to prevent lost-wakeup hang The auxiliary wait reactor blocked in WSAPoll(-1, infinite), relying entirely on the self-pipe wakeup. wake_self() coalesces wakes via the wake_pending_ flag and ignores send()'s return value, so a failed or lost wakeup leaves wake_pending_ stuck true: every subsequent wake is coalesced away and the reactor never re-checks pending_register_ / pending_cancel_ / stop_. A newly registered wait fd then never enters the poll set and its readiness is never detected, hanging ioc.run() forever. This surfaced as Windows coverage-build (gcc + gcov) timeouts in the local_stream_socket.iocp, native.local_stream_socket.iocp, and wait.iocp suites, whose newly enabled local-stream-on-IOCP tests exercise acceptor wait readiness through the reactor. The heavy gcov instrumentation widens the timing window; the regular (clang/msvc) CI and uninstrumented builds pass. Use a bounded 500 ms WSAPoll timeout as a safety net so a missed wakeup costs at most one poll interval of latency instead of a permanent hang. This mirrors the existing 500 ms GQCS safety timeout in win_scheduler. --- .../corosio/native/detail/iocp/win_wait_reactor.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/boost/corosio/native/detail/iocp/win_wait_reactor.hpp b/include/boost/corosio/native/detail/iocp/win_wait_reactor.hpp index 89e964cf..2495ecd9 100644 --- a/include/boost/corosio/native/detail/iocp/win_wait_reactor.hpp +++ b/include/boost/corosio/native/detail/iocp/win_wait_reactor.hpp @@ -326,10 +326,17 @@ win_wait_reactor::run() for (auto& e : registered_) pollfds.push_back({e.fd, events_for_wait(e.w), 0}); + // Bounded timeout rather than infinite: this is a safety net + // against a lost self-pipe wakeup (e.g. 
a failed or lost + // send in wake_self leaving wake_pending_ stuck true). On + // timeout the loop re-drains pending_register_/pending_cancel_ + // and re-checks stop_, so a missed wakeup costs at most one + // poll interval of latency instead of a permanent hang. This + // mirrors the 500 ms GQCS safety timeout in win_scheduler. int n = ::WSAPoll( pollfds.data(), static_cast<ULONG>(pollfds.size()), - -1 /* infinite */); + 500 /* ms */); if (n == SOCKET_ERROR) break;