From 035e92a3fc949e48931a3e2cb05baf7b98aa4915 Mon Sep 17 00:00:00 2001 From: Trung Date: Thu, 25 Jun 2026 22:41:02 +0700 Subject: [PATCH] Emulated Netlink sockets cause infinite busy loops in event loops When guest applications (such as avahi-daemon) run event loops that wait for netlink messages using ppoll() or select(), the emulated descriptor is immediately reported as readable (POLLIN) because the write end of its backing pipe is closed. The application's subsequent recvmsg() or read() call returns 0 bytes. Because no sender credentials can be resolved from a 0-byte read, the application ignores the read and immediately calls ppoll() again. This causes the guest application to spin in an infinite busy loop, consuming 100% CPU. To address this, implement a non-blocking self-pipe signaling mechanism inside src/syscall/netlink.c: 1. Store pipe_rd in netlink_state_t and use it directly, so that netlink_clear_readable does not need lockless reads of global state. 2. Honor the MSG_DONTWAIT and O_NONBLOCK flags, and implement proper blocking semantics by polling the pipe outside nl_lock. 3. Signal readability only on the empty-to-nonempty transition, so that wake-up bytes do not accumulate in the pipe. 4. Handle zero-length reads/recvs before the empty-buffer check. 
Close #105 --- src/syscall/net.c | 2 +- src/syscall/net.h | 1 + src/syscall/netlink.c | 182 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 172 insertions(+), 13 deletions(-) diff --git a/src/syscall/net.c b/src/syscall/net.c index 2aabd92..30de7a5 100644 --- a/src/syscall/net.c +++ b/src/syscall/net.c @@ -760,7 +760,7 @@ int64_t sys_recvfrom(guest_t *g, uint64_t addrlen_gva) { if (fd_get_type(fd) == FD_NETLINK) - return netlink_recv(fd, g, buf_gva, len, src_gva, addrlen_gva); + return netlink_recv(fd, g, buf_gva, len, flags, src_gva, addrlen_gva); host_fd_ref_t host_ref; if (host_fd_ref_open(fd, &host_ref) < 0) diff --git a/src/syscall/net.h b/src/syscall/net.h index 7a19be8..d16290d 100644 --- a/src/syscall/net.h +++ b/src/syscall/net.h @@ -203,6 +203,7 @@ int64_t netlink_recv(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t len, + int flags, uint64_t src_gva, uint64_t addrlen_gva); diff --git a/src/syscall/netlink.c b/src/syscall/netlink.c index c928b36..fb6bacc 100644 --- a/src/syscall/netlink.c +++ b/src/syscall/netlink.c @@ -33,6 +33,12 @@ #include "syscall/abi.h" #include "syscall/internal.h" #include "syscall/net.h" +#include "utils.h" +#include <poll.h> + +#ifndef LINUX_MSG_DONTWAIT +#define LINUX_MSG_DONTWAIT 0x40 +#endif static void netlink_close(int guest_fd); @@ -130,6 +136,8 @@ typedef struct { size_t buf_pos; /* Current read position */ uint32_t seq; /* Sequence number from last request */ uint32_t pid; /* Bound PID (from bind or auto-assigned) */ + int pipe_wr; /* Host pipe write descriptor */ + int pipe_rd; /* Host pipe read descriptor */ } netlink_state_t; static netlink_state_t nl_state[MAX_NETLINK_FDS]; @@ -156,6 +164,8 @@ static netlink_state_t *nl_alloc(int guest_fd) memset(s, 0, sizeof(*s)); s->in_use = true; s->guest_fd = guest_fd; + s->pipe_wr = -1; + s->pipe_rd = -1; s->buf = malloc(NETLINK_BUF_SIZE); if (!s->buf) { s->in_use = false; @@ -167,6 +177,26 @@ static netlink_state_t *nl_alloc(int guest_fd) return NULL; } +static void 
netlink_signal_readable(netlink_state_t *ns) +{ + if (ns->pipe_wr != -1) { + uint8_t dummy = 1; + (void) write(ns->pipe_wr, &dummy, 1); + } +} + +static void netlink_clear_readable(netlink_state_t *ns) +{ + int host_fd = ns->pipe_rd; + if (host_fd < 0) + return; + + uint8_t dummy[128]; + while (read(host_fd, dummy, sizeof(dummy)) > 0) { + /* Drain non-blocking pipe */ + } +} + /* Append a netlink attribute to the buffer. Returns bytes written. */ static size_t nl_put_attr(uint8_t *buf, size_t max, @@ -423,6 +453,12 @@ int64_t netlink_socket(int protocol, int type) if (pipe(pipefd) < 0) return -LINUX_EMFILE; + if (fd_set_nonblock(pipefd[0]) < 0 || fd_set_nonblock(pipefd[1]) < 0) { + close(pipefd[0]); + close(pipefd[1]); + return -LINUX_EMFILE; + } + int gfd = fd_alloc(FD_NETLINK, pipefd[0], netlink_close); if (gfd < 0) { close(pipefd[0]); @@ -438,10 +474,8 @@ int64_t netlink_socket(int protocol, int type) return -LINUX_ENOMEM; } - /* No poll wakeup fd is needed because recvmsg drains the buffered response - * directly. - */ - close(pipefd[1]); /* The write end is unused */ + ns->pipe_wr = pipefd[1]; + ns->pipe_rd = pipefd[0]; return gfd; } @@ -604,7 +638,12 @@ int64_t netlink_sendmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags) goto out; } + bool was_empty = ns->buf_pos >= ns->buf_len; int ret = nl_process_request(ns, req, rlen); + if (ret == 0) { + if (was_empty && ns->buf_pos < ns->buf_len) + netlink_signal_readable(ns); + } result = (ret < 0) ? ret : (int64_t) iov.iov_len; out: @@ -635,7 +674,12 @@ int64_t netlink_send(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t len) goto out; } + bool was_empty = ns->buf_pos >= ns->buf_len; int ret = nl_process_request(ns, req, rlen); + if (ret == 0) { + if (was_empty && ns->buf_pos < ns->buf_len) + netlink_signal_readable(ns); + } result = (ret < 0) ? 
ret : (int64_t) len; out: @@ -650,6 +694,7 @@ int64_t netlink_recv(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t len, + int flags, uint64_t src_gva, uint64_t addrlen_gva) { @@ -660,11 +705,44 @@ int64_t netlink_recv(int guest_fd, return -LINUX_EBADF; } - if (ns->buf_pos >= ns->buf_len) { + if (len == 0) { pthread_mutex_unlock(&nl_lock); return 0; } + /* Wait for data to become available. If the buffer is empty, block + * on the host pipe read end. Honor MSG_DONTWAIT and O_NONBLOCK flags. + */ + while (ns->buf_pos >= ns->buf_len) { + bool nonblock = (flags & LINUX_MSG_DONTWAIT) || + (fd_table[guest_fd].linux_flags & LINUX_O_NONBLOCK); + if (nonblock) { + pthread_mutex_unlock(&nl_lock); + return -LINUX_EAGAIN; + } + + int rd_fd = ns->pipe_rd; + pthread_mutex_unlock(&nl_lock); + + struct pollfd pfd = { + .fd = rd_fd, + .events = POLLIN, + }; + int ret = poll(&pfd, 1, -1); + if (ret < 0) { + if (errno == EINTR) + return -LINUX_EINTR; + return -LINUX_EIO; + } + + pthread_mutex_lock(&nl_lock); + netlink_state_t *current_ns = nl_find(guest_fd); + if (!current_ns || current_ns != ns) { + pthread_mutex_unlock(&nl_lock); + return -LINUX_EBADF; + } + } + size_t avail = ns->buf_len - ns->buf_pos; size_t to_copy = (avail < len) ? 
avail : len; @@ -689,6 +767,9 @@ int64_t netlink_recv(int guest_fd, } ns->buf_pos += msg_end; + if (ns->buf_pos >= ns->buf_len) + netlink_clear_readable(ns); + if (src_gva && addrlen_gva) { sockaddr_nl_t snl = { .nl_family = LINUX_AF_NETLINK, @@ -738,7 +819,6 @@ int64_t netlink_getsockname(int guest_fd, int64_t netlink_recvmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags) { - (void) flags; pthread_mutex_lock(&nl_lock); netlink_state_t *ns = nl_find(guest_fd); if (!ns) { @@ -746,11 +826,6 @@ int64_t netlink_recvmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags) return -LINUX_EBADF; } - if (ns->buf_pos >= ns->buf_len) { - pthread_mutex_unlock(&nl_lock); - return 0; - } - /* Parse msghdr to get iovec */ linux_msghdr_t mhdr; if (guest_read_small(g, msg_gva, &mhdr, sizeof(mhdr)) < 0) { @@ -771,6 +846,44 @@ int64_t netlink_recvmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags) return -LINUX_EFAULT; } + if (iov.iov_len == 0) { + pthread_mutex_unlock(&nl_lock); + return 0; + } + + /* Wait for data to become available. If the buffer is empty, block + * on the host pipe read end. Honor MSG_DONTWAIT and O_NONBLOCK flags. + */ + while (ns->buf_pos >= ns->buf_len) { + bool nonblock = (flags & LINUX_MSG_DONTWAIT) || + (fd_table[guest_fd].linux_flags & LINUX_O_NONBLOCK); + if (nonblock) { + pthread_mutex_unlock(&nl_lock); + return -LINUX_EAGAIN; + } + + int rd_fd = ns->pipe_rd; + pthread_mutex_unlock(&nl_lock); + + struct pollfd pfd = { + .fd = rd_fd, + .events = POLLIN, + }; + int ret = poll(&pfd, 1, -1); + if (ret < 0) { + if (errno == EINTR) + return -LINUX_EINTR; + return -LINUX_EIO; + } + + pthread_mutex_lock(&nl_lock); + netlink_state_t *current_ns = nl_find(guest_fd); + if (!current_ns || current_ns != ns) { + pthread_mutex_unlock(&nl_lock); + return -LINUX_EBADF; + } + } + size_t avail = ns->buf_len - ns->buf_pos; size_t to_copy = (avail < iov.iov_len) ? 
avail : iov.iov_len; @@ -804,6 +917,9 @@ int64_t netlink_recvmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags) ns->buf_pos += msg_end; + if (ns->buf_pos >= ns->buf_len) + netlink_clear_readable(ns); + /* Write back sockaddr_nl if caller provided msg_name */ if (mhdr.msg_name && mhdr.msg_namelen >= sizeof(sockaddr_nl_t)) { sockaddr_nl_t snl = { @@ -837,11 +953,44 @@ int64_t netlink_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) return -LINUX_EBADF; } - if (ns->buf_pos >= ns->buf_len) { + if (count == 0) { pthread_mutex_unlock(&nl_lock); return 0; } + /* Wait for data to become available. If the buffer is empty, block + * on the host pipe read end. Honor O_NONBLOCK flag. + */ + while (ns->buf_pos >= ns->buf_len) { + bool nonblock = + (fd_table[guest_fd].linux_flags & LINUX_O_NONBLOCK) != 0; + if (nonblock) { + pthread_mutex_unlock(&nl_lock); + return -LINUX_EAGAIN; + } + + int rd_fd = ns->pipe_rd; + pthread_mutex_unlock(&nl_lock); + + struct pollfd pfd = { + .fd = rd_fd, + .events = POLLIN, + }; + int ret = poll(&pfd, 1, -1); + if (ret < 0) { + if (errno == EINTR) + return -LINUX_EINTR; + return -LINUX_EIO; + } + + pthread_mutex_lock(&nl_lock); + netlink_state_t *current_ns = nl_find(guest_fd); + if (!current_ns || current_ns != ns) { + pthread_mutex_unlock(&nl_lock); + return -LINUX_EBADF; + } + } + size_t avail = ns->buf_len - ns->buf_pos; size_t to_copy = (avail < count) ? avail : count; @@ -851,6 +1000,10 @@ int64_t netlink_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count) } ns->buf_pos += to_copy; + + if (ns->buf_pos >= ns->buf_len) + netlink_clear_readable(ns); + pthread_mutex_unlock(&nl_lock); return (int64_t) to_copy; } @@ -863,6 +1016,11 @@ static void netlink_close(int guest_fd) pthread_mutex_unlock(&nl_lock); return; } + if (ns->pipe_wr != -1) { + close(ns->pipe_wr); + ns->pipe_wr = -1; + } + ns->pipe_rd = -1; free(ns->buf); ns->buf = NULL; ns->in_use = false;