Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/syscall/net.c
Original file line number Diff line number Diff line change
Expand Up @@ -760,7 +760,7 @@ int64_t sys_recvfrom(guest_t *g,
uint64_t addrlen_gva)
{
if (fd_get_type(fd) == FD_NETLINK)
return netlink_recv(fd, g, buf_gva, len, src_gva, addrlen_gva);
return netlink_recv(fd, g, buf_gva, len, flags, src_gva, addrlen_gva);

host_fd_ref_t host_ref;
if (host_fd_ref_open(fd, &host_ref) < 0)
Expand Down
1 change: 1 addition & 0 deletions src/syscall/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ int64_t netlink_recv(int guest_fd,
guest_t *g,
uint64_t buf_gva,
uint64_t len,
int flags,
uint64_t src_gva,
uint64_t addrlen_gva);

Expand Down
182 changes: 170 additions & 12 deletions src/syscall/netlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@
#include "syscall/abi.h"
#include "syscall/internal.h"
#include "syscall/net.h"
#include "utils.h"
#include <poll.h>

/* Bit tested in the receive `flags` argument to request a non-blocking
 * receive for that one call. Defined here only as a fallback when no earlier
 * header already provides it.
 */
#ifndef LINUX_MSG_DONTWAIT
#define LINUX_MSG_DONTWAIT 0x40
#endif

static void netlink_close(int guest_fd);

Expand Down Expand Up @@ -130,6 +136,8 @@ typedef struct {
size_t buf_pos; /* Current read position */
uint32_t seq; /* Sequence number from last request */
uint32_t pid; /* Bound PID (from bind or auto-assigned) */
int pipe_wr; /* Host pipe write descriptor */
int pipe_rd; /* Host pipe read descriptor */
} netlink_state_t;

static netlink_state_t nl_state[MAX_NETLINK_FDS];
Expand All @@ -156,6 +164,8 @@ static netlink_state_t *nl_alloc(int guest_fd)
memset(s, 0, sizeof(*s));
s->in_use = true;
s->guest_fd = guest_fd;
s->pipe_wr = -1;
s->pipe_rd = -1;
s->buf = malloc(NETLINK_BUF_SIZE);
if (!s->buf) {
s->in_use = false;
Expand All @@ -167,6 +177,26 @@ static netlink_state_t *nl_alloc(int guest_fd)
return NULL;
}

/* Mark the netlink socket readable by pushing one token byte into its wakeup
 * pipe.
 *
 * Does nothing when the state has no write end recorded (pipe_wr negative).
 * The write is retried if a signal interrupts it, because a dropped token
 * would leave a receiver that is waiting in poll() on the read end without
 * its wakeup.
 */
static void netlink_signal_readable(netlink_state_t *ns)
{
    if (ns->pipe_wr < 0)
        return;

    const uint8_t token = 1;
    ssize_t rc;
    do {
        rc = write(ns->pipe_wr, &token, sizeof(token));
    } while (rc < 0 && errno == EINTR);

    /* Any other failure is deliberately ignored (best effort). In
     * particular, EAGAIN on a full non-blocking pipe means tokens are
     * already pending, so the read end is readable anyway.
     */
    (void) rc;
}

/* Drain every pending token from the wakeup pipe so its read end stops
 * reporting readable.
 *
 * Does nothing when the state has no read end recorded (pipe_rd negative).
 * Reading continues until read() reports no more data: a return of 0, or a
 * failure other than EINTR (e.g. EAGAIN, assuming the descriptor is
 * non-blocking). An interrupted read is retried, since stopping early would
 * leave stale tokens behind and the pipe would keep polling readable while
 * the response buffer is empty.
 */
static void netlink_clear_readable(netlink_state_t *ns)
{
    int host_fd = ns->pipe_rd;
    if (host_fd < 0)
        return;

    uint8_t scratch[128];
    for (;;) {
        ssize_t got = read(host_fd, scratch, sizeof(scratch));
        if (got > 0)
            continue; /* More tokens may remain */
        if (got < 0 && errno == EINTR)
            continue; /* Interrupted before reading anything; retry */
        break;
    }
}

/* Append a netlink attribute to the buffer. Returns bytes written. */
static size_t nl_put_attr(uint8_t *buf,
size_t max,
Expand Down Expand Up @@ -423,6 +453,12 @@ int64_t netlink_socket(int protocol, int type)
if (pipe(pipefd) < 0)
return -LINUX_EMFILE;

if (fd_set_nonblock(pipefd[0]) < 0 || fd_set_nonblock(pipefd[1]) < 0) {

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fd_set_nonblock() failure returns -LINUX_EMFILE, which is misleading -- this is an fcntl failure, not fd exhaustion. Preserve the real errno (return -linux_errno() from the failing call) after closing both pipe fds.

close(pipefd[0]);
close(pipefd[1]);
return -LINUX_EMFILE;
}

int gfd = fd_alloc(FD_NETLINK, pipefd[0], netlink_close);
if (gfd < 0) {
close(pipefd[0]);
Expand All @@ -438,10 +474,8 @@ int64_t netlink_socket(int protocol, int type)
return -LINUX_ENOMEM;
}

/* No poll wakeup fd is needed because recvmsg drains the buffered response
* directly.
*/
close(pipefd[1]); /* The write end is unused */
ns->pipe_wr = pipefd[1];
ns->pipe_rd = pipefd[0];

return gfd;
}
Expand Down Expand Up @@ -604,7 +638,12 @@ int64_t netlink_sendmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags)
goto out;
}

bool was_empty = ns->buf_pos >= ns->buf_len;
int ret = nl_process_request(ns, req, rlen);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

netlink_signal_readable() writes a token on every successful send. Since each request overwrites ns->buf (buf_pos reset to 0), repeated sends without an intervening recv keep pushing bytes into the pipe until it fills and the writes silently drop on EAGAIN. Harmless today, but it makes the pipe level drift from ns->buf state and can mask a real missed-wakeup later. Signal only on the empty-to-nonempty transition: capture bool was_empty = ns->buf_pos >= ns->buf_len before nl_process_request() and write only when it goes non-empty. Same at line 667 (netlink_send).

if (ret == 0) {
if (was_empty && ns->buf_pos < ns->buf_len)
netlink_signal_readable(ns);
}
result = (ret < 0) ? ret : (int64_t) iov.iov_len;

out:
Expand Down Expand Up @@ -635,7 +674,12 @@ int64_t netlink_send(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t len)
goto out;
}

bool was_empty = ns->buf_pos >= ns->buf_len;
int ret = nl_process_request(ns, req, rlen);
if (ret == 0) {
if (was_empty && ns->buf_pos < ns->buf_len)
netlink_signal_readable(ns);
}
result = (ret < 0) ? ret : (int64_t) len;

out:
Expand All @@ -650,6 +694,7 @@ int64_t netlink_recv(int guest_fd,
guest_t *g,
uint64_t buf_gva,
uint64_t len,
int flags,
uint64_t src_gva,
uint64_t addrlen_gva)
{
Expand All @@ -660,11 +705,44 @@ int64_t netlink_recv(int guest_fd,
return -LINUX_EBADF;
}

if (ns->buf_pos >= ns->buf_len) {
if (len == 0) {
pthread_mutex_unlock(&nl_lock);
return 0;
}

/* Wait for data to become available. If the buffer is empty, block
* on the host pipe read end. Honor MSG_DONTWAIT and O_NONBLOCK flags.
*/
while (ns->buf_pos >= ns->buf_len) {
bool nonblock = (flags & LINUX_MSG_DONTWAIT) ||
(fd_table[guest_fd].linux_flags & LINUX_O_NONBLOCK);
Comment on lines +716 to +718

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add proper comments.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added

if (nonblock) {
pthread_mutex_unlock(&nl_lock);
return -LINUX_EAGAIN;
}

int rd_fd = ns->pipe_rd;
pthread_mutex_unlock(&nl_lock);

struct pollfd pfd = {
.fd = rd_fd,
.events = POLLIN,
};
int ret = poll(&pfd, 1, -1);
if (ret < 0) {
if (errno == EINTR)
return -LINUX_EINTR;
return -LINUX_EIO;
}

pthread_mutex_lock(&nl_lock);
netlink_state_t *current_ns = nl_find(guest_fd);
if (!current_ns || current_ns != ns) {
pthread_mutex_unlock(&nl_lock);
return -LINUX_EBADF;
}
}

size_t avail = ns->buf_len - ns->buf_pos;
size_t to_copy = (avail < len) ? avail : len;

Expand All @@ -689,6 +767,9 @@ int64_t netlink_recv(int guest_fd,
}
ns->buf_pos += msg_end;

if (ns->buf_pos >= ns->buf_len)
netlink_clear_readable(ns);

if (src_gva && addrlen_gva) {
sockaddr_nl_t snl = {
.nl_family = LINUX_AF_NETLINK,
Expand Down Expand Up @@ -738,19 +819,13 @@ int64_t netlink_getsockname(int guest_fd,

int64_t netlink_recvmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags)
{
(void) flags;
pthread_mutex_lock(&nl_lock);
netlink_state_t *ns = nl_find(guest_fd);
if (!ns) {
pthread_mutex_unlock(&nl_lock);
return -LINUX_EBADF;
}

if (ns->buf_pos >= ns->buf_len) {
pthread_mutex_unlock(&nl_lock);
return 0;
}

/* Parse msghdr to get iovec */
linux_msghdr_t mhdr;
if (guest_read_small(g, msg_gva, &mhdr, sizeof(mhdr)) < 0) {
Expand All @@ -771,6 +846,44 @@ int64_t netlink_recvmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags)
return -LINUX_EFAULT;
}

if (iov.iov_len == 0) {
pthread_mutex_unlock(&nl_lock);
return 0;
}

/* Wait for data to become available. If the buffer is empty, block
* on the host pipe read end. Honor MSG_DONTWAIT and O_NONBLOCK flags.
*/
while (ns->buf_pos >= ns->buf_len) {
bool nonblock = (flags & LINUX_MSG_DONTWAIT) ||
(fd_table[guest_fd].linux_flags & LINUX_O_NONBLOCK);
if (nonblock) {
pthread_mutex_unlock(&nl_lock);
return -LINUX_EAGAIN;
}

int rd_fd = ns->pipe_rd;
pthread_mutex_unlock(&nl_lock);

struct pollfd pfd = {
.fd = rd_fd,
.events = POLLIN,
};
int ret = poll(&pfd, 1, -1);
if (ret < 0) {
if (errno == EINTR)
return -LINUX_EINTR;
return -LINUX_EIO;
}

pthread_mutex_lock(&nl_lock);
netlink_state_t *current_ns = nl_find(guest_fd);
if (!current_ns || current_ns != ns) {
pthread_mutex_unlock(&nl_lock);
return -LINUX_EBADF;
}
}

size_t avail = ns->buf_len - ns->buf_pos;
size_t to_copy = (avail < iov.iov_len) ? avail : iov.iov_len;

Expand Down Expand Up @@ -804,6 +917,9 @@ int64_t netlink_recvmsg(int guest_fd, guest_t *g, uint64_t msg_gva, int flags)

ns->buf_pos += msg_end;

if (ns->buf_pos >= ns->buf_len)
netlink_clear_readable(ns);

/* Write back sockaddr_nl if caller provided msg_name */
if (mhdr.msg_name && mhdr.msg_namelen >= sizeof(sockaddr_nl_t)) {
sockaddr_nl_t snl = {
Expand Down Expand Up @@ -837,11 +953,44 @@ int64_t netlink_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
return -LINUX_EBADF;
}

if (ns->buf_pos >= ns->buf_len) {
if (count == 0) {
pthread_mutex_unlock(&nl_lock);
return 0;
}

/* Wait for data to become available. If the buffer is empty, block
* on the host pipe read end. Honor O_NONBLOCK flag.
*/
while (ns->buf_pos >= ns->buf_len) {
bool nonblock =
(fd_table[guest_fd].linux_flags & LINUX_O_NONBLOCK) != 0;
if (nonblock) {
pthread_mutex_unlock(&nl_lock);
return -LINUX_EAGAIN;
}

int rd_fd = ns->pipe_rd;
pthread_mutex_unlock(&nl_lock);

struct pollfd pfd = {
.fd = rd_fd,
.events = POLLIN,
};
int ret = poll(&pfd, 1, -1);
if (ret < 0) {
if (errno == EINTR)
return -LINUX_EINTR;
return -LINUX_EIO;
}

pthread_mutex_lock(&nl_lock);
netlink_state_t *current_ns = nl_find(guest_fd);
if (!current_ns || current_ns != ns) {
pthread_mutex_unlock(&nl_lock);
return -LINUX_EBADF;
}
}

size_t avail = ns->buf_len - ns->buf_pos;
size_t to_copy = (avail < count) ? avail : count;

Expand All @@ -851,6 +1000,10 @@ int64_t netlink_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
}

ns->buf_pos += to_copy;

if (ns->buf_pos >= ns->buf_len)
netlink_clear_readable(ns);

pthread_mutex_unlock(&nl_lock);
return (int64_t) to_copy;
}
Expand All @@ -863,6 +1016,11 @@ static void netlink_close(int guest_fd)
pthread_mutex_unlock(&nl_lock);
return;
}
if (ns->pipe_wr != -1) {
close(ns->pipe_wr);
ns->pipe_wr = -1;
}
ns->pipe_rd = -1;
free(ns->buf);
ns->buf = NULL;
ns->in_use = false;
Expand Down
Loading