From e01744f2eeda232a849addb6545d7459c58e0e61 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 1 Jun 2026 10:08:44 +1000 Subject: [PATCH 1/4] *: remove broken O_RESOLVE_BENEATH fallback for Linux On Linux, openat(2) famously does not return an error if you pass it invalid flag bits. This was something I fixed in openat2(2) but for backward compatibility reasons, openat(2) will always silently ignore unknown O_* flags (which makes adding new ones quite painful). As a result, the fallback path added by commits e1c5f0e93a75 ("t_chmod_secure: probe kernel RESOLVE_BENEATH at runtime; drop test skip") and commit 7f60ec001a0b ("syscall: also use O_RESOLVE_BENEATH on FreeBSD and MacOS") would always return sucess and could cause spurious test failures so we can just remove it. O_RESOLVE_BENEATH will never get added to Linux for this reason, and anyone doing so out-of-tree is just creating a massive foot-gun so there is no need to entertain such broken kernels. Fixes: 7f60ec001a0b ("syscall: also use O_RESOLVE_BENEATH on FreeBSD and MacOS") Fixes: e1c5f0e93a75 ("t_chmod_secure: probe kernel RESOLVE_BENEATH at runtime; drop test skip") Signed-off-by: Aleksa Sarai --- syscall.c | 4 +--- t_chmod_secure.c | 30 ++++++++++++++---------------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/syscall.c b/syscall.c index 0748d9988..7b74388de 100644 --- a/syscall.c +++ b/syscall.c @@ -1815,9 +1815,7 @@ int secure_relative_open(const char *basedir, const char *relpath, int flags, mo if (fd != -1 || errno != ENOSYS) return fd; } -#endif - -#ifdef O_RESOLVE_BENEATH +#elif defined(O_RESOLVE_BENEATH) return secure_relative_open_resolve_beneath(basedir, relpath, flags, mode); #endif diff --git a/t_chmod_secure.c b/t_chmod_secure.c index 7c57dbbca..b8bf8f1e3 100644 --- a/t_chmod_secure.c +++ b/t_chmod_secure.c @@ -45,23 +45,21 @@ static int errs = 0; static int kernel_resolve_beneath_supported(void) { int fd; -#ifdef __linux__ - { - struct open_how how; - memset(&how, 0, sizeof how); - how.flags = O_RDONLY | O_DIRECTORY; - how.resolve = RESOLVE_BENEATH | RESOLVE_NO_MAGICLINKS; - fd = syscall(SYS_openat2, AT_FDCWD, ".", &how, sizeof how); - if (fd >= 0) { - close(fd); - return 1; - } - /* ENOSYS = kernel < 5.6. Fall through to the O_RESOLVE_BENEATH - * probe in case we're a Linux build running on a kernel that - * gained O_RESOLVE_BENEATH via some out-of-tree backport. */ +#if defined __linux__ + struct open_how how; + memset(&how, 0, sizeof how); + how.flags = O_RDONLY | O_DIRECTORY; + how.resolve = RESOLVE_BENEATH | RESOLVE_NO_MAGICLINKS; + fd = syscall(SYS_openat2, AT_FDCWD, ".", &how, sizeof how); + if (fd >= 0) { + close(fd); + return 1; } -#endif -#ifdef O_RESOLVE_BENEATH + /* O_RESOLVE_BENEATH is not defined on Linux, and even if it were, Linux's + * openat(2) does not return -EINVAL for unknown flag bits and so if + * O_RESOLVE_BENEATH happened to get defined somehow, the following + * fallback would always return success. */ +#elif defined O_RESOLVE_BENEATH fd = openat(AT_FDCWD, ".", O_RDONLY | O_DIRECTORY | O_RESOLVE_BENEATH); if (fd >= 0) { close(fd); From 95f7c8151c0accba60ca4ceb9782afe304581281 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 1 Jun 2026 10:27:30 +1000 Subject: [PATCH 2/4] syscalls: retry openat2 on -EAGAIN In order to avoid in-kernel DoS due to unbounded retries, openat2(2) will return -EAGAIN when trying to walk through ".." if there is any racing rename or mount on the entire system. Note that this applies regardless of whether the rename was on the same filesytem or mount was in the same mount namespace as the process doing openat(2) -- as a result, calling openat2(2) on even a modestly busy system will result in spurious -EAGAIN errors every once in a while and it is necessary to implement a retry loop for it. (Libraries such as libpathrs and heavy users of openat2(2) like runc all do this, and in our testing we found that ~256 iterations is enough to provide resilience even on incredibly rename-heavy machines.) Fixes: 4fa7156ccdb2 ("syscall: use openat2(RESOLVE_BENEATH) on Linux for secure_relative_open") Signed-off-by: Aleksa Sarai --- syscall.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/syscall.c b/syscall.c index 7b74388de..539952eae 100644 --- a/syscall.c +++ b/syscall.c @@ -1806,14 +1806,19 @@ int secure_relative_open(const char *basedir, const char *relpath, int flags, mo } #if defined(__linux__) && defined(HAVE_OPENAT2) - { + /* openat2(2) can fail with -EAGAIN if the path contains a ".." component + * and there was a rename or mount on the system, so on busy machines it is + * necessary to retry this a few times. Based on experiments in libpathrs, + * ~256 iterations is enough to ensure that ~50k openat2(2) runs on a very + * rename-heavy system never fail. */ + for (int tries = 0; tries < 256; tries++) { int fd = secure_relative_open_linux(basedir, relpath, flags, mode); - /* ENOSYS = kernel < 5.6 doesn't have the syscall even though - * glibc/kernel-headers do; fall through to the portable path. - * (Built unconditionally unless --disable-openat2, which forces - * the portable resolver below so that tier is exercised.) */ - if (fd != -1 || errno != ENOSYS) + if (fd != -1) return fd; + if (errno == ENOSYS) + break; // fallback to portable path + if (errno != EAGAIN) + return -1; } #elif defined(O_RESOLVE_BENEATH) return secure_relative_open_resolve_beneath(basedir, relpath, flags, mode); From 979229384141f274f71a01a47bab48228630c3e6 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 1 Jun 2026 19:50:21 +1000 Subject: [PATCH 3/4] t_secure_relpath: make test wrapper more generic In the next patch we will permit ".." in certain cases and change some of the returned error codes, so this is a preparatory patch to make the test helpers more generic. check_relpath() and check_basedir() also identical logic apart from the arguments passed to secure_relative_open() and so unifying them also makes it easier to understand what arguments are being passed to secure_relative_open(). Signed-off-by: Aleksa Sarai --- t_secure_relpath.c | 81 ++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 53 deletions(-) diff --git a/t_secure_relpath.c b/t_secure_relpath.c index a0fdf0d25..99a4a0a5d 100644 --- a/t_secure_relpath.c +++ b/t_secure_relpath.c @@ -41,62 +41,36 @@ short info_levels[COUNT_INFO], debug_levels[COUNT_DEBUG]; static int errs = 0; -static void check_relpath(const char *relpath) +static void check_relative_open(const char *basedir, const char *relpath, + int want_errno) { int fd; int saved_errno; errno = 0; - fd = secure_relative_open(NULL, relpath, O_RDONLY | O_DIRECTORY, 0); + fd = secure_relative_open(basedir, relpath, O_RDONLY | O_DIRECTORY, 0); saved_errno = errno; if (fd >= 0) { - fprintf(stderr, - "FAIL [relpath=%-12s]: returned valid fd %d (escape) -- expected -1 EINVAL\n", - relpath, fd); close(fd); - errs++; - return; - } - - if (saved_errno != EINVAL) { + if (want_errno != 0) { + fprintf(stderr, + "FAIL [basedir=%-12s relpath=%-12s]: returned valid fd %d (escape) -- expected -1 errno=%d (%s)\n", + basedir, relpath, fd, want_errno, strerror(want_errno)); + errs++; + return; + } + } else if (saved_errno != want_errno) { fprintf(stderr, - "FAIL [relpath=%-12s]: rejected but errno=%d (%s), expected EINVAL\n", - relpath, saved_errno, strerror(saved_errno)); + "FAIL [basedir=%-12s relpath=%-12s]: rejected but errno=%d (%s), expected errno=%d (%s)\n", + basedir, relpath, saved_errno, strerror(saved_errno), want_errno, strerror(want_errno)); errs++; return; } - fprintf(stderr, "OK [relpath=%-12s]: rejected with EINVAL\n", relpath); -} - -static void check_basedir(const char *basedir) -{ - int fd; - int saved_errno; - - errno = 0; - fd = secure_relative_open(basedir, "ok", O_RDONLY | O_DIRECTORY, 0); - saved_errno = errno; - - if (fd >= 0) { - fprintf(stderr, - "FAIL [basedir=%-12s]: returned valid fd %d -- expected -1 EINVAL\n", - basedir, fd); - close(fd); - errs++; - return; - } - - if (saved_errno != EINVAL) { - fprintf(stderr, - "FAIL [basedir=%-12s]: rejected but errno=%d (%s), expected EINVAL\n", - basedir, saved_errno, strerror(saved_errno)); - errs++; - return; - } - - fprintf(stderr, "OK [basedir=%-12s]: rejected with EINVAL\n", basedir); + fprintf(stderr, + "OK [basedir=%-12s relpath=%-12s]: rejected with errno %d (%s)\n", + basedir, relpath, saved_errno, strerror(saved_errno)); } int main(int argc, char **argv) @@ -110,6 +84,7 @@ int main(int argc, char **argv) return 2; } + /* secure_relative_open's daemon-only confinement protections only * fire when am_daemon && !am_chrooted (the threat model is the * daemon-no-chroot deployment), but the front-door input @@ -128,22 +103,22 @@ int main(int argc, char **argv) * is suspicious and the caller should normalise before passing * it in. The "../foo" / "foo/../bar" / "/foo" / "/" cases are * regression checks for the existing checks. */ - check_relpath(".."); - check_relpath("../foo"); - check_relpath("subdir/.."); - check_relpath("subdir/../subdir"); - check_relpath("foo/../bar"); - check_relpath("/foo"); - check_relpath("/"); + check_relative_open(NULL, "..", EINVAL); + check_relative_open(NULL, "../foo", EINVAL); + check_relative_open(NULL, "subdir/..", EINVAL); + check_relative_open(NULL, "subdir/../subdir", EINVAL); + check_relative_open(NULL, "foo/../bar", EINVAL); + check_relative_open(NULL, "/foo", EINVAL); + check_relative_open(NULL, "/", EINVAL); /* Same checks against basedir (which the codex Finding 2 fix * routes through the same RESOLVE_BENEATH-equivalent). Absolute * basedirs are operator-trusted and intentionally not validated * here. */ - check_basedir(".."); - check_basedir("../subdir"); - check_basedir("subdir/.."); - check_basedir("foo/../bar"); + check_relative_open("..", "ok", EINVAL); + check_relative_open("../subdir", "ok", EINVAL); + check_relative_open("subdir/..", "ok", EINVAL); + check_relative_open("foo/../bar", "ok", EINVAL); if (errs) fprintf(stderr, "\n%d failure(s)\n", errs); From 7c7aacdf95309e63b4875bd4658c1bc1a81476d2 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 1 Jun 2026 11:01:51 +1000 Subject: [PATCH 4/4] syscall: remove unneeded path_has_dotdot_component checks This code was added in commit 30656c5e358b ("syscall: add symlink-race-safe do_*_at() wrappers and harden secure_relative_open") but the justification given is really not particularly convincing. The stated justification is that RESOLVE_BENEATH does not block some ".." escapes, which is categorically false (if it were true, that would be a Linux kernel bug and the kernel actually has an extra safety check at the end of lookup that would render this kind of breakout practically impossible). Reading the code, the actual reason appears to be to provide some consistency between the RESOLVE_BENEATH and fallback per-component O_NOFOLLOW lookups. However, the O_NOFOLLOW resolver can never have full parity with the RESOLVE_BENEATH resolvers because RESOLVE_BENEATH permit symlinks and symlinks can contain ".." components. It seems more prudent to just allow ".." for modern systems and reject it for older ones. If truly necessary, there are even ways to support ".." in a cross-platform way so this could even be seen as a "not-implemented-yet" error rather than a permanent API contract. For the really-old systems case (with no O_NOFOLLOW or AT_DIRFD) then just as we have to accept symlinks (which might contain "..") we might as well accept bare ".." components. Rejecting ".." for that case is just pure security theatre. Fixes: 30656c5e358b ("syscall: add symlink-race-safe do_*_at() wrappers and harden secure_relative_open") Signed-off-by: Aleksa Sarai --- syscall.c | 90 ++++++++++++++-------------------------------- t_secure_relpath.c | 83 ++++++++++++++++++++++++++++++++---------- 2 files changed, 91 insertions(+), 82 deletions(-) diff --git a/syscall.c b/syscall.c index 539952eae..4128d8193 100644 --- a/syscall.c +++ b/syscall.c @@ -1677,34 +1677,8 @@ int do_open_nofollow(const char *pathname, int flags) flag name, picked up by the same #ifdef; flag value differs from FreeBSD) Other systems fall back to the per-component O_NOFOLLOW walk below. - - The relpath must also not contain any ../ elements in the path. */ -/* Returns 1 if path has any "/"-separated component that is exactly - * "..", 0 otherwise. Used by secure_relative_open's front-door - * validation to reject inputs that the per-component walk fallback - * would otherwise resolve through ".." -- e.g. bare "..", "foo/..", - * "subdir/.." -- which RESOLVE_BENEATH-equivalent kernels reject in - * the kernel but the per-component fallback (NetBSD/OpenBSD/Solaris/ - * Cygwin/pre-5.6 Linux) does not. */ -static int path_has_dotdot_component(const char *path) -{ - const char *p = path; - - while (*p) { - const char *q; - if (*p == '/') { p++; continue; } - q = p; - while (*q && *q != '/') - q++; - if (q - p == 2 && p[0] == '.' && p[1] == '.') - return 1; - p = q; - } - return 0; -} - #if defined(__linux__) && defined(HAVE_OPENAT2) static int secure_relative_open_linux(const char *basedir, const char *relpath, int flags, mode_t mode) { @@ -1787,23 +1761,6 @@ int secure_relative_open(const char *basedir, const char *relpath, int flags, mo errno = EINVAL; return -1; } - /* Reject any path with a literal ".." component (bare "..", - * "../foo", "foo/..", "foo/../bar", "subdir/.."). The previous - * substring-based check caught only "../" prefix and "/../" - * substring; bare ".." and trailing "/.." escape on the per- - * component walk fallback used by NetBSD/OpenBSD/Solaris/Cygwin - * and pre-5.6 Linux. RESOLVE_BENEATH on Linux/FreeBSD/macOS - * catches some of these in-kernel with EXDEV, but the front - * door must reject them consistently with EINVAL across all - * platforms so callers can rely on the validation. */ - if (path_has_dotdot_component(relpath)) { - errno = EINVAL; - return -1; - } - if (basedir && basedir[0] != '/' && path_has_dotdot_component(basedir)) { - errno = EINVAL; - return -1; - } #if defined(__linux__) && defined(HAVE_OPENAT2) /* openat2(2) can fail with -EAGAIN if the path contains a ".." component @@ -1833,23 +1790,23 @@ int secure_relative_open(const char *basedir, const char *relpath, int flags, mo pathjoin(fullpath, sizeof fullpath, basedir, relpath); return open(fullpath, flags, mode); #else - int dirfd = AT_FDCWD; + int dirfd = AT_FDCWD, retfd = -1; + char *path_copy = NULL; if (basedir != NULL) { if (basedir[0] == '/') { /* Absolute basedir: operator-trusted, plain openat. */ dirfd = openat(AT_FDCWD, basedir, O_RDONLY | O_DIRECTORY); - if (dirfd == -1) { + if (dirfd == -1) return -1; - } } else { - /* Relative basedir: walk it component-by-component - * with O_NOFOLLOW. This is the per-component - * RESOLVE_BENEATH equivalent for platforms without - * kernel-supported confinement, and matches the - * relpath walk below. Symlinks in basedir are - * rejected outright on this fallback path; the - * Linux openat2 / O_RESOLVE_BENEATH paths above - * still allow within-tree symlinks. */ + /* Relative basedir: walk it component-by-component with + * O_NOFOLLOW. This is the per-component RESOLVE_BENEATH equivalent + * for platforms without kernel-supported confinement, and matches + * the relpath walk below. Symlinks and ".." components in basedir + * are rejected outright on this fallback path; the Linux openat2 / + * O_RESOLVE_BENEATH paths above still allow them as long as they + * stay within the tree. + * */ char *bcopy = my_strdup(basedir, __FILE__, __LINE__); if (!bcopy) return -1; @@ -1857,13 +1814,17 @@ int secure_relative_open(const char *basedir, const char *relpath, int flags, mo part != NULL; part = strtok(NULL, "/")) { + if (!strcmp(part, "..")) { + free(bcopy); + errno = EXDEV; // emulate RESOLVE_BENEATH + goto cleanup; + } int next_fd = openat(dirfd, part, O_RDONLY | O_DIRECTORY | O_NOFOLLOW); if (next_fd == -1) { - int save_errno = errno; - if (dirfd != AT_FDCWD) close(dirfd); + int save_errno = errno; // free only saves errno on glibc >= 2.33 free(bcopy); errno = save_errno; - return -1; + goto cleanup; } if (dirfd != AT_FDCWD) close(dirfd); dirfd = next_fd; @@ -1871,18 +1832,19 @@ int secure_relative_open(const char *basedir, const char *relpath, int flags, mo free(bcopy); } } - int retfd = -1; - char *path_copy = my_strdup(relpath, __FILE__, __LINE__); - if (!path_copy) { - if (dirfd != AT_FDCWD) close(dirfd); - return -1; - } - + path_copy = my_strdup(relpath, __FILE__, __LINE__); + if (!path_copy) + goto cleanup; + for (const char *part = strtok(path_copy, "/"); part != NULL; part = strtok(NULL, "/")) { + if (!strcmp(part, "..")) { + errno = EXDEV; // emulate RESOLVE_BENEATH + goto cleanup; + } int next_fd = openat(dirfd, part, O_RDONLY | O_DIRECTORY | O_NOFOLLOW); if (next_fd == -1 && errno == ENOTDIR) { if (strtok(NULL, "/") != NULL) { diff --git a/t_secure_relpath.c b/t_secure_relpath.c index 99a4a0a5d..4e54b41b6 100644 --- a/t_secure_relpath.c +++ b/t_secure_relpath.c @@ -28,6 +28,11 @@ #include +#ifdef __linux__ +#include +#include +#endif + int dry_run = 0; int am_root = 0; int am_sender = 0; @@ -41,6 +46,40 @@ short info_levels[COUNT_INFO], debug_levels[COUNT_DEBUG]; static int errs = 0; +/* Probe the running kernel for the RESOLVE_BENEATH-equivalent confinement + * that secure_relative_open() prefers over the per-component O_NOFOLLOW + * walk. Returns 1 if either openat2(RESOLVE_BENEATH) on Linux 5.6+ or + * openat(O_RESOLVE_BENEATH) on FreeBSD 13+ / macOS 15+ is honoured by + * the running kernel, 0 otherwise. The probe opens "." (a directory + * the helper has just chdir'd into) so it can't fail for any reason + * other than the kernel rejecting the requested confinement flag. */ +static int kernel_resolve_beneath_supported(void) +{ + int fd; +#if defined __linux__ + struct open_how how; + memset(&how, 0, sizeof how); + how.flags = O_RDONLY | O_DIRECTORY; + how.resolve = RESOLVE_BENEATH | RESOLVE_NO_MAGICLINKS; + fd = syscall(SYS_openat2, AT_FDCWD, ".", &how, sizeof how); + if (fd >= 0) { + close(fd); + return 1; + } + /* O_RESOLVE_BENEATH is not defined on Linux, and even if it were, Linux's + * openat(2) does not return -EINVAL for unknown flag bits and so if + * O_RESOLVE_BENEATH happened to get defined somehow, the following + * fallback would always return success. */ +#elif defined O_RESOLVE_BENEATH + fd = openat(AT_FDCWD, ".", O_RDONLY | O_DIRECTORY | O_RESOLVE_BENEATH); + if (fd >= 0) { + close(fd); + return 1; + } +#endif + return 0; +} + static void check_relative_open(const char *basedir, const char *relpath, int want_errno) { @@ -84,6 +123,10 @@ int main(int argc, char **argv) return 2; } + /* On systems with no RESOLVE_BENEATH, ".." at any point is an error. */ + int contained_dotdot_errno = 0; + if (!kernel_resolve_beneath_supported()) + contained_dotdot_errno = EXDEV; /* secure_relative_open's daemon-only confinement protections only * fire when am_daemon && !am_chrooted (the threat model is the @@ -93,32 +136,36 @@ int main(int argc, char **argv) am_daemon = 1; am_chrooted = 0; + mkdir("ok", 0755); mkdir("subdir", 0755); - - /* Each of these relpaths must be rejected with EINVAL at the - * secure_relative_open() front door. ".." is the actual one-level - * escape; the others ("subdir/..", "subdir/../subdir") resolve - * back to the start dir on systems that allow them, but we still - * reject them as defence-in-depth: a path containing a ".." token - * is suspicious and the caller should normalise before passing - * it in. The "../foo" / "foo/../bar" / "/foo" / "/" cases are - * regression checks for the existing checks. */ - check_relative_open(NULL, "..", EINVAL); - check_relative_open(NULL, "../foo", EINVAL); - check_relative_open(NULL, "subdir/..", EINVAL); - check_relative_open(NULL, "subdir/../subdir", EINVAL); - check_relative_open(NULL, "foo/../bar", EINVAL); + mkdir("subdir/ok", 0755); + mkdir("foo", 0755); + mkdir("foo/ok", 0755); + mkdir("bar", 0755); + mkdir("bar/ok", 0755); + + /* ".." that jumps outside of the root is rejected with EXDEV. */ + check_relative_open(NULL, "..", EXDEV); + check_relative_open(NULL, "../foo", EXDEV); + /* Absolute paths for relpath are rejected with EINVAL. */ check_relative_open(NULL, "/foo", EINVAL); check_relative_open(NULL, "/", EINVAL); + /* If RESOLVE_BENEATH is supported, ".." components that are contained + * within the root are permitted. On older systems, they are rejected with + * EXDEV. */ + check_relative_open(NULL, "subdir/..", contained_dotdot_errno); + check_relative_open(NULL, "subdir/../subdir", contained_dotdot_errno); + check_relative_open(NULL, "foo/../bar", contained_dotdot_errno); /* Same checks against basedir (which the codex Finding 2 fix * routes through the same RESOLVE_BENEATH-equivalent). Absolute * basedirs are operator-trusted and intentionally not validated * here. */ - check_relative_open("..", "ok", EINVAL); - check_relative_open("../subdir", "ok", EINVAL); - check_relative_open("subdir/..", "ok", EINVAL); - check_relative_open("foo/../bar", "ok", EINVAL); + check_relative_open("..", "ok", EXDEV); + check_relative_open("../subdir", "ok", EXDEV); + check_relative_open("subdir/..", "ok", contained_dotdot_errno); + check_relative_open("subdir/../subdir", "ok", contained_dotdot_errno); + check_relative_open("foo/../bar", "ok", contained_dotdot_errno); if (errs) fprintf(stderr, "\n%d failure(s)\n", errs);