diff --git a/mk/tests.mk b/mk/tests.mk index b4fa1c4..0a2a099 100644 --- a/mk/tests.mk +++ b/mk/tests.mk @@ -6,7 +6,8 @@ test-glibc-coreutils test-perf \ test-rosetta-cli test-rosetta-statics test-rosetta-failure-modes \ test-rosetta-alpine test-rosetta-audit test-rosetta-jit \ - test-rosetta-glibc test-rosetta-all bench-rosetta \ + test-rosetta-glibc test-rosetta-madvise test-rosetta-msync \ + test-rosetta-mremap test-rosetta-all bench-rosetta \ test-matrix test-matrix-elfuse-aarch64 test-matrix-qemu-aarch64 \ test-full test-multi-vcpu test-rwx test-sysroot-rename \ test-case-collision test-case-collision-fallback test-getdents64-overlong \ @@ -216,10 +217,20 @@ test-rosetta-jit: $(ELFUSE_BIN) test-rosetta-glibc: $(ELFUSE_BIN) $(call RUN_OPTIONAL_SKIP77,bash tests/test-rosetta-glibc.sh $(ELFUSE_BIN),test-rosetta-glibc) +test-rosetta-madvise: $(ELFUSE_BIN) + $(call RUN_OPTIONAL_SKIP77,bash tests/test-rosetta-madvise.sh $(ELFUSE_BIN),test-rosetta-madvise) + +test-rosetta-msync: $(ELFUSE_BIN) + $(call RUN_OPTIONAL_SKIP77,bash tests/test-rosetta-msync.sh $(ELFUSE_BIN),test-rosetta-msync) + +test-rosetta-mremap: $(ELFUSE_BIN) + $(call RUN_OPTIONAL_SKIP77,bash tests/test-rosetta-mremap.sh $(ELFUSE_BIN),test-rosetta-mremap) + ## Run every Rosetta-specific test target in sequence. test-rosetta-all: test-rosetta-cli test-rosetta-failure-modes \ test-rosetta-statics test-rosetta-alpine \ - test-rosetta-audit test-rosetta-jit test-rosetta-glibc + test-rosetta-audit test-rosetta-jit test-rosetta-glibc \ + test-rosetta-madvise test-rosetta-msync test-rosetta-mremap ## Wall-clock bench harness for x86_64-via-Rosetta workloads. Prints ## best-of-N samples plus the aarch64 reference where available. 
Set diff --git a/src/syscall/mem.c b/src/syscall/mem.c index 00f93b0..c26105d 100644 --- a/src/syscall/mem.c +++ b/src/syscall/mem.c @@ -495,7 +495,6 @@ static int64_t sys_mmap_high_va(guest_t *g, return -LINUX_ENOMEM; bool is_anon = (flags & LINUX_MAP_ANONYMOUS) != 0; - bool is_shared = (flags & LINUX_MAP_SHARED) != 0; host_fd_ref_t backing_ref = {.fd = -1, .owned = false}; int host_backing_fd = -1; int track_backing_fd = -1; @@ -540,8 +539,16 @@ static int64_t sys_mmap_high_va(guest_t *g, */ bool siblings_quiesced = false; - if (!is_anon && is_shared) - return -LINUX_ENODEV; + /* File-backed MAP_SHARED lands here as a snapshot-style shared region: the + * page contents are pread into fresh high-VA backing below and tracked with + * the region's backing fd, and msync writes dirty bytes back through + * sync_shared_aliases_range (which resolves the region via gpa_base). This + * mirrors how the primary window treats MAP_SHARED file mappings -- + * coherence is msync-driven, not live page-cache -- and unblocks high-VA + * shared file caches such as apt's package lists under Rosetta (issue + * #108). Shared anonymous mappings (MAP_SHARED | MAP_ANONYMOUS, no backing + * fd) already fell through here and are unchanged. + */ /* Reject wrap before reusing addr + length anywhere below. The caller * page-rounds length, but addr is guest-supplied and a huge length against @@ -977,13 +984,21 @@ static int hvf_apply_file_overlay_quiesced(guest_t *g, off_t file_off); static int hvf_remove_file_overlay(guest_t *g, uint64_t ipa, uint64_t len); +/* Copy [file_off, file_off+len) of fd into the guest page backing GPA `gpa`. + * The destination is resolved through host_ptr_for_gpa so both primary-window + * pages (gpa < guest_size -> host_base + gpa) and high-VA pages backed by a + * named mapping or overflow segment land on their real host buffer. Callers + * that stay in the primary window pass a low IPA offset, which equals its own + * GPA there, so their behaviour is unchanged. 
*/ static int read_file_range_to_guest(guest_t *g, - uint64_t guest_off, + uint64_t gpa, int fd, uint64_t file_off, uint64_t len) { - uint8_t *dst = (uint8_t *) g->host_base + guest_off; + uint8_t *dst = host_ptr_for_gpa(g, gpa); + if (!dst) + return -LINUX_EFAULT; size_t remaining = len; while (remaining > 0) { @@ -2600,14 +2615,18 @@ int64_t sys_mremap(guest_t *g, ~(LINUX_MREMAP_MAYMOVE | LINUX_MREMAP_FIXED | LINUX_MREMAP_DONTUNMAP)) return -LINUX_EINVAL; - /* Overflow check on old range. mremap's body shrinks, copies, and zeroes - * via raw host_base+off arithmetic, so the check stays primary-only here - * until the data-movement paths are made region-aware. + /* No primary-window bound on the source: the range is validated against the + * region tracker below (src_reg coverage), so high-VA mmap regions are + * accepted. The shrink/move paths resolve the source through the region's + * gpa_base, and the destination is always allocated in the primary window + * (find_free_gap / mremap_extend_range), so no high-VA destination backing + * is needed. Guard underflow (addr below ipa_base) and old_off + old_size + * wrap explicitly, which the old guest_size bound used to imply. */ - uint64_t old_off = old_addr - g->ipa_base; - if (old_off > g->guest_size) + if (old_addr < g->ipa_base) return -LINUX_EFAULT; - if (old_size > 0 && old_size > g->guest_size - old_off) + uint64_t old_off = old_addr - g->ipa_base; + if (old_size > 0 && old_off > UINT64_MAX - old_size) return -LINUX_EFAULT; /* Reject mremap whose source range touches VM infrastructure (page tables, @@ -2627,6 +2646,15 @@ int64_t sys_mremap(guest_t *g, if (!src_reg || src_reg->end - old_off < old_size) return -LINUX_EFAULT; + /* Capture the source region's GPA layout before any region mutation below + * invalidates src_reg. 
src_gpa_base + (va_off - src_start) is the backing + * GPA of a source VA-offset; host_ptr_for_gpa turns it into the real host + * pointer (identity for primary regions, overflow/mapping tier for + * high-VA). + */ + uint64_t src_gpa_base = src_reg->gpa_base; + uint64_t src_start = src_reg->start; + /* Same size: nothing to do */ if (old_size == new_size && !(flags & LINUX_MREMAP_FIXED)) return (int64_t) old_addr; @@ -2640,8 +2668,10 @@ int64_t sys_mremap(guest_t *g, int cleanup_err = cleanup_overlays_in_range(g, tail_off, tail_end); if (cleanup_err < 0) return cleanup_err; - /* Zero the trimmed region */ - memset((uint8_t *) g->host_base + tail_off, 0, tail_end - tail_off); + /* Zero the trimmed region on its real backing (high-VA tails live at + * gpa_base, not host_base + tail_off). */ + memset(host_ptr_for_gpa(g, src_gpa_base + (tail_off - src_start)), 0, + tail_end - tail_off); guest_region_remove(g, tail_off, tail_end); guest_invalidate_ptes(g, tail_off, tail_end); if (tail_off < g->mmap_rw_gap_hint) @@ -2840,8 +2870,13 @@ int64_t sys_mremap(guest_t *g, return copy_err; } } else { + /* Read the source through its GPA (identity for primary sources, + * overflow/mapping backing for high-VA). The destination is always + * a fresh primary-window range, so it never overlaps the source and + * the copy direction does not matter. 
*/ memmove((uint8_t *) g->host_base + new_off, - (uint8_t *) g->host_base + old_off, copy_len); + host_ptr_for_gpa(g, src_gpa_base + (old_off - src_start)), + copy_len); } /* Zero any extension beyond old data */ if (new_size > old_size) @@ -2850,7 +2885,8 @@ int64_t sys_mremap(guest_t *g, /* Remove old mapping */ if (old_size > 0) { - memset((uint8_t *) g->host_base + old_off, 0, old_size); + memset(host_ptr_for_gpa(g, src_gpa_base + (old_off - src_start)), 0, + old_size); guest_region_remove(g, old_off, old_off + old_size); guest_invalidate_ptes(g, old_off, old_off + old_size); if (old_off < g->mmap_rw_gap_hint) @@ -3070,8 +3106,12 @@ int64_t sys_mremap(guest_t *g, return copy_err; } } else { + /* Read the source through its GPA so high-VA sources copy from + * their real backing (identity for primary: == host_base + + * old_off). The destination is a fresh primary-window gap. */ memcpy((uint8_t *) g->host_base + new_off, - (uint8_t *) g->host_base + old_off, old_size); + host_ptr_for_gpa(g, src_gpa_base + (old_off - src_start)), + old_size); } memset((uint8_t *) g->host_base + new_off + old_size, 0, new_size - old_size); @@ -3079,7 +3119,8 @@ int64_t sys_mremap(guest_t *g, /* Remove old mapping. Any live source overlay was already torn down * before the destination range was touched. */ - memset((uint8_t *) g->host_base + old_off, 0, old_size); + memset(host_ptr_for_gpa(g, src_gpa_base + (old_off - src_start)), 0, + old_size); guest_region_remove(g, old_off, old_off + old_size); guest_invalidate_ptes(g, old_off, old_off + old_size); if (old_off < g->mmap_rw_gap_hint) @@ -3155,7 +3196,13 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice) * guest_is_valid_range once the body is updated. */ uint64_t off = addr - g->ipa_base; - if (off > g->guest_size || length > g->guest_size - off) + /* Accept ranges in the primary IPA window, and also high-VA mmap regions + * (gpa_base != start) that the tracker records as mapped. 
Rosetta's own + * slab/JIT and guest JITs (e.g. V8) decommit pages in the high-VA window + * via mprotect(PROT_NONE)+madvise(MADV_DONTNEED); rejecting those with + * ENOMEM trips the guest's CHECK_EQ(0, ret) on the madvise return. */ + bool in_primary = (off <= g->guest_size && length <= g->guest_size - off); + if (!in_primary && !madvise_range_mapped(g, off, length)) return -LINUX_ENOMEM; /* Defensive guard against destructive advice on infrastructure ranges (page @@ -3207,13 +3254,22 @@ int64_t sys_madvise(guest_t *g, uint64_t addr, uint64_t length, int advice) uint64_t zstart = (r->start > off) ? r->start : off; uint64_t zend = (r->end < end) ? r->end : end; - memset((uint8_t *) g->host_base + zstart, 0, zend - zstart); + /* High-VA regions back their pages at gpa_base, not at the VA; + * resolve the host pointer through the GPA so the reset hits the + * real backing (host_ptr_for_gpa also follows live overlays). For + * identity regions gpa_base == start, so this is unchanged. */ + uint64_t rgpa = r->gpa_base + (zstart - r->start); + memset(host_ptr_for_gpa(g, rgpa), 0, zend - zstart); if (!(r->flags & LINUX_MAP_ANONYMOUS)) { - /* EOF leaves the tail zero per mmap rules; the helper already - * returns 0 in that case after stopping the read loop. + /* Restore file-backed pages from the current backing image. + * read_file_range_to_guest resolves the destination through + * rgpa, so high-VA file mappings (gpa_base != start) land on + * their real backing rather than being left zero-filled. EOF + * leaves the tail zero per mmap rules; the helper returns 0 in + * that case after stopping the read loop. 
*/ int err = read_file_range_to_guest( - g, zstart, r->backing_fd, r->offset + (zstart - r->start), + g, rgpa, r->backing_fd, r->offset + (zstart - r->start), zend - zstart); if (err < 0) return err; @@ -3599,8 +3655,14 @@ static int64_t sync_shared_aliases_range(guest_t *g, if (wfile_start >= wfile_end) continue; - const uint8_t *guest = (const uint8_t *) g->host_base + src->start + - (wfile_start - src->offset); + /* Resolve the guest bytes through the region's GPA so high-VA + * shared mappings (gpa_base != start) read from their real backing + * rather than host_base + start. For identity regions gpa_base == + * start, so this is unchanged. */ + const uint8_t *guest = host_ptr_for_gpa( + g, src->gpa_base + (wfile_start - src->offset)); + if (!guest) + return -LINUX_EFAULT; size_t offset = (size_t) (wfile_start - chunk_start); size_t len = (size_t) (wfile_end - wfile_start); @@ -3640,9 +3702,13 @@ static int64_t refresh_shared_region_range(guest_t *g, if (rfile_start >= rfile_end) return 0; - uint64_t guest_off = r->start + (rfile_start - r->offset); uint64_t len = rfile_end - rfile_start, file_off = rfile_start; - uint8_t *buf = (uint8_t *) g->host_base + guest_off; + /* Resolve through the region's GPA so high-VA shared mappings refresh their + * real backing rather than host_base + start. Identity regions have + * gpa_base == start, so this is unchanged. */ + uint8_t *buf = host_ptr_for_gpa(g, r->gpa_base + (rfile_start - r->offset)); + if (!buf) + return -LINUX_EFAULT; while (len > 0) { size_t chunk = @@ -3684,13 +3750,15 @@ int64_t sys_msync(guest_t *g, uint64_t addr, uint64_t length, int flags) return -LINUX_ENOMEM; uint64_t off = addr - g->ipa_base; - /* sys_msync stays primary-only here for the same reason as madvise: msync - * iterates regions and reaches into host_base+off to read pages back from - * the file overlay. Widening to extra-region ranges needs a region-aware - * iterator landing alongside the data-movement refactor. 
+ /* Admit any range the region tracker fully covers, primary-window or + * high-VA. Both data-movement helpers (sync_shared_aliases_range and + * refresh_shared_region_range) now resolve their host pointers through the + * region's gpa_base via host_ptr_for_gpa, so extra-region ranges -- e.g. + * Rosetta's high-VA MAP_SHARED file caches that apt msyncs -- act on their + * real backing instead of being rejected with -ENOMEM. The coverage loop + * below still rejects unmapped holes. off + length cannot overflow here: + * addr > UINT64_MAX - length was rejected above and off <= addr. */ - if (off > g->guest_size || length > g->guest_size - off) - return -LINUX_ENOMEM; uint64_t end = off + length; uint64_t cursor = off; diff --git a/tests/fixtures/rosetta/README.md b/tests/fixtures/rosetta/README.md index 8a73e1a..8269c86 100644 --- a/tests/fixtures/rosetta/README.md +++ b/tests/fixtures/rosetta/README.md @@ -4,6 +4,15 @@ Rosetta x86_64 test fixtures vendored for self-contained matrix coverage. 
- static x86_64 Linux ELF built from `tests/x86_64-rosetta-audit.c` - `x86_64-rosetta-tls0` - static x86_64 Linux ELF built from `tests/x86_64-rosetta-tls0.c` +- `x86_64-rosetta-madvise` + - static x86_64 Linux ELF built from `tests/x86_64-rosetta-madvise.c` + - used by `tests/test-rosetta-madvise.sh` +- `x86_64-rosetta-msync` + - static x86_64 Linux ELF built from `tests/x86_64-rosetta-msync.c` + - used by `tests/test-rosetta-msync.sh` +- `x86_64-rosetta-mremap` + - static x86_64 Linux ELF built from `tests/x86_64-rosetta-mremap.c` + - used by `tests/test-rosetta-mremap.sh` - `x86_64-glibc-rootfs.tar.gz` - minimal x86_64 glibc rootfs used by `tests/test-rosetta-glibc.sh` - contains `hello-dynamic`, `dlopen-probe`, `tls-probe`, @@ -34,8 +43,11 @@ gcc -O2 -o tls-probe tests/x86_64-glibc-tls.c gcc -O2 -fPIC -shared -o libgdtls.so tests/x86_64-glibc-gdtls-lib.c gcc -O2 -ldl -o gdtls-probe tests/x86_64-glibc-gdtls.c gcc -O2 -pthread -o pthread-tls-probe tests/x86_64-glibc-pthread-tls.c -gcc -O2 -static -o x86_64-rosetta-audit tests/x86_64-rosetta-audit.c -gcc -O2 -static -o x86_64-rosetta-tls0 tests/x86_64-rosetta-tls0.c +gcc -O2 -static -o x86_64-rosetta-audit tests/x86_64-rosetta-audit.c +gcc -O2 -static -o x86_64-rosetta-tls0 tests/x86_64-rosetta-tls0.c +gcc -O2 -static -o x86_64-rosetta-madvise tests/x86_64-rosetta-madvise.c +gcc -O2 -static -o x86_64-rosetta-msync tests/x86_64-rosetta-msync.c +gcc -O2 -static -o x86_64-rosetta-mremap tests/x86_64-rosetta-mremap.c # Stage the matching ld.so / libc.so.6 / libm.so.6 from the same host # into a rootfs/ tree alongside libgdtls.so under lib/x86_64-linux-gnu/, # then tar -czf x86_64-glibc-rootfs.tar.gz rootfs/. 
diff --git a/tests/fixtures/rosetta/x86_64-rosetta-madvise b/tests/fixtures/rosetta/x86_64-rosetta-madvise new file mode 100755 index 0000000..5dbb61c Binary files /dev/null and b/tests/fixtures/rosetta/x86_64-rosetta-madvise differ diff --git a/tests/fixtures/rosetta/x86_64-rosetta-mremap b/tests/fixtures/rosetta/x86_64-rosetta-mremap new file mode 100755 index 0000000..6a50c4d Binary files /dev/null and b/tests/fixtures/rosetta/x86_64-rosetta-mremap differ diff --git a/tests/fixtures/rosetta/x86_64-rosetta-msync b/tests/fixtures/rosetta/x86_64-rosetta-msync new file mode 100755 index 0000000..ef293e8 Binary files /dev/null and b/tests/fixtures/rosetta/x86_64-rosetta-msync differ diff --git a/tests/test-matrix.sh b/tests/test-matrix.sh index 2611d04..0c46d7f 100755 --- a/tests/test-matrix.sh +++ b/tests/test-matrix.sh @@ -613,10 +613,22 @@ run_rosetta_x86_64_suites() printf "\nRosetta glibc dynamic\n" run_summary_suite "rosetta-glibc" \ bash "${REPO_ROOT}/tests/test-rosetta-glibc.sh" "$ELFUSE" || rc=1 + + printf "\nRosetta high-VA madvise\n" + run_summary_suite "rosetta-madvise" \ + bash "${REPO_ROOT}/tests/test-rosetta-madvise.sh" "$ELFUSE" || rc=1 + + printf "\nRosetta high-VA msync\n" + run_summary_suite "rosetta-msync" \ + bash "${REPO_ROOT}/tests/test-rosetta-msync.sh" "$ELFUSE" || rc=1 + + printf "\nRosetta high-VA mremap\n" + run_summary_suite "rosetta-mremap" \ + bash "${REPO_ROOT}/tests/test-rosetta-mremap.sh" "$ELFUSE" || rc=1 else local suite for suite in rosetta-statics rosetta-alpine rosetta-audit rosetta-jit \ - rosetta-glibc; do + rosetta-glibc rosetta-madvise rosetta-msync rosetta-mremap; do skip_suite "$suite" "Rosetta translator not installed" done fi diff --git a/tests/test-rosetta-madvise.sh b/tests/test-rosetta-madvise.sh new file mode 100644 index 0000000..281a6f1 --- /dev/null +++ b/tests/test-rosetta-madvise.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +# test-rosetta-madvise.sh - madvise(MADV_DONTNEED) on high-VA regions via Rosetta +# +# 
Copyright 2026 elfuse contributors +# SPDX-License-Identifier: Apache-2.0 +# +# Regression for elfuse sys_madvise rejecting high-VA mmap regions with ENOMEM. +# Under Rosetta, anonymous mmap(NULL) lands in the high-VA window where +# sys_madvise was primary-window-only and returned ENOMEM for every +# MADV_DONTNEED. V8's page allocator decommits guard/code pages with +# mprotect(PROT_NONE)+madvise(MADV_DONTNEED) and CHECK_EQ(0, ret)s the result, +# so the spurious ENOMEM aborted x86_64 Node.js the moment its JIT initialized. +# +# Fixture: tests/fixtures/rosetta/x86_64-rosetta-madvise (vendored x86_64 ELF). +# +# Usage: tests/test-rosetta-madvise.sh [path/to/elfuse] + +set -euo pipefail + +ELFUSE_INPUT="${1:-build/elfuse}" +case "$ELFUSE_INPUT" in + /*) ELFUSE="$ELFUSE_INPUT" ;; + *) ELFUSE="$(pwd)/$ELFUSE_INPUT" ;; +esac + +ROSETTA_PATH="${MATRIX_ROSETTA_TRANSLATOR:-/Library/Apple/usr/libexec/oah/RosettaLinux/rosetta}" +MADV_BIN="$(pwd)/tests/fixtures/rosetta/x86_64-rosetta-madvise" + +# shellcheck source=tests/lib/rosetta-test.sh +. "$(dirname "$0")/lib/rosetta-test.sh" + +pass=0 +fail=0 +skip=0 +total=0 + +if [ ! -x "$ROSETTA_PATH" ]; then + printf 'rosetta translator not found at %s\n' "$ROSETTA_PATH" >&2 + exit 77 +fi +if [ ! -x "$ELFUSE" ]; then + printf 'elfuse binary not found: %s\n' "$ELFUSE" >&2 + exit 1 +fi + +require_timeout + +if [ ! -x "$MADV_BIN" ]; then + printf 'vendored Rosetta madvise fixture missing under tests/fixtures/rosetta/\n' >&2 + exit 77 +fi + +total=$((total + 1)) +set +e +madv_out="$("$TIMEOUT" 30 "$ELFUSE" "$MADV_BIN" 2>&1)" +madv_rc=$? 
+set -e +if [ "$madv_rc" -eq 0 ] && + printf '%s\n' "$madv_out" | grep -q 'madvise high-VA: all subtests passed'; then + report_pass "madvise-high-va-dontneed" +else + report_fail "madvise-high-va-dontneed: rc=$madv_rc" + printf '%s\n' "$madv_out" >&2 +fi + +report_summary "$total" +if [ "$fail" -gt 0 ]; then + exit 1 +fi diff --git a/tests/test-rosetta-mremap.sh b/tests/test-rosetta-mremap.sh new file mode 100644 index 0000000..2c9ea8b --- /dev/null +++ b/tests/test-rosetta-mremap.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +# test-rosetta-mremap.sh - mremap() on high-VA source regions via Rosetta +# +# Copyright 2026 elfuse contributors +# SPDX-License-Identifier: Apache-2.0 +# +# Regression for elfuse sys_mremap rejecting high-VA mmap regions with EFAULT +# and reading/zeroing the source through raw host_base + off. Under Rosetta, +# mmap(NULL) lands in the high-VA window; the fix admits high-VA sources and +# resolves the source through the region's gpa_base, so mremap(MAYMOVE) of a +# high-VA region relocates it into the primary window with its bytes intact and +# an in-place shrink preserves the retained head. +# +# Fixture: tests/fixtures/rosetta/x86_64-rosetta-mremap (vendored x86_64 ELF). +# +# Usage: tests/test-rosetta-mremap.sh [path/to/elfuse] + +set -euo pipefail + +ELFUSE_INPUT="${1:-build/elfuse}" +case "$ELFUSE_INPUT" in + /*) ELFUSE="$ELFUSE_INPUT" ;; + *) ELFUSE="$(pwd)/$ELFUSE_INPUT" ;; +esac + +ROSETTA_PATH="${MATRIX_ROSETTA_TRANSLATOR:-/Library/Apple/usr/libexec/oah/RosettaLinux/rosetta}" +MREMAP_BIN="$(pwd)/tests/fixtures/rosetta/x86_64-rosetta-mremap" + +# shellcheck source=tests/lib/rosetta-test.sh +. "$(dirname "$0")/lib/rosetta-test.sh" + +pass=0 +fail=0 +skip=0 +total=0 + +if [ ! -x "$ROSETTA_PATH" ]; then + printf 'rosetta translator not found at %s\n' "$ROSETTA_PATH" >&2 + exit 77 +fi +if [ ! -x "$ELFUSE" ]; then + printf 'elfuse binary not found: %s\n' "$ELFUSE" >&2 + exit 1 +fi + +require_timeout + +if [ ! 
-x "$MREMAP_BIN" ]; then + printf 'vendored Rosetta mremap fixture missing under tests/fixtures/rosetta/\n' >&2 + exit 77 +fi + +total=$((total + 1)) +set +e +mremap_out="$("$TIMEOUT" 30 "$ELFUSE" "$MREMAP_BIN" 2>&1)" +mremap_rc=$? +set -e +if [ "$mremap_rc" -eq 0 ] && + printf '%s\n' "$mremap_out" | grep -q 'mremap high-VA: all subtests passed'; then + report_pass "mremap-high-va" +else + report_fail "mremap-high-va: rc=$mremap_rc" + printf '%s\n' "$mremap_out" >&2 +fi + +report_summary "$total" +if [ "$fail" -gt 0 ]; then + exit 1 +fi diff --git a/tests/test-rosetta-msync.sh b/tests/test-rosetta-msync.sh new file mode 100644 index 0000000..ab52001 --- /dev/null +++ b/tests/test-rosetta-msync.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# test-rosetta-msync.sh - msync(MS_SYNC/MS_ASYNC) on high-VA regions via Rosetta +# +# Copyright 2026 elfuse contributors +# SPDX-License-Identifier: Apache-2.0 +# +# Regression for elfuse sys_msync rejecting high-VA mmap regions with ENOMEM +# (issue #108). Under Rosetta, a file-backed MAP_SHARED mmap(NULL) lands in the +# high-VA window where sys_msync was primary-window-only and returned ENOMEM for +# every msync, and sys_mmap_high_va rejected the mapping outright with ENODEV. +# apt msyncs its MAP_SHARED package-list cache, so under an x86_64 guest this +# surfaced as "Unable to synchronize mmap - msync (12: Cannot allocate memory)" +# and aborted `apt update` / `apt upgrade`. +# +# Fixture: tests/fixtures/rosetta/x86_64-rosetta-msync (vendored x86_64 ELF). +# +# Usage: tests/test-rosetta-msync.sh [path/to/elfuse] + +set -euo pipefail + +ELFUSE_INPUT="${1:-build/elfuse}" +case "$ELFUSE_INPUT" in + /*) ELFUSE="$ELFUSE_INPUT" ;; + *) ELFUSE="$(pwd)/$ELFUSE_INPUT" ;; +esac + +ROSETTA_PATH="${MATRIX_ROSETTA_TRANSLATOR:-/Library/Apple/usr/libexec/oah/RosettaLinux/rosetta}" +MSYNC_BIN="$(pwd)/tests/fixtures/rosetta/x86_64-rosetta-msync" + +# shellcheck source=tests/lib/rosetta-test.sh +. 
"$(dirname "$0")/lib/rosetta-test.sh" + +pass=0 +fail=0 +skip=0 +total=0 + +if [ ! -x "$ROSETTA_PATH" ]; then + printf 'rosetta translator not found at %s\n' "$ROSETTA_PATH" >&2 + exit 77 +fi +if [ ! -x "$ELFUSE" ]; then + printf 'elfuse binary not found: %s\n' "$ELFUSE" >&2 + exit 1 +fi + +require_timeout + +if [ ! -x "$MSYNC_BIN" ]; then + printf 'vendored Rosetta msync fixture missing under tests/fixtures/rosetta/\n' >&2 + exit 77 +fi + +total=$((total + 1)) +set +e +msync_out="$("$TIMEOUT" 30 "$ELFUSE" "$MSYNC_BIN" 2>&1)" +msync_rc=$? +set -e +if [ "$msync_rc" -eq 0 ] && + printf '%s\n' "$msync_out" | grep -q 'msync high-VA: all subtests passed'; then + report_pass "msync-high-va-writeback" +else + report_fail "msync-high-va-writeback: rc=$msync_rc" + printf '%s\n' "$msync_out" >&2 +fi + +report_summary "$total" +if [ "$fail" -gt 0 ]; then + exit 1 +fi diff --git a/tests/x86_64-rosetta-madvise.c b/tests/x86_64-rosetta-madvise.c new file mode 100644 index 0000000..f9b98df --- /dev/null +++ b/tests/x86_64-rosetta-madvise.c @@ -0,0 +1,185 @@ +/* x86_64-rosetta-madvise.c - madvise(MADV_DONTNEED) on high-VA regions + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Regression for elfuse sys_madvise rejecting high-VA mmap regions with + * ENOMEM. Under Rosetta, anonymous mmap(NULL) lands in the high-VA window + * (the region's gpa_base diverges from its VA start), where sys_madvise was + * primary-window-only: it computed off = addr - ipa_base and rejected any + * range past guest_size with ENOMEM, even though sys_mprotect already handles + * the same high-VA range. V8's page allocator decommits guard/code pages with + * mprotect(PROT_NONE)+madvise(MADV_DONTNEED) and CHECK_EQ(0, ret)s the madvise + * return, so the spurious ENOMEM aborted x86_64 Node.js the moment its JIT + * initialized. + * + * Each subtest prints "PASS " / "FAIL "; main() exits non-zero on + * any failure so the shell harness can gate on the exit code. 
+ * + * This is an x86_64 Linux static ELF, run through elfuse + Rosetta. It is not + * built in-tree (the Makefile builds aarch64 hosts); rebuild out of tree and + * re-vendor per tests/fixtures/rosetta/README.md. + */ + +#include <errno.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> + +#ifndef MADV_DONTNEED +#define MADV_DONTNEED 4 +#endif + +#define PAGE ((size_t) 4096) + +static int fails; + +/* The primary IPA window is a handful of GiB; Rosetta places guest mappings at + * their native x86_64 VAs far above it. Anything past 4 GiB is the high-VA + * window that exercises the regression. */ +static int is_high_va(const void *p) +{ + return (uint64_t) (uintptr_t) p > 0x100000000ULL; +} + +/* MADV_DONTNEED on a writable high-VA page returns 0 and zero-fills. */ +static void test_dontneed_rw(void) +{ + void *p = mmap(NULL, PAGE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + printf("FAIL dontneed-rw: mmap errno=%d\n", errno); + fails++; + return; + } + if (!is_high_va(p)) { + printf("FAIL dontneed-rw: mapping not in high-VA window (%p)\n", p); + fails++; + munmap(p, PAGE); + return; + } + memset(p, 0xAA, PAGE); + errno = 0; + if (madvise(p, PAGE, MADV_DONTNEED) != 0) { + printf("FAIL dontneed-rw: madvise rc=-1 errno=%d\n", errno); + fails++; + munmap(p, PAGE); + return; + } + for (unsigned i = 0; i < PAGE; i++) { + if (((unsigned char *) p)[i] != 0) { + printf("FAIL dontneed-rw: byte %u not zeroed\n", i); + fails++; + munmap(p, PAGE); + return; + } + } + printf("PASS dontneed-rw\n"); + munmap(p, PAGE); +} + +/* The exact V8 decommit pattern: a guard page is set PROT_NONE and then + * MADV_DONTNEED'd. Linux returns 0 for a mapped-but-PROT_NONE page; after + * re-granting RW the page reads back as zero. 
*/ +static void test_dontneed_protnone(void) +{ + size_t sz = 2u * PAGE; + void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + printf("FAIL dontneed-protnone: mmap errno=%d\n", errno); + fails++; + return; + } + if (!is_high_va(p)) { + printf("FAIL dontneed-protnone: mapping not in high-VA window (%p)\n", + p); + fails++; + munmap(p, sz); + return; + } + void *guard = (char *) p + PAGE; + memset(p, 0xBB, sz); + if (mprotect(guard, PAGE, PROT_NONE) != 0) { + printf("FAIL dontneed-protnone: mprotect PROT_NONE errno=%d\n", errno); + fails++; + munmap(p, sz); + return; + } + errno = 0; + if (madvise(guard, PAGE, MADV_DONTNEED) != 0) { + printf("FAIL dontneed-protnone: madvise rc=-1 errno=%d\n", errno); + fails++; + munmap(p, sz); + return; + } + if (mprotect(guard, PAGE, PROT_READ | PROT_WRITE) != 0) { + printf("FAIL dontneed-protnone: re-grant RW errno=%d\n", errno); + fails++; + munmap(p, sz); + return; + } + for (unsigned i = 0; i < PAGE; i++) { + if (((unsigned char *) guard)[i] != 0) { + printf("FAIL dontneed-protnone: guard byte %u not zeroed\n", i); + fails++; + munmap(p, sz); + return; + } + } + printf("PASS dontneed-protnone\n"); + munmap(p, sz); +} + +/* Multi-page MADV_DONTNEED across a high-VA span returns 0 and zero-fills. 
*/ +static void test_dontneed_multi(void) +{ + size_t sz = 16u * PAGE; + void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + printf("FAIL dontneed-multi: mmap errno=%d\n", errno); + fails++; + return; + } + if (!is_high_va(p)) { + printf("FAIL dontneed-multi: mapping not in high-VA window (%p)\n", p); + fails++; + munmap(p, sz); + return; + } + memset(p, 0xCC, sz); + errno = 0; + if (madvise(p, sz, MADV_DONTNEED) != 0) { + printf("FAIL dontneed-multi: madvise rc=-1 errno=%d\n", errno); + fails++; + munmap(p, sz); + return; + } + for (size_t i = 0; i < sz; i++) { + if (((unsigned char *) p)[i] != 0) { + printf("FAIL dontneed-multi: byte %zu not zeroed\n", i); + fails++; + munmap(p, sz); + return; + } + } + printf("PASS dontneed-multi\n"); + munmap(p, sz); +} + +int main(void) +{ + test_dontneed_rw(); + test_dontneed_protnone(); + test_dontneed_multi(); + + if (fails) { + printf("madvise high-VA: %d subtest(s) failed\n", fails); + return 1; + } + printf("madvise high-VA: all subtests passed\n"); + return 0; +} diff --git a/tests/x86_64-rosetta-mremap.c b/tests/x86_64-rosetta-mremap.c new file mode 100644 index 0000000..1276e02 --- /dev/null +++ b/tests/x86_64-rosetta-mremap.c @@ -0,0 +1,161 @@ +/* x86_64-rosetta-mremap.c - mremap() on high-VA source regions + * + * Copyright 2026 elfuse contributors + * SPDX-License-Identifier: Apache-2.0 + * + * Regression for elfuse sys_mremap rejecting high-VA mmap regions. Under + * Rosetta, mmap(NULL) lands in the high-VA window; sys_mremap was + * primary-window-only (off = addr - ipa_base rejected past guest_size with + * EFAULT) and its shrink / move paths read and zeroed the source through raw + * host_base + off, which only resolves for identity regions. 
The fix admits + * high-VA sources and resolves the source through the region's gpa_base + * (host_ptr_for_gpa); the destination is still allocated in the primary window, + * so an mremap(MAYMOVE) of a high-VA region relocates it there with its + * contents intact. + * + * Each subtest prints "PASS <name>" / "FAIL <name>"; main() exits non-zero on + * any failure so the shell harness can gate on the exit code. + * + * This is an x86_64 Linux static ELF, run through elfuse + Rosetta. It is not + * built in-tree (the Makefile builds aarch64 hosts); rebuild out of tree and + * re-vendor per tests/fixtures/rosetta/README.md. + */ + +#define _GNU_SOURCE /* mremap */ +#include <errno.h> +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> +#include <unistd.h> + +#ifndef MREMAP_MAYMOVE +#define MREMAP_MAYMOVE 1 +#endif + +#define PAGE ((size_t) 4096) + +static int fails; + +/* The primary IPA window is a handful of GiB; Rosetta places guest mappings at + * their native x86_64 VAs far above it. Anything past 4 GiB is the high-VA + * window that exercises the regression. */ +static int is_high_va(const void *p) +{ + return (uint64_t) (uintptr_t) p > 0x100000000ULL; +} + +static int all_equal(const void *p, + size_t n, + unsigned char want, + const char *tag) +{ + const unsigned char *b = p; + for (size_t i = 0; i < n; i++) { + if (b[i] != want) { + printf("FAIL %s: byte %zu = 0x%02x, want 0x%02x\n", tag, i, b[i], + want); + return 0; + } + } + return 1; +} + +/* MREMAP_MAYMOVE growing a high-VA anonymous region relocates it (to the + * primary window) with the original bytes preserved and the extension zeroed. 
+ */ +static void test_mremap_maymove_grow(void) +{ + const char *tag = "mremap-maymove-grow"; + size_t osz = 4u * PAGE, nsz = 8u * PAGE; + void *p = mmap(NULL, osz, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + printf("FAIL %s: mmap errno=%d\n", tag, errno); + fails++; + return; + } + if (!is_high_va(p)) { + printf("FAIL %s: source not in high-VA window (%p)\n", tag, p); + fails++; + munmap(p, osz); + return; + } + memset(p, 0x5A, osz); + errno = 0; + void *q = mremap(p, osz, nsz, MREMAP_MAYMOVE); + if (q == MAP_FAILED) { + printf("FAIL %s: mremap rc=-1 errno=%d\n", tag, errno); + fails++; + munmap(p, osz); + return; + } + if (!all_equal(q, osz, 0x5A, tag)) { + fails++; + munmap(q, nsz); + return; + } + if (!all_equal((char *) q + osz, nsz - osz, 0x00, tag)) { + fails++; + munmap(q, nsz); + return; + } + printf("PASS %s\n", tag); + munmap(q, nsz); +} + +/* Shrinking a high-VA region in place keeps its base address and preserves the + * retained head; only the trimmed tail is released. 
*/
+static void test_mremap_shrink(void)
+{
+    const char *tag = "mremap-shrink";
+    const size_t old_len = 8u * PAGE;
+    const size_t new_len = 4u * PAGE;
+    void *shrunk;
+
+    void *src = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
+                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (src == MAP_FAILED) {
+        printf("FAIL %s: mmap errno=%d\n", tag, errno);
+        fails++;
+        return;
+    }
+
+    /* The mapping to release on exit: the full source until mremap succeeds,
+     * the shortened result afterwards. */
+    void *live = src;
+    size_t live_len = old_len;
+
+    if (!is_high_va(src)) {
+        printf("FAIL %s: source not in high-VA window (%p)\n", tag, src);
+        fails++;
+        goto unmap;
+    }
+
+    memset(src, 0x3C, old_len);
+    errno = 0;
+    /* flags == 0: no MREMAP_MAYMOVE, so the result must stay at `src`. */
+    shrunk = mremap(src, old_len, new_len, 0);
+    if (shrunk == MAP_FAILED) {
+        printf("FAIL %s: mremap rc=-1 errno=%d\n", tag, errno);
+        fails++;
+        goto unmap;
+    }
+    live = shrunk;
+    live_len = new_len;
+
+    if (shrunk != src) {
+        printf("FAIL %s: in-place shrink moved %p -> %p\n", tag, src, shrunk);
+        fails++;
+        goto unmap;
+    }
+    /* all_equal prints its own FAIL line on a mismatch. */
+    if (!all_equal(shrunk, new_len, 0x3C, tag)) {
+        fails++;
+        goto unmap;
+    }
+
+    printf("PASS %s\n", tag);
+
+unmap:
+    munmap(live, live_len);
+}
+
+/* Run both subtests, then report the tally; exit status is 1 if any failed. */
+int main(void)
+{
+    test_mremap_maymove_grow();
+    test_mremap_shrink();
+
+    if (fails == 0) {
+        printf("mremap high-VA: all subtests passed\n");
+        return 0;
+    }
+    printf("mremap high-VA: %d subtest(s) failed\n", fails);
+    return 1;
+}
diff --git a/tests/x86_64-rosetta-msync.c b/tests/x86_64-rosetta-msync.c
new file mode 100644
index 0000000..d52900b
--- /dev/null
+++ b/tests/x86_64-rosetta-msync.c
@@ -0,0 +1,242 @@
+/* x86_64-rosetta-msync.c - msync(MS_SYNC/MS_ASYNC) on high-VA regions
+ *
+ * Copyright 2026 elfuse contributors
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Regression for elfuse sys_msync rejecting high-VA mmap regions with ENOMEM
+ * (issue #108). Under Rosetta, a file-backed MAP_SHARED mmap(NULL) lands in the
+ * high-VA window (the region's gpa_base diverges from its VA start), where
+ * sys_msync was primary-window-only: it computed off = addr - ipa_base and
+ * rejected any range past guest_size with ENOMEM.
apt memory-maps its package
+ * lists with MAP_SHARED and periodically msyncs them, so under an x86_64 guest
+ * the spurious ENOMEM surfaced as:
+ *   E: Unable to synchronize mmap - msync (12: Cannot allocate memory)
+ * and aborted `apt update` / `apt upgrade` at package-list parsing.
+ *
+ * The fix admits high-VA ranges the region tracker records as mapped and
+ * resolves the write-back / refresh host pointers through host_ptr_for_gpa
+ * (gpa_base + ...) instead of host_base + start, so a high-VA MAP_SHARED file
+ * mapping msyncs its dirty bytes to the backing file instead of failing.
+ *
+ * Each subtest prints "PASS <tag>" / "FAIL <tag>: <reason>"; main() exits
+ * non-zero on any failure so the shell harness can gate on the exit code.
+ *
+ * This is an x86_64 Linux static ELF, run through elfuse + Rosetta. It is not
+ * built in-tree (the Makefile builds aarch64 hosts); rebuild out of tree and
+ * re-vendor per tests/fixtures/rosetta/README.md.
+ */
+
+#include <errno.h>     /* errno */
+#include <fcntl.h>     /* open, O_CREAT, O_EXCL, O_RDWR */
+#include <stdint.h>    /* uint64_t, uintptr_t */
+#include <stdio.h>     /* printf, snprintf */
+#include <string.h>    /* memset */
+#include <sys/mman.h>  /* mmap, msync, munmap */
+#include <sys/types.h> /* off_t, ssize_t */
+#include <unistd.h>    /* close, ftruncate, getpid, pread, unlink */
+
+#define PAGE ((size_t) 4096)
+
+/* Number of failed subtests; main() maps a non-zero count to exit status 1. */
+static int fails;
+
+/* The primary IPA window is a handful of GiB; Rosetta places guest mappings at
+ * their native x86_64 VAs far above it. Anything past 4 GiB is the high-VA
+ * window that exercises the regression. */
+static int is_high_va(const void *p)
+{
+    return (uint64_t) (uintptr_t) p > 0x100000000ULL;
+}
+
+/* Create a zero-filled backing file of `sz` bytes, returning an open O_RDWR fd
+ * (unlinked so it cleans up on close). -1 on error.
+ *
+ * Backed by /dev/shm, not host-passthrough /tmp: elfuse refuses mmap on
+ * FUSE-served fds (ENODEV), and /dev/shm is the mmap-able tmpfs the aarch64
+ * test-msync uses too. `slot` disambiguates concurrent subtests. glibc static
+ * shm_open is unreliable, so open the path directly.
*/
+static int make_backing(size_t sz, int slot)
+{
+    char path[64];
+    snprintf(path, sizeof(path), "/dev/shm/elfuse-rosetta-msync-%d-%d",
+             (int) getpid(), slot);
+    int fd = open(path, O_CREAT | O_EXCL | O_RDWR, 0600);
+    if (fd < 0)
+        return -1;
+    /* Drop the name right away; the return value is not checked. */
+    unlink(path);
+    if (ftruncate(fd, (off_t) sz) != 0) {
+        /* Callers print errno after a -1 return, so keep ftruncate's value
+         * from being overwritten by close(). */
+        int saved_errno = errno;
+        close(fd);
+        errno = saved_errno;
+        return -1;
+    }
+    return fd;
+}
+
+/* Confirm that the first `sz` bytes of the file behind `fd` all equal `want`.
+ *
+ * The bytes are fetched with pread() on the descriptor rather than read back
+ * through the mapping, so the check reflects what actually reached the backing
+ * file. Returns 1 when every byte matches. On a failed or zero-length read, or
+ * on the first mismatching byte, prints a "FAIL <tag>: ..." line and returns
+ * 0; the caller is the one that bumps `fails`. */
+static int file_all_equal(int fd,
+                          size_t sz,
+                          unsigned char want,
+                          const char *tag)
+{
+    unsigned char buf[PAGE];
+
+    for (size_t off = 0; off < sz;) {
+        size_t n = sz - off < sizeof(buf) ? sz - off : sizeof(buf);
+        /* Clear errno so an unexpected EOF (rc=0) reports errno=0 rather than
+         * a stale value from an earlier call. */
+        errno = 0;
+        ssize_t nr = pread(fd, buf, n, (off_t) off);
+        if (nr <= 0) {
+            printf("FAIL %s: pread rc=%zd errno=%d\n", tag, nr, errno);
+            return 0;
+        }
+        for (ssize_t i = 0; i < nr; i++) {
+            if (buf[i] != want) {
+                printf("FAIL %s: byte %zu = 0x%02x, want 0x%02x\n", tag,
+                       off + (size_t) i, buf[i], want);
+                return 0;
+            }
+        }
+        /* Short reads are fine: resume from wherever this one stopped. */
+        off += (size_t) nr;
+    }
+    return 1;
+}
+
+/* MS_SYNC on a single writable high-VA MAP_SHARED page returns 0 and the write
+ * reaches the backing file.
*/
+static void test_msync_writeback_single(void)
+{
+    const char *tag = "msync-writeback-single";
+    void *map;
+
+    int fd = make_backing(PAGE, 0);
+    if (fd < 0) {
+        printf("FAIL %s: make_backing errno=%d\n", tag, errno);
+        fails++;
+        return;
+    }
+
+    map = mmap(NULL, PAGE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (map == MAP_FAILED) {
+        printf("FAIL %s: mmap errno=%d\n", tag, errno);
+        fails++;
+        goto close_fd;
+    }
+    if (!is_high_va(map)) {
+        printf("FAIL %s: mapping not in high-VA window (%p)\n", tag, map);
+        fails++;
+        goto unmap;
+    }
+
+    /* Dirty the page through the mapping, then ask for a synchronous flush. */
+    memset(map, 0x5A, PAGE);
+    errno = 0;
+    if (msync(map, PAGE, MS_SYNC) != 0) {
+        printf("FAIL %s: msync rc=-1 errno=%d\n", tag, errno);
+        fails++;
+        goto unmap;
+    }
+    /* file_all_equal prints its own FAIL line on a mismatch. */
+    if (!file_all_equal(fd, PAGE, 0x5A, tag)) {
+        fails++;
+        goto unmap;
+    }
+
+    printf("PASS %s\n", tag);
+
+unmap:
+    munmap(map, PAGE);
+close_fd:
+    close(fd);
+}
+
+/* MS_SYNC across a multi-page high-VA span returns 0 and writes back fully.
*/
+static void test_msync_writeback_multi(void)
+{
+    const char *tag = "msync-writeback-multi";
+    const size_t span = 16u * PAGE;
+    void *map;
+
+    int fd = make_backing(span, 1);
+    if (fd < 0) {
+        printf("FAIL %s: make_backing errno=%d\n", tag, errno);
+        fails++;
+        return;
+    }
+
+    map = mmap(NULL, span, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (map == MAP_FAILED) {
+        printf("FAIL %s: mmap errno=%d\n", tag, errno);
+        fails++;
+        goto close_fd;
+    }
+    if (!is_high_va(map)) {
+        printf("FAIL %s: mapping not in high-VA window (%p)\n", tag, map);
+        fails++;
+        goto unmap;
+    }
+
+    /* Dirty every page of the span, then flush the whole range at once. */
+    memset(map, 0x3C, span);
+    errno = 0;
+    if (msync(map, span, MS_SYNC) != 0) {
+        printf("FAIL %s: msync rc=-1 errno=%d\n", tag, errno);
+        fails++;
+        goto unmap;
+    }
+    /* file_all_equal prints its own FAIL line on a mismatch. */
+    if (!file_all_equal(fd, span, 0x3C, tag)) {
+        fails++;
+        goto unmap;
+    }
+
+    printf("PASS %s\n", tag);
+
+unmap:
+    munmap(map, span);
+close_fd:
+    close(fd);
+}
+
+/* MS_ASYNC on a high-VA mapping returns 0 (the primary #108 symptom is the
+ * -ENOMEM admission failure, which is flag-independent).
*/
+static void test_msync_async(void)
+{
+    const char *tag = "msync-async";
+    void *map;
+
+    int fd = make_backing(PAGE, 2);
+    if (fd < 0) {
+        printf("FAIL %s: make_backing errno=%d\n", tag, errno);
+        fails++;
+        return;
+    }
+
+    map = mmap(NULL, PAGE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (map == MAP_FAILED) {
+        printf("FAIL %s: mmap errno=%d\n", tag, errno);
+        fails++;
+        goto close_fd;
+    }
+    if (!is_high_va(map)) {
+        printf("FAIL %s: mapping not in high-VA window (%p)\n", tag, map);
+        fails++;
+        goto unmap;
+    }
+
+    /* Only the msync return code is checked here; the file contents are not
+     * read back after an asynchronous request. */
+    memset(map, 0x77, PAGE);
+    errno = 0;
+    if (msync(map, PAGE, MS_ASYNC) != 0) {
+        printf("FAIL %s: msync rc=-1 errno=%d\n", tag, errno);
+        fails++;
+        goto unmap;
+    }
+
+    printf("PASS %s\n", tag);
+
+unmap:
+    munmap(map, PAGE);
+close_fd:
+    close(fd);
+}
+
+/* Run the three subtests, then report the tally; exit status is 1 if any
+ * failed. */
+int main(void)
+{
+    test_msync_writeback_single();
+    test_msync_writeback_multi();
+    test_msync_async();
+
+    if (fails == 0) {
+        printf("msync high-VA: all subtests passed\n");
+        return 0;
+    }
+    printf("msync high-VA: %d subtest(s) failed\n", fails);
+    return 1;
+}