diff --git a/Justfile b/Justfile index 8b0578c7f..aed410dcc 100644 --- a/Justfile +++ b/Justfile @@ -268,7 +268,7 @@ test-rust-tracing target=default-target features="": check-i686 target=default-target: cargo check -p hyperlight-common --target i686-unknown-linux-gnu --profile={{ if target == "debug" { "dev" } else { target } }} cargo check -p hyperlight-guest --target i686-unknown-linux-gnu --profile={{ if target == "debug" { "dev" } else { target } }} - cargo check -p hyperlight-common --target i686-unknown-linux-gnu --features nanvix-unstable --profile={{ if target == "debug" { "dev" } else { target } }} + cargo check -p hyperlight-common --target i686-unknown-linux-gnu --features i686-guest --profile={{ if target == "debug" { "dev" } else { target } }} # Verify that trace_guest correctly fails on i686 (compile_error should trigger) ! cargo check -p hyperlight-guest --target i686-unknown-linux-gnu --features trace_guest --profile={{ if target == "debug" { "dev" } else { target } }} 2>/dev/null @@ -291,8 +291,8 @@ check: {{ cargo-cmd }} check -p hyperlight-host --features print_debug {{ target-triple-flag }} {{ cargo-cmd }} check -p hyperlight-host --features gdb {{ target-triple-flag }} {{ cargo-cmd }} check -p hyperlight-host --features trace_guest,mem_profile {{ target-triple-flag }} - {{ cargo-cmd }} check -p hyperlight-host --features nanvix-unstable {{ target-triple-flag }} - {{ cargo-cmd }} check -p hyperlight-host --features nanvix-unstable,executable_heap {{ target-triple-flag }} + {{ cargo-cmd }} check -p hyperlight-host --features i686-guest {{ target-triple-flag }} + {{ cargo-cmd }} check -p hyperlight-host --features i686-guest,executable_heap {{ target-triple-flag }} {{ cargo-cmd }} check -p hyperlight-host --features hw-interrupts {{ target-triple-flag }} fmt-check: (ensure-nightly-fmt) diff --git a/src/hyperlight_common/Cargo.toml b/src/hyperlight_common/Cargo.toml index 68ebcae71..e568abac1 100644 --- a/src/hyperlight_common/Cargo.toml +++ 
b/src/hyperlight_common/Cargo.toml @@ -31,7 +31,9 @@ fuzzing = ["dep:arbitrary"] trace_guest = [] mem_profile = [] std = ["thiserror/std", "log/std", "tracing/std"] -nanvix-unstable = [] +i686-guest = [] +nanvix-unstable = ["i686-guest"] +guest-counter = [] [lib] bench = false # see https://bheisler.github.io/criterion.rs/book/faq.html#cargo-bench-gives-unrecognized-option-errors-for-valid-command-line-options diff --git a/src/hyperlight_common/src/arch/amd64/vmem.rs b/src/hyperlight_common/src/arch/amd64/vmem.rs index 9505dda16..a90314646 100644 --- a/src/hyperlight_common/src/arch/amd64/vmem.rs +++ b/src/hyperlight_common/src/arch/amd64/vmem.rs @@ -596,6 +596,72 @@ pub type PageTableEntry = u64; pub type VirtAddr = u64; pub type PhysAddr = u64; +/// i686 guest page-table walker and PTE constants for the x86_64 host. +/// +/// When the host builds with `i686-guest`, it needs to walk 2-level i686 +/// page tables in guest memory. The `arch/i686/vmem.rs` module only compiles +/// for `target_arch = "x86"` (the guest side), so the host-side walker lives +/// here, gated behind the feature flag. +#[cfg(feature = "i686-guest")] +pub mod i686_guest { + use alloc::vec::Vec; + + use crate::vmem::{BasicMapping, CowMapping, Mapping, MappingKind, TableReadOps}; + + pub const PAGE_PRESENT: u64 = 1; + pub const PAGE_RW: u64 = 1 << 1; + pub const PAGE_USER: u64 = 1 << 2; + pub const PAGE_ACCESSED: u64 = 1 << 5; + pub const PAGE_AVL_COW: u64 = 1 << 9; + pub const PTE_ADDR_MASK: u64 = 0xFFFFF000; + + /// Walk an i686 2-level page table and return all present mappings. + /// + /// # Safety + /// The caller must ensure that `op` provides valid page table memory. 
+ pub unsafe fn virt_to_phys_all<Op: TableReadOps>(op: &Op) -> Vec<Mapping> { + let root = op.root_table(); + let mut mappings = Vec::new(); + for pdi in 0..1024u64 { + let pde_ptr = Op::entry_addr(root, pdi * 4); + let pde: u64 = unsafe { op.read_entry(pde_ptr) }; + if (pde & PAGE_PRESENT) == 0 { + continue; + } + let pt_phys = pde & PTE_ADDR_MASK; + let pt_base = Op::from_phys(pt_phys as crate::vmem::PhysAddr); + for pti in 0..1024u64 { + let pte_ptr = Op::entry_addr(pt_base, pti * 4); + let pte: u64 = unsafe { op.read_entry(pte_ptr) }; + if (pte & PAGE_PRESENT) == 0 { + continue; + } + let phys_base = pte & PTE_ADDR_MASK; + let virt_base = (pdi << 22) | (pti << 12); + let kind = if (pte & PAGE_AVL_COW) != 0 { + MappingKind::Cow(CowMapping { + readable: true, + executable: true, + }) + } else { + MappingKind::Basic(BasicMapping { + readable: true, + writable: (pte & PAGE_RW) != 0, + executable: true, + }) + }; + mappings.push(Mapping { + phys_base, + virt_base, + len: super::PAGE_SIZE as u64, + kind, + }); + } + } + mappings + } +} + #[cfg(test)] mod tests { use alloc::vec; diff --git a/src/hyperlight_common/src/arch/i686/layout.rs b/src/hyperlight_common/src/arch/i686/layout.rs index f3601c643..85fbff91b 100644 --- a/src/hyperlight_common/src/arch/i686/layout.rs +++ b/src/hyperlight_common/src/arch/i686/layout.rs @@ -14,12 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. */ -// This file is just dummy definitions at the moment, in order to -// allow compiling the guest for real mode boot scenarios. +// i686 layout constants for 32-bit protected mode with paging. pub const MAX_GVA: usize = 0xffff_ffff; pub const MAX_GPA: usize = 0xffff_ffff; -pub fn min_scratch_size(_input_data_size: usize, _output_data_size: usize) -> usize { - crate::vmem::PAGE_SIZE +/// Minimum scratch region size: IO buffers (page-aligned) plus 12 pages +/// for bookkeeping and the exception stack.
Page table space is validated +/// separately by `set_pt_size()`. +pub fn min_scratch_size(input_data_size: usize, output_data_size: usize) -> usize { + (input_data_size + output_data_size).next_multiple_of(crate::vmem::PAGE_SIZE) + + 12 * crate::vmem::PAGE_SIZE } diff --git a/src/hyperlight_common/src/layout.rs b/src/hyperlight_common/src/layout.rs index 64b79d982..3fe6bfc80 100644 --- a/src/hyperlight_common/src/layout.rs +++ b/src/hyperlight_common/src/layout.rs @@ -16,11 +16,11 @@ limitations under the License. #[cfg_attr(target_arch = "x86", path = "arch/i686/layout.rs")] #[cfg_attr( - all(target_arch = "x86_64", not(feature = "nanvix-unstable")), + all(target_arch = "x86_64", not(feature = "i686-guest")), path = "arch/amd64/layout.rs" )] #[cfg_attr( - all(target_arch = "x86_64", feature = "nanvix-unstable"), + all(target_arch = "x86_64", feature = "i686-guest"), path = "arch/i686/layout.rs" )] #[cfg_attr(target_arch = "aarch64", path = "arch/aarch64/layout.rs")] @@ -28,7 +28,7 @@ mod arch; pub use arch::{MAX_GPA, MAX_GVA}; #[cfg(any( - all(target_arch = "x86_64", not(feature = "nanvix-unstable")), + all(target_arch = "x86_64", not(feature = "i686-guest")), target_arch = "aarch64" ))] pub use arch::{SNAPSHOT_PT_GVA_MAX, SNAPSHOT_PT_GVA_MIN}; @@ -39,13 +39,25 @@ pub const SCRATCH_TOP_ALLOCATOR_OFFSET: u64 = 0x10; pub const SCRATCH_TOP_SNAPSHOT_PT_GPA_BASE_OFFSET: u64 = 0x18; pub const SCRATCH_TOP_EXN_STACK_OFFSET: u64 = 0x20; +/// Offset from the top of scratch for the number of active page directory roots. +/// The guest writes this before signaling boot-complete so the host can walk +/// all active PDs during snapshot creation (not just CR3). +#[cfg(feature = "i686-guest")] +pub const SCRATCH_TOP_PD_ROOTS_COUNT_OFFSET: u64 = 0x28; +/// Offset from the top of scratch for the PD roots array (u32 GPAs on i686). 
+#[cfg(feature = "i686-guest")] +pub const SCRATCH_TOP_PD_ROOTS_ARRAY_OFFSET: u64 = 0x30; +/// Maximum number of PD roots the guest can expose to the host. +#[cfg(feature = "i686-guest")] +pub const MAX_PD_ROOTS: usize = 32; + /// Offset from the top of scratch memory for a shared host-guest u64 counter. /// /// This is placed at 0x1008 (rather than the next sequential 0x28) so that the /// counter falls in scratch page 0xffffe000 instead of the very last page /// 0xfffff000, which on i686 guests would require frame 0xfffff — exceeding the /// maximum representable frame number. -#[cfg(feature = "nanvix-unstable")] +#[cfg(feature = "guest-counter")] pub const SCRATCH_TOP_GUEST_COUNTER_OFFSET: u64 = 0x1008; pub fn scratch_base_gpa(size: usize) -> u64 { diff --git a/src/hyperlight_common/src/vmem.rs b/src/hyperlight_common/src/vmem.rs index 96de9f334..55f4a9cc6 100644 --- a/src/hyperlight_common/src/vmem.rs +++ b/src/hyperlight_common/src/vmem.rs @@ -19,9 +19,26 @@ limitations under the License. #[cfg_attr(target_arch = "aarch64", path = "arch/aarch64/vmem.rs")] mod arch; +// The `i686-guest` feature is consumed two ways: the guest itself +// compiles for `target_arch = "x86"`, and the x86_64 host compiles +// it to pick up the PT walker under `vmem::i686_guest`. Enabling +// the feature on any other host (e.g. aarch64) would leave the +// re-export missing and produce confusing errors in downstream +// crates — surface it up front. +#[cfg(all( + feature = "i686-guest", + not(any(target_arch = "x86", target_arch = "x86_64")) +))] +compile_error!( + "the `i686-guest` feature is only supported on `target_arch = \"x86\"` (guest) or \ + `target_arch = \"x86_64\"` (host) targets" +); + /// This is always the page size that the /guest/ is being compiled /// for, which may or may not be the same as the host page size. 
pub use arch::PAGE_SIZE; +#[cfg(all(feature = "i686-guest", target_arch = "x86_64"))] +pub use arch::i686_guest; pub use arch::{PAGE_TABLE_SIZE, PageTableEntry, PhysAddr, VirtAddr}; pub const PAGE_TABLE_ENTRIES_PER_TABLE: usize = PAGE_TABLE_SIZE / core::mem::size_of::<PageTableEntry>(); diff --git a/src/hyperlight_guest/Cargo.toml b/src/hyperlight_guest/Cargo.toml index 3c985d1dc..d9de514ae 100644 --- a/src/hyperlight_guest/Cargo.toml +++ b/src/hyperlight_guest/Cargo.toml @@ -24,4 +24,5 @@ hyperlight-guest-tracing = { workspace = true, default-features = false, optiona [features] default = [] trace_guest = ["dep:hyperlight-guest-tracing", "hyperlight-guest-tracing?/trace"] -nanvix-unstable = ["hyperlight-common/nanvix-unstable"] +i686-guest = ["hyperlight-common/i686-guest"] +guest-counter = ["hyperlight-common/guest-counter"] diff --git a/src/hyperlight_guest/src/layout.rs b/src/hyperlight_guest/src/layout.rs index c1f5839c0..74d03feed 100644 --- a/src/hyperlight_guest/src/layout.rs +++ b/src/hyperlight_guest/src/layout.rs @@ -35,7 +35,7 @@ pub fn snapshot_pt_gpa_base_gva() -> *mut u64 { pub use arch::{scratch_base_gpa, scratch_base_gva}; /// Returns a pointer to the guest counter u64 in scratch memory.
-#[cfg(feature = "nanvix-unstable")] +#[cfg(feature = "guest-counter")] pub fn guest_counter_gva() -> *const u64 { use hyperlight_common::layout::{MAX_GVA, SCRATCH_TOP_GUEST_COUNTER_OFFSET}; (MAX_GVA as u64 - SCRATCH_TOP_GUEST_COUNTER_OFFSET + 1) as *const u64 diff --git a/src/hyperlight_host/Cargo.toml b/src/hyperlight_host/Cargo.toml index 1c683e9b9..02f5a9bbd 100644 --- a/src/hyperlight_host/Cargo.toml +++ b/src/hyperlight_host/Cargo.toml @@ -137,7 +137,9 @@ hw-interrupts = [] gdb = ["dep:gdbstub", "dep:gdbstub_arch"] fuzzing = ["hyperlight-common/fuzzing"] build-metadata = ["dep:built"] -nanvix-unstable = ["hyperlight-common/nanvix-unstable"] +i686-guest = ["hyperlight-common/i686-guest"] +nanvix-unstable = ["i686-guest", "hyperlight-common/nanvix-unstable"] +guest-counter = ["hyperlight-common/guest-counter"] [[bench]] name = "benchmarks" diff --git a/src/hyperlight_host/build.rs b/src/hyperlight_host/build.rs index 6f3f9587a..c46c7cb90 100644 --- a/src/hyperlight_host/build.rs +++ b/src/hyperlight_host/build.rs @@ -105,10 +105,8 @@ fn main() -> Result<()> { crashdump: { all(feature = "crashdump", target_arch = "x86_64") }, // print_debug feature is aliased with debug_assertions to make it only available in debug-builds. print_debug: { all(feature = "print_debug", debug_assertions) }, - // the nanvix-unstable and gdb features both (only - // temporarily!) need to use writable/un-shared snapshot - // memories, and so can't share - unshared_snapshot_mem: { any(feature = "nanvix-unstable", feature = "gdb") }, + // gdb (and the legacy nanvix-unstable alias) still needs writable/un-shared snapshot memory.
+ unshared_snapshot_mem: { any(feature = "gdb", feature = "nanvix-unstable") }, } #[cfg(feature = "build-metadata")] diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs index 698ab49e5..8b680244b 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs @@ -100,12 +100,14 @@ impl HyperlightVm { None => return Err(CreateHyperlightVmError::NoHypervisorFound), }; - #[cfg(not(feature = "nanvix-unstable"))] + #[cfg(not(feature = "i686-guest"))] vm.set_sregs(&CommonSpecialRegisters::standard_64bit_defaults(_pml4_addr)) .map_err(VmError::Register)?; - #[cfg(feature = "nanvix-unstable")] - vm.set_sregs(&CommonSpecialRegisters::standard_real_mode_defaults()) - .map_err(VmError::Register)?; + #[cfg(feature = "i686-guest")] + vm.set_sregs(&CommonSpecialRegisters::standard_32bit_paging_defaults( + _pml4_addr, + )) + .map_err(VmError::Register)?; #[cfg(any(kvm, mshv3))] let interrupt_handle: Arc = Arc::new(LinuxInterruptHandle { @@ -248,21 +250,12 @@ impl HyperlightVm { Ok(()) } - /// Get the current base page table physical address. - /// - /// By default, reads CR3 from the vCPU special registers. - /// With `nanvix-unstable`, returns 0 (identity-mapped, no page tables). + /// Get the current base page table physical address from CR3. + #[allow(dead_code)] pub(crate) fn get_root_pt(&self) -> Result { - #[cfg(not(feature = "nanvix-unstable"))] - { - let sregs = self.vm.sregs()?; - // Mask off the flags bits - Ok(sregs.cr3 & !0xfff_u64) - } - #[cfg(feature = "nanvix-unstable")] - { - Ok(0) - } + let sregs = self.vm.sregs()?; + // Mask off the flags bits + Ok(sregs.cr3 & !0xfff_u64) } /// Get the special registers that need to be stored in a snapshot. 
@@ -352,23 +345,12 @@ impl HyperlightVm { self.vm.set_debug_regs(&CommonDebugRegs::default())?; self.vm.reset_xsave()?; - #[cfg(not(feature = "nanvix-unstable"))] - { - // Restore the full special registers from snapshot, but update CR3 - // to point to the new (relocated) page tables - let mut sregs = *sregs; - sregs.cr3 = cr3; - self.pending_tlb_flush = true; - self.vm.set_sregs(&sregs)?; - } - #[cfg(feature = "nanvix-unstable")] - { - let _ = (cr3, sregs); // suppress unused warnings - // TODO: This is probably not correct. - // Let's deal with it when we clean up the nanvix-unstable feature - self.vm - .set_sregs(&CommonSpecialRegisters::standard_real_mode_defaults())?; - } + // Restore the full special registers from snapshot, but update CR3 + // to point to the new (relocated) page tables + let mut sregs = *sregs; + sregs.cr3 = cr3; + self.pending_tlb_flush = true; + self.vm.set_sregs(&sregs)?; Ok(()) } @@ -874,7 +856,7 @@ pub(super) mod debug { } #[cfg(test)] -#[cfg(not(feature = "nanvix-unstable"))] +#[cfg(not(feature = "i686-guest"))] #[allow(clippy::needless_range_loop)] mod tests { use std::sync::{Arc, Mutex}; diff --git a/src/hyperlight_host/src/hypervisor/regs/x86_64/special_regs.rs b/src/hyperlight_host/src/hypervisor/regs/x86_64/special_regs.rs index e51f32bf2..edd4963af 100644 --- a/src/hyperlight_host/src/hypervisor/regs/x86_64/special_regs.rs +++ b/src/hyperlight_host/src/hypervisor/regs/x86_64/special_regs.rs @@ -28,7 +28,7 @@ use windows::Win32::System::Hypervisor::*; use super::FromWhpRegisterError; cfg_if::cfg_if! 
{ - if #[cfg(not(feature = "nanvix-unstable"))] { + if #[cfg(not(feature = "i686-guest"))] { pub(crate) const CR4_PAE: u64 = 1 << 5; pub(crate) const CR4_OSFXSR: u64 = 1 << 9; pub(crate) const CR4_OSXMMEXCPT: u64 = 1 << 10; @@ -69,7 +69,7 @@ pub(crate) struct CommonSpecialRegisters { } impl CommonSpecialRegisters { - #[cfg(not(feature = "nanvix-unstable"))] + #[cfg(not(feature = "i686-guest"))] pub(crate) fn standard_64bit_defaults(pml4_addr: u64) -> Self { CommonSpecialRegisters { cs: CommonSegmentRegister { @@ -104,36 +104,54 @@ impl CommonSpecialRegisters { } } - #[cfg(feature = "nanvix-unstable")] - pub(crate) fn standard_real_mode_defaults() -> Self { + /// Returns special registers for 32-bit protected mode with paging enabled. + /// Used for i686 guests that need CoW page tables from boot. + #[cfg(feature = "i686-guest")] + pub(crate) fn standard_32bit_paging_defaults(pd_addr: u64) -> Self { + // Flat 32-bit code segment: base=0, limit=4GB, 32-bit, executable + let code_seg = CommonSegmentRegister { + base: 0, + selector: 0x08, + limit: 0xFFFFFFFF, + type_: 11, // Execute/Read, Accessed + present: 1, + s: 1, + db: 1, // 32-bit + g: 1, // 4KB granularity + ..Default::default() + }; + // Flat 32-bit data segment: base=0, limit=4GB, 32-bit, writable + let data_seg = CommonSegmentRegister { + base: 0, + selector: 0x10, + limit: 0xFFFFFFFF, + type_: 3, // Read/Write, Accessed + present: 1, + s: 1, + db: 1, // 32-bit + g: 1, // 4KB granularity + ..Default::default() + }; + let tr_seg = CommonSegmentRegister { + base: 0, + selector: 0, + limit: 0xFFFF, + type_: 11, + present: 1, + s: 0, + ..Default::default() + }; CommonSpecialRegisters { - cs: CommonSegmentRegister { - base: 0, - selector: 0, - limit: 0xFFFF, - type_: 11, - present: 1, - s: 1, - ..Default::default() - }, - ds: CommonSegmentRegister { - base: 0, - selector: 0, - limit: 0xFFFF, - type_: 3, - present: 1, - s: 1, - ..Default::default() - }, - tr: CommonSegmentRegister { - base: 0, - selector: 0, - 
limit: 0xFFFF, - type_: 11, - present: 1, - s: 0, - ..Default::default() - }, + cs: code_seg, + ds: data_seg, + es: data_seg, + ss: data_seg, + fs: data_seg, + gs: data_seg, + tr: tr_seg, + cr0: 0x80010011, // PE + ET + WP (write-protect for CoW) + PG + cr3: pd_addr, + cr4: 0, // No PAE, no PSE ..Default::default() } } diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs index c29754e6c..db68dfdd0 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs @@ -36,7 +36,7 @@ use crate::hypervisor::regs::{ CommonDebugRegs, CommonFpu, CommonRegisters, CommonSpecialRegisters, FP_CONTROL_WORD_DEFAULT, MXCSR_DEFAULT, }; -#[cfg(all(test, not(feature = "nanvix-unstable")))] +#[cfg(all(test, not(feature = "i686-guest")))] use crate::hypervisor::virtual_machine::XSAVE_BUFFER_SIZE; #[cfg(feature = "hw-interrupts")] use crate::hypervisor::virtual_machine::x86_64::hw_interrupts::TimerThread; @@ -446,7 +446,7 @@ impl VirtualMachine for KvmVm { } #[cfg(test)] - #[cfg(not(feature = "nanvix-unstable"))] + #[cfg(not(feature = "i686-guest"))] fn set_xsave(&self, xsave: &[u32]) -> std::result::Result<(), RegisterError> { if std::mem::size_of_val(xsave) != XSAVE_BUFFER_SIZE { return Err(RegisterError::XsaveSizeMismatch { diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs index 9edaa1f87..ecb19a09f 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs @@ -112,7 +112,7 @@ pub(crate) const XSAVE_MIN_SIZE: usize = 576; /// Standard XSAVE buffer size (4KB) used by KVM and MSHV. /// WHP queries the required size dynamically. 
-#[cfg(all(any(kvm, mshv3), test, not(feature = "nanvix-unstable")))] +#[cfg(all(any(kvm, mshv3), test, not(feature = "i686-guest")))] pub(crate) const XSAVE_BUFFER_SIZE: usize = 4096; // Compiler error if no hypervisor type is available (not applicable on aarch64 yet) @@ -350,7 +350,7 @@ pub(crate) trait VirtualMachine: Debug + Send { fn reset_xsave(&self) -> std::result::Result<(), RegisterError>; /// Set xsave - only used for tests #[cfg(test)] - #[cfg(not(feature = "nanvix-unstable"))] + #[cfg(not(feature = "i686-guest"))] fn set_xsave(&self, xsave: &[u32]) -> std::result::Result<(), RegisterError>; /// Get partition handle diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs index 0a768bd7a..27f024ca6 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mshv/x86_64.rs @@ -51,7 +51,7 @@ use crate::hypervisor::regs::{ CommonDebugRegs, CommonFpu, CommonRegisters, CommonSpecialRegisters, FP_CONTROL_WORD_DEFAULT, MXCSR_DEFAULT, }; -#[cfg(all(test, not(feature = "nanvix-unstable")))] +#[cfg(all(test, not(feature = "i686-guest")))] use crate::hypervisor::virtual_machine::XSAVE_BUFFER_SIZE; #[cfg(feature = "hw-interrupts")] use crate::hypervisor::virtual_machine::x86_64::hw_interrupts::TimerThread; @@ -445,7 +445,7 @@ impl VirtualMachine for MshvVm { } #[cfg(test)] - #[cfg(not(feature = "nanvix-unstable"))] + #[cfg(not(feature = "i686-guest"))] fn set_xsave(&self, xsave: &[u32]) -> std::result::Result<(), RegisterError> { if std::mem::size_of_val(xsave) != XSAVE_BUFFER_SIZE { return Err(RegisterError::XsaveSizeMismatch { diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs index 3c6ae5a9d..3cc5cc4f2 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs +++ 
b/src/hyperlight_host/src/hypervisor/virtual_machine/whp.rs @@ -746,7 +746,7 @@ impl VirtualMachine for WhpVm { } #[cfg(test)] - #[cfg(not(feature = "nanvix-unstable"))] + #[cfg(not(feature = "i686-guest"))] fn set_xsave(&self, xsave: &[u32]) -> std::result::Result<(), RegisterError> { // Get the required buffer size by calling with NULL buffer. // If the buffer is not large enough (0 won't be), WHvGetVirtualProcessorXsaveState returns diff --git a/src/hyperlight_host/src/lib.rs b/src/hyperlight_host/src/lib.rs index 928f82cd2..311c0e989 100644 --- a/src/hyperlight_host/src/lib.rs +++ b/src/hyperlight_host/src/lib.rs @@ -95,7 +95,7 @@ pub use sandbox::UninitializedSandbox; /// The re-export for the `GuestBinary` type pub use sandbox::uninitialized::GuestBinary; /// The re-export for the `GuestCounter` type -#[cfg(feature = "nanvix-unstable")] +#[cfg(feature = "guest-counter")] pub use sandbox::uninitialized::GuestCounter; /// The universal `Result` type used throughout the Hyperlight codebase. diff --git a/src/hyperlight_host/src/mem/exe.rs b/src/hyperlight_host/src/mem/exe.rs index c201592f2..97874ae6e 100644 --- a/src/hyperlight_host/src/mem/exe.rs +++ b/src/hyperlight_host/src/mem/exe.rs @@ -88,6 +88,12 @@ impl ExeInfo { ExeInfo::Elf(elf) => Offset::from(elf.entrypoint_va()), } } + /// Returns the base virtual address of the loaded binary (lowest PT_LOAD p_vaddr). 
+ pub fn base_va(&self) -> u64 { + match self { + ExeInfo::Elf(elf) => elf.get_base_va(), + } + } pub fn loaded_size(&self) -> usize { match self { ExeInfo::Elf(elf) => elf.get_va_size(), diff --git a/src/hyperlight_host/src/mem/layout.rs b/src/hyperlight_host/src/mem/layout.rs index b55189969..aa9c913dd 100644 --- a/src/hyperlight_host/src/mem/layout.rs +++ b/src/hyperlight_host/src/mem/layout.rs @@ -140,7 +140,7 @@ impl<'a> ResolvedGpa<&'a [u8], &'a [u8]> { } } #[cfg(any(gdb, feature = "mem_profile"))] -#[allow(unused)] // may be unused when nanvix-unstable is also enabled +#[allow(unused)] // may be unused when i686-guest is also enabled pub(crate) trait ReadableSharedMemory { fn copy_to_slice(&self, slice: &mut [u8], offset: usize) -> Result<()>; } @@ -178,7 +178,7 @@ impl ReadableSharedMemory for T { } #[cfg(any(gdb, feature = "mem_profile"))] impl ResolvedGpa { - #[allow(unused)] // may be unused when nanvix-unstable is also enabled + #[allow(unused)] // may be unused when i686-guest is also enabled pub(crate) fn copy_to_slice(&self, slice: &mut [u8]) -> Result<()> { match &self.base { BaseGpaRegion::Snapshot(sn) => sn.copy_to_slice(slice, self.offset), @@ -239,7 +239,7 @@ pub(crate) struct SandboxMemoryLayout { code_size: usize, // The offset in the sandbox memory where the code starts guest_code_offset: usize, - #[cfg_attr(feature = "nanvix-unstable", allow(unused))] + #[cfg_attr(feature = "i686-guest", allow(unused))] pub(crate) init_data_permissions: Option, // The size of the scratch region in physical memory; note that @@ -316,10 +316,7 @@ impl SandboxMemoryLayout { const MAX_MEMORY_SIZE: usize = (16 * 1024 * 1024 * 1024) - Self::BASE_ADDRESS; // 16 GiB - BASE_ADDRESS /// The base address of the sandbox's memory. 
- #[cfg(not(feature = "nanvix-unstable"))] pub(crate) const BASE_ADDRESS: usize = 0x1000; - #[cfg(feature = "nanvix-unstable")] - pub(crate) const BASE_ADDRESS: usize = 0x0; // the offset into a sandbox's input/output buffer where the stack starts pub(crate) const STACK_POINTER_SIZE_BYTES: u64 = 8; @@ -619,7 +616,7 @@ impl SandboxMemoryLayout { /// Returns the memory regions associated with this memory layout, /// suitable for passing to a hypervisor for mapping into memory - #[cfg_attr(feature = "nanvix-unstable", allow(unused))] + #[cfg_attr(feature = "i686-guest", allow(unused))] pub(crate) fn get_memory_regions_( &self, host_base: K::HostBaseType, diff --git a/src/hyperlight_host/src/mem/memory_region.rs b/src/hyperlight_host/src/mem/memory_region.rs index 979b260dd..615fe9cac 100644 --- a/src/hyperlight_host/src/mem/memory_region.rs +++ b/src/hyperlight_host/src/mem/memory_region.rs @@ -276,7 +276,7 @@ impl MemoryRegionKind for HostGuestMemoryRegion { /// Type for memory regions that only track guest addresses. /// -#[cfg_attr(feature = "nanvix-unstable", allow(dead_code))] +#[cfg_attr(feature = "i686-guest", allow(dead_code))] #[derive(Debug, PartialEq, Eq, Copy, Clone, Hash)] pub(crate) struct GuestMemoryRegion {} @@ -329,7 +329,7 @@ impl MemoryRegionKind for CrashDumpMemoryRegion { #[cfg(crashdump)] pub(crate) type CrashDumpRegion = MemoryRegion_; -#[cfg(all(crashdump, feature = "nanvix-unstable"))] +#[cfg(all(crashdump, feature = "i686-guest"))] impl HostGuestMemoryRegion { /// Extract the raw `usize` host address from the platform-specific /// host base type. 
@@ -349,7 +349,7 @@ impl HostGuestMemoryRegion { } } -#[cfg_attr(feature = "nanvix-unstable", allow(unused))] +#[cfg_attr(feature = "i686-guest", allow(unused))] pub(crate) struct MemoryRegionVecBuilder { guest_base_phys_addr: usize, host_base_virt_addr: K::HostBaseType, diff --git a/src/hyperlight_host/src/mem/mgr.rs b/src/hyperlight_host/src/mem/mgr.rs index 98c70734b..bb1812d85 100644 --- a/src/hyperlight_host/src/mem/mgr.rs +++ b/src/hyperlight_host/src/mem/mgr.rs @@ -22,8 +22,9 @@ use hyperlight_common::flatbuffer_wrappers::function_call::{ }; use hyperlight_common::flatbuffer_wrappers::function_types::FunctionCallResult; use hyperlight_common::flatbuffer_wrappers::guest_log_data::GuestLogData; +#[cfg(not(feature = "i686-guest"))] use hyperlight_common::vmem::{self, PAGE_TABLE_SIZE, PageTableEntry, PhysAddr}; -#[cfg(all(feature = "crashdump", not(feature = "nanvix-unstable")))] +#[cfg(all(feature = "crashdump", not(feature = "i686-guest")))] use hyperlight_common::vmem::{BasicMapping, MappingKind}; use tracing::{Span, instrument}; @@ -38,7 +39,7 @@ use crate::mem::memory_region::{CrashDumpRegion, MemoryRegionFlags, MemoryRegion use crate::sandbox::snapshot::{NextAction, Snapshot}; use crate::{Result, new_error}; -#[cfg(all(feature = "crashdump", not(feature = "nanvix-unstable")))] +#[cfg(all(feature = "crashdump", not(feature = "i686-guest")))] fn mapping_kind_to_flags(kind: &MappingKind) -> (MemoryRegionFlags, MemoryRegionType) { match kind { MappingKind::Basic(BasicMapping { @@ -76,7 +77,7 @@ fn mapping_kind_to_flags(kind: &MappingKind) -> (MemoryRegionFlags, MemoryRegion /// in both guest and host address space and has the same flags. /// /// Returns `true` if the region was coalesced, `false` if a new region is needed. 
-#[cfg(all(feature = "crashdump", not(feature = "nanvix-unstable")))] +#[cfg(all(feature = "crashdump", not(feature = "i686-guest")))] fn try_coalesce_region( regions: &mut [CrashDumpRegion], virt_base: usize, @@ -101,7 +102,7 @@ fn try_coalesce_region( // fact that the snapshot shared memory is `ReadonlySharedMemory` // normally, but there is (temporary) support for writable // `GuestSharedMemory` with `#[cfg(feature = -// "nanvix-unstable")]`. Unfortunately, rustc gets annoyed about an +// "i686-guest")]`. Unfortunately, rustc gets annoyed about an // unused type parameter, unless one goes to a little bit of effort to // trick it... mod unused_hack { @@ -150,11 +151,13 @@ pub(crate) struct SandboxMemoryManager { pub(crate) abort_buffer: Vec, } +#[cfg(not(feature = "i686-guest"))] pub(crate) struct GuestPageTableBuffer { buffer: std::cell::RefCell>, phys_base: usize, } +#[cfg(not(feature = "i686-guest"))] impl vmem::TableReadOps for GuestPageTableBuffer { type TableAddr = (usize, usize); // (table_index, entry_index) @@ -189,6 +192,7 @@ impl vmem::TableReadOps for GuestPageTableBuffer { (self.phys_base / PAGE_TABLE_SIZE, 0) } } +#[cfg(not(feature = "i686-guest"))] impl vmem::TableOps for GuestPageTableBuffer { type TableMovability = vmem::MayNotMoveTable; @@ -219,6 +223,7 @@ impl vmem::TableOps for GuestPageTableBuffer { } } +#[cfg(not(feature = "i686-guest"))] impl GuestPageTableBuffer { pub(crate) fn new(phys_base: usize) -> Self { GuestPageTableBuffer { @@ -270,7 +275,7 @@ where &mut self, sandbox_id: u64, mapped_regions: Vec, - root_pt_gpa: u64, + root_pt_gpas: &[u64], rsp_gva: u64, sregs: CommonSpecialRegisters, entrypoint: NextAction, @@ -282,7 +287,7 @@ where self.layout, crate::mem::exe::LoadInfo::dummy(), mapped_regions, - root_pt_gpa, + root_pt_gpas, rsp_gva, sregs, entrypoint, @@ -334,6 +339,7 @@ impl SandboxMemoryManager { abort_buffer: Vec::new(), // Guest doesn't need abort buffer }; host_mgr.update_scratch_bookkeeping()?; + 
host_mgr.copy_pt_to_scratch()?; Ok((host_mgr, guest_mgr)) } } @@ -524,9 +530,57 @@ impl SandboxMemoryManager { }; self.layout = *snapshot.layout(); self.update_scratch_bookkeeping()?; + // i686 snapshots store PT bytes separately (not appended to shared_mem) + // to avoid overlapping with map_file_cow regions. + // x86_64 snapshots have PTs appended to shared_mem. + #[cfg(feature = "i686-guest")] + { + let sep_pt = snapshot.separate_pt_bytes(); + self.scratch_mem.with_exclusivity(|scratch| { + scratch.copy_from_slice(sep_pt, self.layout.get_pt_base_scratch_offset()) + })??; + // Rewrite the PD-roots bookkeeping. `restore_snapshot` + // clears scratch above, so without this step a later + // `snapshot()` would read count=0 and fail. Root `i` + // lands at `pt_base_gpa + i * PAGE_SIZE` — the same + // layout `compact_i686_snapshot` used when building the + // rebuilt PDs. + self.update_pd_roots_bookkeeping(snapshot.n_pd_roots())?; + } + #[cfg(not(feature = "i686-guest"))] + self.copy_pt_to_scratch()?; Ok((gsnapshot, gscratch)) } + /// Write the PD-roots count and compacted root GPAs into the + /// scratch bookkeeping area. Called from `restore_snapshot` on + /// the i686-guest path so the scratch state mirrors what it + /// looked like right after the snapshot was taken. 
+ #[cfg(feature = "i686-guest")] + fn update_pd_roots_bookkeeping(&mut self, n_roots: usize) -> Result<()> { + use hyperlight_common::layout::{ + MAX_PD_ROOTS, SCRATCH_TOP_PD_ROOTS_ARRAY_OFFSET, SCRATCH_TOP_PD_ROOTS_COUNT_OFFSET, + }; + if n_roots > MAX_PD_ROOTS { + return Err(crate::new_error!( + "snapshot has {} PD roots, more than MAX_PD_ROOTS={}", + n_roots, + MAX_PD_ROOTS + )); + } + let scratch_size = self.scratch_mem.mem_size(); + let count_off = scratch_size - SCRATCH_TOP_PD_ROOTS_COUNT_OFFSET as usize; + let array_off = scratch_size - SCRATCH_TOP_PD_ROOTS_ARRAY_OFFSET as usize; + self.scratch_mem.write::<u32>(count_off, n_roots as u32)?; + let pt_base = self.layout.get_pt_base_gpa(); + for i in 0..n_roots { + let gpa = pt_base + (i as u64) * 4096; + self.scratch_mem + .write::<u32>(array_off + i * 4, gpa as u32)?; + } + Ok(()) + } + #[inline] fn update_scratch_bookkeeping_item(&mut self, offset: u64, value: u64) -> Result<()> { let scratch_size = self.scratch_mem.mem_size(); @@ -542,6 +596,10 @@ impl SandboxMemoryManager { SCRATCH_TOP_ALLOCATOR_OFFSET, self.layout.get_first_free_scratch_gpa(), )?; + self.update_scratch_bookkeeping_item( + SCRATCH_TOP_SNAPSHOT_PT_GPA_BASE_OFFSET, + self.layout.get_pt_base_gpa(), + )?; // Initialise the guest input and output data buffers in // scratch memory. TODO: remove the need for this. @@ -554,7 +612,11 @@ impl SandboxMemoryManager { SandboxMemoryLayout::STACK_POINTER_SIZE_BYTES, )?; - // Copy the page tables into the scratch region + Ok(()) + } + + /// Copy page tables from shared_mem into the scratch region.
+ fn copy_pt_to_scratch(&mut self) -> Result<()> { let snapshot_pt_end = self.shared_mem.mem_size(); let snapshot_pt_size = self.layout.get_pt_size(); let snapshot_pt_start = snapshot_pt_end - snapshot_pt_size; @@ -571,7 +633,6 @@ impl SandboxMemoryManager { #[allow(clippy::needless_borrow)] scratch.copy_from_slice(&bytes, self.layout.get_pt_base_scratch_offset()) })??; - Ok(()) } @@ -579,7 +640,7 @@ impl SandboxMemoryManager { /// /// By default, walks the guest page tables to discover /// GVA→GPA mappings and translates them to host-backed regions. - #[cfg(all(feature = "crashdump", not(feature = "nanvix-unstable")))] + #[cfg(all(feature = "crashdump", not(feature = "i686-guest")))] pub(crate) fn get_guest_memory_regions( &mut self, root_pt: u64, @@ -641,7 +702,7 @@ impl SandboxMemoryManager { /// Without paging, GVA == GPA (identity mapped), so we return the /// snapshot and scratch regions directly at their known addresses /// alongside any dynamic mmap regions. - #[cfg(all(feature = "crashdump", feature = "nanvix-unstable"))] + #[cfg(all(feature = "crashdump", feature = "i686-guest"))] pub(crate) fn get_guest_memory_regions( &mut self, _root_pt: u64, @@ -796,7 +857,7 @@ impl SandboxMemoryManager { } #[cfg(test)] -#[cfg(all(not(feature = "nanvix-unstable"), target_arch = "x86_64"))] +#[cfg(all(not(feature = "i686-guest"), target_arch = "x86_64"))] mod tests { use hyperlight_common::vmem::{MappingKind, PAGE_TABLE_SIZE}; use hyperlight_testing::sandbox_sizes::{LARGE_HEAP_SIZE, MEDIUM_HEAP_SIZE, SMALL_HEAP_SIZE}; diff --git a/src/hyperlight_host/src/mem/shared_mem.rs b/src/hyperlight_host/src/mem/shared_mem.rs index b978b3475..d22dd6702 100644 --- a/src/hyperlight_host/src/mem/shared_mem.rs +++ b/src/hyperlight_host/src/mem/shared_mem.rs @@ -668,7 +668,7 @@ impl ExclusiveSharedMemory { /// Create a [`HostSharedMemory`] view of this region without /// consuming `self`. Used in tests where the full `build()` / /// `evolve()` pipeline is not available. 
- #[cfg(all(test, feature = "nanvix-unstable"))] + #[cfg(all(test, feature = "guest-counter"))] pub(crate) fn as_host_shared_memory(&self) -> HostSharedMemory { let lock = Arc::new(RwLock::new(())); HostSharedMemory { diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 72de96035..d182bde3e 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs @@ -160,10 +160,16 @@ impl MultiUseSandbox { } let mapped_regions_iter = self.vm.get_mapped_regions(); let mapped_regions_vec: Vec = mapped_regions_iter.cloned().collect(); - let root_pt_gpa = self + // Discover page table roots. For i686 guests, read the PD roots + // table from scratch bookkeeping. For x86_64, just use CR3. + #[cfg(feature = "i686-guest")] + let root_pt_gpas = self.read_pd_roots_from_scratch()?; + #[cfg(not(feature = "i686-guest"))] + let root_pt_gpas = [self .vm .get_root_pt() - .map_err(|e| HyperlightError::HyperlightVmError(e.into()))?; + .map_err(|e| HyperlightError::HyperlightVmError(e.into()))?]; + let stack_top_gpa = self.vm.get_stack_top(); let sregs = self .vm @@ -173,7 +179,7 @@ impl MultiUseSandbox { let memory_snapshot = self.mem_mgr.snapshot( self.id, mapped_regions_vec, - root_pt_gpa, + &root_pt_gpas, stack_top_gpa, sregs, entrypoint, @@ -183,6 +189,54 @@ impl MultiUseSandbox { Ok(snapshot) } + /// Reads the PD roots table from the scratch bookkeeping area. + /// Returns an error if the guest did not write valid PD roots + /// before signaling boot-complete. 
+ #[cfg(feature = "i686-guest")] + fn read_pd_roots_from_scratch(&mut self) -> Result> { + use hyperlight_common::layout::{ + MAX_PD_ROOTS, SCRATCH_TOP_PD_ROOTS_ARRAY_OFFSET, SCRATCH_TOP_PD_ROOTS_COUNT_OFFSET, + }; + + let scratch_size = self.mem_mgr.layout.get_scratch_size(); + let count_off = scratch_size - SCRATCH_TOP_PD_ROOTS_COUNT_OFFSET as usize; + let array_off = scratch_size - SCRATCH_TOP_PD_ROOTS_ARRAY_OFFSET as usize; + + self.mem_mgr.scratch_mem.with_contents(|scratch| { + let count = scratch + .get(count_off..count_off + 4) + .map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]])) + .unwrap_or(0) as usize; + + if count == 0 { + return Err(crate::new_error!( + "i686 guest did not write PD roots to scratch bookkeeping (count=0)" + )); + } + if count > MAX_PD_ROOTS { + return Err(crate::new_error!( + "i686 guest wrote invalid PD roots count: {} (max {})", + count, + MAX_PD_ROOTS + )); + } + + let mut roots = Vec::with_capacity(count); + for i in 0..count { + let off = array_off + i * 4; + let b = scratch.get(off..off + 4).ok_or_else(|| { + crate::new_error!("PD root {} at offset {} is out of scratch bounds", i, off) + })?; + let gpa = u32::from_le_bytes([b[0], b[1], b[2], b[3]]); + if gpa == 0 { + return Err(crate::new_error!("PD root {} has GPA 0", i)); + } + roots.push(gpa as u64); + } + Ok(roots) + })? + } + /// Restores the sandbox's memory to a previously captured snapshot state. /// /// The snapshot must have been created from this same sandbox instance. diff --git a/src/hyperlight_host/src/sandbox/snapshot.rs b/src/hyperlight_host/src/sandbox/snapshot.rs index c5f0520a6..fd4f94a54 100644 --- a/src/hyperlight_host/src/sandbox/snapshot.rs +++ b/src/hyperlight_host/src/sandbox/snapshot.rs @@ -16,8 +16,12 @@ limitations under the License. 
use std::sync::atomic::{AtomicU64, Ordering}; -use hyperlight_common::layout::{scratch_base_gpa, scratch_base_gva}; -use hyperlight_common::vmem::{self, BasicMapping, CowMapping, Mapping, MappingKind, PAGE_SIZE}; +#[cfg(not(feature = "i686-guest"))] +use hyperlight_common::layout::scratch_base_gpa; +use hyperlight_common::layout::scratch_base_gva; +#[cfg(not(feature = "i686-guest"))] +use hyperlight_common::vmem::{self, BasicMapping, CowMapping}; +use hyperlight_common::vmem::{Mapping, MappingKind, PAGE_SIZE}; use tracing::{Span, instrument}; use crate::HyperlightError::MemoryRegionSizeMismatch; @@ -26,7 +30,9 @@ use crate::hypervisor::regs::CommonSpecialRegisters; use crate::mem::exe::LoadInfo; use crate::mem::layout::SandboxMemoryLayout; use crate::mem::memory_region::MemoryRegion; -use crate::mem::mgr::{GuestPageTableBuffer, SnapshotSharedMemory}; +#[cfg(not(feature = "i686-guest"))] +use crate::mem::mgr::GuestPageTableBuffer; +use crate::mem::mgr::SnapshotSharedMemory; use crate::mem::shared_mem::{ReadonlySharedMemory, SharedMemory}; use crate::sandbox::SandboxConfiguration; use crate::sandbox::uninitialized::{GuestBinary, GuestEnvironment}; @@ -74,6 +80,20 @@ pub struct Snapshot { /// The memory regions that were mapped when this snapshot was /// taken (excluding initial sandbox regions) regions: Vec, + /// Separate PT storage for i686 snapshots where PTs are stored + /// outside the main snapshot memory to avoid overlap with map_file_cow. + #[cfg(feature = "i686-guest")] + separate_pt_bytes: Vec, + /// Number of per-process page-directory roots captured in this + /// snapshot. After restore the PD-root count and the compacted + /// root GPAs are re-populated in the scratch bookkeeping area + /// (`SCRATCH_TOP_PD_ROOTS_{COUNT,ARRAY}_OFFSET`) so a subsequent + /// `snapshot()` call observes a non-zero count. Root GPAs are + /// deterministic: root `i` lands at + /// `layout.get_pt_base_gpa() + i * PAGE_SIZE` (see + /// `compact_i686_snapshot`). 
+ #[cfg(feature = "i686-guest")] + n_pd_roots: usize, /// Extra debug information about the binary in this snapshot, /// from when the binary was first loaded into the snapshot. /// @@ -189,7 +209,12 @@ pub(crate) struct SharedMemoryPageTableBuffer<'a> { scratch: &'a [u8], layout: SandboxMemoryLayout, root: u64, + /// CoW resolution map: maps snapshot GPAs to their CoW'd scratch GPAs. + /// Built by walking the kernel PD to find pages that were CoW'd during boot. + #[cfg(feature = "i686-guest")] + cow_map: Option<&'a std::collections::HashMap>, } + impl<'a> SharedMemoryPageTableBuffer<'a> { pub(crate) fn new( snap: &'a [u8], @@ -202,8 +227,16 @@ impl<'a> SharedMemoryPageTableBuffer<'a> { scratch, layout, root, + #[cfg(feature = "i686-guest")] + cow_map: None, } } + + #[cfg(feature = "i686-guest")] + fn with_cow_map(mut self, cow_map: &'a std::collections::HashMap) -> Self { + self.cow_map = Some(cow_map); + self + } } impl<'a> hyperlight_common::vmem::TableReadOps for SharedMemoryPageTableBuffer<'a> { type TableAddr = u64; @@ -211,19 +244,50 @@ impl<'a> hyperlight_common::vmem::TableReadOps for SharedMemoryPageTableBuffer<' addr + offset } unsafe fn read_entry(&self, addr: u64) -> u64 { - let memoff = access_gpa(self.snap, self.scratch, self.layout, addr); - let Some(pte_bytes) = memoff.and_then(|(mem, off)| mem.get(off..off + 8)) else { - // Attacker-controlled data pointed out-of-bounds. We'll - // default to returning 0 in this case, which, for most - // architectures (including x86-64 and arm64, the ones we - // care about presently) will be a not-present entry. - return 0; + // For i686: if the GPA was CoW'd, read from the scratch copy instead. 
+ #[cfg(feature = "i686-guest")] + let addr = { + let page_gpa = addr & 0xFFFFF000; + if let Some(map) = self.cow_map { + if let Some(&scratch_gpa) = map.get(&page_gpa) { + scratch_gpa + (addr & 0xFFF) + } else { + addr + } + } else { + addr + } }; - // this is statically the correct size, so using unwrap() here - // doesn't make this any more panic-y. - #[allow(clippy::unwrap_used)] - let n: [u8; 8] = pte_bytes.try_into().unwrap(); - u64::from_ne_bytes(n) + let memoff = access_gpa(self.snap, self.scratch, self.layout, addr); + // For i686 guests, page table entries are 4 bytes; for x86_64 they + // are 8 bytes. Read the correct size based on the feature flag. + #[cfg(feature = "i686-guest")] + { + let Some(pte_bytes) = memoff.and_then(|(mem, off)| mem.get(off..off + 4)) else { + // Out-of-bounds: return 0, which is a not-present entry. + return 0; + }; + #[allow(clippy::unwrap_used)] + let n: [u8; 4] = pte_bytes.try_into().unwrap(); + // Page-table entries are little-endian by arch spec; + // use `from_le_bytes` so host endianness doesn't leak in. + u32::from_le_bytes(n) as u64 + } + #[cfg(not(feature = "i686-guest"))] + { + let Some(pte_bytes) = memoff.and_then(|(mem, off)| mem.get(off..off + 8)) else { + // Attacker-controlled data pointed out-of-bounds. We'll + // default to returning 0 in this case, which, for most + // architectures (including x86-64 and arm64, the ones we + // care about presently) will be a not-present entry. + return 0; + }; + // this is statically the correct size, so using unwrap() here + // doesn't make this any more panic-y. + #[allow(clippy::unwrap_used)] + let n: [u8; 8] = pte_bytes.try_into().unwrap(); + u64::from_ne_bytes(n) + } } fn to_phys(addr: u64) -> u64 { addr @@ -240,34 +304,424 @@ impl<'a> core::convert::AsRef> for SharedMemoryP self } } + +/// Build a CoW resolution map by walking a kernel PD. +/// For each PTE that maps a VA in [0, MEMORY_SIZE) to a PA in scratch, +/// record: original_gpa -> scratch_gpa. 
+#[cfg(feature = "i686-guest")] +fn build_cow_map( + snap: &[u8], + scratch: &[u8], + layout: SandboxMemoryLayout, + kernel_root: u64, +) -> crate::Result> { + use hyperlight_common::layout::scratch_base_gpa; + let mut cow_map = std::collections::HashMap::new(); + let scratch_base = scratch_base_gpa(layout.get_scratch_size()); + let scratch_end = scratch_base + layout.get_scratch_size() as u64; + let mem_size = layout.get_memory_size()? as u64; + + for pdi in 0..1024u64 { + let pde_addr = kernel_root + pdi * 4; + let pde = access_gpa(snap, scratch, layout, pde_addr) + .and_then(|(mem, off)| mem.get(off..off + 4)) + .map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]])) + .unwrap_or(0); + if (pde & 1) == 0 { + continue; + } + let pt_gpa = (pde & 0xFFFFF000) as u64; + for pti in 0..1024u64 { + let pte_addr = pt_gpa + pti * 4; + let pte = access_gpa(snap, scratch, layout, pte_addr) + .and_then(|(mem, off)| mem.get(off..off + 4)) + .map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]])) + .unwrap_or(0); + if (pte & 1) == 0 { + continue; + } + let frame_gpa = (pte & 0xFFFFF000) as u64; + let va = (pdi << 22) | (pti << 12); + if va < mem_size && frame_gpa >= scratch_base && frame_gpa < scratch_end { + cow_map.insert(va, frame_gpa); + } + } + } + Ok(cow_map) +} + +/// Helper for building i686 2-level page tables as a flat byte buffer. +/// +/// The buffer stores one or more page directories (PDs) at the front, +/// followed by page tables (PTs) that are allocated on demand. All +/// entries use 4-byte i686 PTEs. 
+#[cfg(feature = "i686-guest")] +mod i686_pt { + use hyperlight_common::vmem::i686_guest::{PAGE_ACCESSED, PAGE_AVL_COW, PAGE_PRESENT, PAGE_RW}; + + const PTE_PRESENT: u32 = PAGE_PRESENT as u32; + const PTE_RW: u32 = PAGE_RW as u32; + const PTE_ACCESSED: u32 = PAGE_ACCESSED as u32; + pub(super) const PTE_COW: u32 = PAGE_AVL_COW as u32; + pub(super) const ADDR_MASK: u32 = 0xFFFFF000; + pub(super) const RW_FLAGS: u32 = PTE_PRESENT | PTE_RW | PTE_ACCESSED; + const PAGE_SIZE: usize = 4096; + + pub(super) struct Builder { + pub bytes: Vec, + pd_base_gpa: usize, + } + + impl Builder { + pub fn new(pd_base_gpa: usize) -> Self { + Self { + bytes: vec![0u8; PAGE_SIZE], + pd_base_gpa, + } + } + + pub fn with_pds(pd_base_gpa: usize, num_pds: usize) -> Self { + Self { + bytes: vec![0u8; num_pds * PAGE_SIZE], + pd_base_gpa, + } + } + + pub fn read_u32(&self, offset: usize) -> u32 { + let b = &self.bytes[offset..offset + 4]; + u32::from_le_bytes([b[0], b[1], b[2], b[3]]) + } + + fn write_u32(&mut self, offset: usize, val: u32) { + self.bytes[offset..offset + 4].copy_from_slice(&val.to_le_bytes()); + } + + /// Ensures a page table exists for PDE index `pdi` within the PD + /// at byte offset `pd_offset`. Allocates a new PT page at the end + /// of the buffer if absent. Returns the byte offset of the PT. + pub fn ensure_pt(&mut self, pd_offset: usize, pdi: usize, pde_flags: u32) -> usize { + let pde_off = pd_offset + pdi * 4; + let pde = self.read_u32(pde_off); + if (pde & PTE_PRESENT) != 0 { + (pde & ADDR_MASK) as usize - self.pd_base_gpa + } else { + let pt_offset = self.bytes.len(); + self.bytes.resize(pt_offset + PAGE_SIZE, 0); + let pt_gpa = (self.pd_base_gpa + pt_offset) as u32; + self.write_u32(pde_off, pt_gpa | pde_flags); + pt_offset + } + } + + /// Maps a single 4K page within the PD at `pd_offset`. 
+ pub fn map_page(&mut self, pd_offset: usize, va: u64, pa: u64, pte_flags: u32) { + let pdi = ((va as u32 >> 22) & 0x3FF) as usize; + let pti = ((va as u32 >> 12) & 0x3FF) as usize; + let pt_offset = self.ensure_pt(pd_offset, pdi, RW_FLAGS); + let pte_off = pt_offset + pti * 4; + self.write_u32(pte_off, (pa as u32) | pte_flags); + } + + /// Maps a contiguous range of pages with uniform flags. + pub fn map_range( + &mut self, + pd_offset: usize, + va_start: u64, + pa_start: u64, + len: u64, + pte_flags: u32, + ) { + let mut va = va_start; + let mut pa = pa_start; + let end = va_start + len; + while va < end { + self.map_page(pd_offset, va, pa, pte_flags); + va += PAGE_SIZE as u64; + pa += PAGE_SIZE as u64; + } + } + + pub fn into_bytes(self) -> Vec { + self.bytes + } + } +} + +/// Build initial i686 page tables for a freshly loaded guest binary. +/// Maps snapshot regions (with CoW flags for writable pages) and the scratch region. +#[cfg(feature = "i686-guest")] +fn build_initial_i686_page_tables( + layout: &crate::mem::layout::SandboxMemoryLayout, +) -> crate::Result> { + use i686_pt::{PTE_COW, RW_FLAGS}; + + use crate::mem::memory_region::{GuestMemoryRegion, MemoryRegionFlags}; + + let pd_base_gpa = layout.get_pt_base_gpa() as usize; + let mut pt = i686_pt::Builder::new(pd_base_gpa); + + let ro_flags = hyperlight_common::vmem::i686_guest::PAGE_PRESENT as u32 + | hyperlight_common::vmem::i686_guest::PAGE_ACCESSED as u32; + + // 1. Map snapshot memory regions + for rgn in layout.get_memory_regions_::(())?.iter() { + let flags = if rgn.flags.contains(MemoryRegionFlags::WRITE) { + ro_flags | PTE_COW + } else { + ro_flags + }; + pt.map_range( + 0, + rgn.guest_region.start as u64, + rgn.guest_region.start as u64, + rgn.guest_region.len() as u64, + flags, + ); + } + + // 2. 
Map scratch region (writable, not CoW) + let scratch_size = layout.get_scratch_size(); + let scratch_gpa = hyperlight_common::layout::scratch_base_gpa(scratch_size); + let scratch_gva = hyperlight_common::layout::scratch_base_gva(scratch_size); + pt.map_range(0, scratch_gva, scratch_gpa, scratch_size as u64, RW_FLAGS); + + Ok(pt.into_bytes()) +} + +/// Compact an i686 snapshot: densely pack live pages and rebuild +/// per-process page tables with updated GPAs. +/// +/// Returns `(snapshot_memory, pt_bytes)`. +#[cfg(feature = "i686-guest")] +fn compact_i686_snapshot( + snap: &[u8], + scratch: &[u8], + layout: SandboxMemoryLayout, + live_pages: Vec<(Mapping, &[u8])>, + root_pt_gpas: &[u64], + cow_map: &std::collections::HashMap, + phys_seen: &mut std::collections::HashMap, +) -> crate::Result<(Vec, Vec)> { + use hyperlight_common::vmem::i686_guest::{PAGE_PRESENT, PAGE_USER}; + use i686_pt::{ADDR_MASK, PTE_COW, RW_FLAGS}; + + let page_size: usize = 4096; + + // Phase 1: pack live pages densely into a new snapshot buffer. + let mut snapshot_memory: Vec = Vec::new(); + for (mapping, contents) in live_pages { + if matches!(mapping.kind, MappingKind::Unmapped) { + continue; + } + phys_seen.entry(mapping.phys_base).or_insert_with(|| { + let new_offset = snapshot_memory.len(); + snapshot_memory.extend(contents); + new_offset + SandboxMemoryLayout::BASE_ADDRESS + }); + } + + // Phase 2: build per-process page tables with compacted GPAs. + let pd_base_gpa = layout.get_pt_base_gpa() as usize; + let n_roots = root_pt_gpas.len().max(1); + let mut pt = i686_pt::Builder::with_pds(pd_base_gpa, n_roots); + + let scratch_size = layout.get_scratch_size(); + let scratch_gpa = hyperlight_common::layout::scratch_base_gpa(scratch_size); + + // Helper: read a u32 from guest memory, resolving CoW redirections. 
+ let read_u32 = |gpa: u64| -> u32 { + let resolved = { + let page = gpa & 0xFFFFF000; + cow_map + .get(&page) + .map_or(gpa, |&scratch| scratch + (gpa & 0xFFF)) + }; + access_gpa(snap, scratch, layout, resolved) + .and_then(|(mem, off)| mem.get(off..off + 4)) + .map(|b| u32::from_le_bytes([b[0], b[1], b[2], b[3]])) + .unwrap_or(0) + }; + + // Rebuild a single page table with remapped frame GPAs. + let rebuild_pt = |pt: &mut i686_pt::Builder, + old_pt_gpa: u64, + extra_flags: u32, + phys_map: &std::collections::HashMap| + -> u32 { + let new_pt_offset = pt.bytes.len(); + pt.bytes.resize(new_pt_offset + page_size, 0); + let new_pt_gpa = (pd_base_gpa + new_pt_offset) as u32; + for pti in 0..1024usize { + let pte = read_u32(old_pt_gpa + pti as u64 * 4); + if (pte & PAGE_PRESENT as u32) == 0 { + continue; + } + let old_frame = (pte & ADDR_MASK) as u64; + let Some(&new_gpa) = phys_map.get(&old_frame) else { + continue; + }; + let mut flags = (pte & 0xFFF) | extra_flags; + // Mark writable or already-CoW pages as CoW (read-only + AVL bit). + if (flags & RW_FLAGS & !PTE_COW) != 0 || (flags & PTE_COW) != 0 { + flags = (flags & !(hyperlight_common::vmem::i686_guest::PAGE_RW as u32)) | PTE_COW; + } + let off = new_pt_offset + pti * 4; + pt.bytes[off..off + 4].copy_from_slice(&((new_gpa as u32) | flags).to_le_bytes()); + } + new_pt_gpa + }; + + // Resolve a VA through a PD to its physical frame. + let resolve_through_pd = |pd_gpa: u64, va: u64| -> u64 { + let pdi = (va >> 22) & 0x3FF; + let pde = read_u32(pd_gpa + pdi * 4); + if (pde & PAGE_PRESENT as u32) == 0 { + return va; + } + let pti = (va >> 12) & 0x3FF; + let pte = read_u32((pde & ADDR_MASK) as u64 + pti * 4); + if (pte & PAGE_PRESENT as u32) == 0 { + return va; + } + (pte & ADDR_MASK) as u64 + }; + + // Build kernel page tables (lower 256 PD entries) from the first root. 
+ let first_root = root_pt_gpas.first().copied().ok_or_else(|| { + crate::new_error!("compact_i686_snapshot called with no page directory roots") + })?; + let mut kernel_pdes = [0u32; 256]; + for (pdi, kernel_pde) in kernel_pdes.iter_mut().enumerate() { + let pde = read_u32(first_root + pdi as u64 * 4); + if (pde & PAGE_PRESENT as u32) == 0 { + continue; + } + let new_pt_gpa = rebuild_pt(&mut pt, (pde & ADDR_MASK) as u64, 0, phys_seen); + *kernel_pde = (pde & 0xFFF) | new_pt_gpa; + } + + // Fill in per-process PDs: kernel half (shared) + user half (per-process). + for (root_idx, &root) in root_pt_gpas.iter().enumerate() { + let pd_offset = root_idx * page_size; + // Copy kernel PDEs (lower 256 entries) into this PD. + for (pdi, &kpde) in kernel_pdes.iter().enumerate() { + if kpde != 0 { + pt.bytes[pd_offset + pdi * 4..pd_offset + pdi * 4 + 4] + .copy_from_slice(&kpde.to_le_bytes()); + } + } + // Rebuild user PDEs (upper 256 entries). + for pdi in 256..1024usize { + let pde = read_u32(root + pdi as u64 * 4); + if (pde & PAGE_PRESENT as u32) == 0 { + continue; + } + let user = PAGE_USER as u32; + let pt_gpa_raw = (pde & ADDR_MASK) as u64; + let pt_gpa = resolve_through_pd(first_root, pt_gpa_raw); + let new_pt_gpa = rebuild_pt(&mut pt, pt_gpa, user, phys_seen); + let fixed_pde = (pde & 0xFFF) | new_pt_gpa | user; + pt.bytes[pd_offset + pdi * 4..pd_offset + pdi * 4 + 4] + .copy_from_slice(&fixed_pde.to_le_bytes()); + } + } + + // Map scratch and snapshot identity regions into every PD. 
+ for ri in 0..n_roots { + let pd_off = ri * page_size; + pt.map_range( + pd_off, + scratch_gpa, + scratch_gpa, + scratch_size as u64, + RW_FLAGS, + ); + + let snapshot_end = SandboxMemoryLayout::BASE_ADDRESS + snapshot_memory.len(); + let snapshot_pages = (snapshot_end - SandboxMemoryLayout::BASE_ADDRESS) / page_size; + for pi in 0..snapshot_pages { + let gpa = (SandboxMemoryLayout::BASE_ADDRESS + pi * page_size) as u64; + let pdi = ((gpa >> 22) & 0x3FF) as usize; + let pti = ((gpa >> 12) & 0x3FF) as usize; + let pt_off = pt.ensure_pt(pd_off, pdi, RW_FLAGS); + let pte_off = pt_off + pti * 4; + if pt.read_u32(pte_off) & PAGE_PRESENT as u32 == 0 { + pt.bytes[pte_off..pte_off + 4] + .copy_from_slice(&((gpa as u32) | RW_FLAGS).to_le_bytes()); + } + } + } + + Ok((snapshot_memory, pt.into_bytes())) +} + fn filtered_mappings<'a>( snap: &'a [u8], scratch: &'a [u8], regions: &[MemoryRegion], layout: SandboxMemoryLayout, - root_pt: u64, + root_pts: &[u64], + #[cfg(feature = "i686-guest")] cow_map: &std::collections::HashMap, ) -> Vec<(Mapping, &'a [u8])> { - let op = SharedMemoryPageTableBuffer::new(snap, scratch, layout, root_pt); - unsafe { - hyperlight_common::vmem::virt_to_phys(&op, 0, hyperlight_common::layout::MAX_GVA as u64) - } - .filter_map(move |mapping| { - // the scratch map doesn't count - if mapping.virt_base >= scratch_base_gva(layout.get_scratch_size()) { - return None; + #[cfg(not(feature = "i686-guest"))] + let mappings_iter: Vec = { + let Some(&root_pt) = root_pts.first() else { + return Vec::new(); + }; + let op = SharedMemoryPageTableBuffer::new(snap, scratch, layout, root_pt); + unsafe { + hyperlight_common::vmem::virt_to_phys(&op, 0, hyperlight_common::layout::MAX_GVA as u64) } - // neither does the mapping of the snapshot's own page tables - #[cfg(not(feature = "nanvix-unstable"))] - if mapping.virt_base >= hyperlight_common::layout::SNAPSHOT_PT_GVA_MIN as u64 - && mapping.virt_base <= hyperlight_common::layout::SNAPSHOT_PT_GVA_MAX as u64 - { - 
return None; + .collect() + }; + + #[cfg(feature = "i686-guest")] + let mappings_iter: Vec = { + use std::collections::HashSet; + let mut mappings = Vec::new(); + let mut seen_phys = HashSet::new(); + + let scratch_base_gva_val = + hyperlight_common::layout::scratch_base_gva(layout.get_scratch_size()); + for &root_pt in root_pts { + let op = SharedMemoryPageTableBuffer::new(snap, scratch, layout, root_pt) + .with_cow_map(cow_map); + let root_mappings = + unsafe { hyperlight_common::vmem::i686_guest::virt_to_phys_all(&op) }; + for m in root_mappings { + // Skip mappings whose VA is in the scratch region - these + // are identity-mapped helpers and would poison seen_phys for + // legitimate user mappings that share the same scratch PAs. + if m.virt_base >= scratch_base_gva_val { + continue; + } + if seen_phys.insert(m.phys_base) { + mappings.push(m); + } + } } - // todo: is it useful to warn if we can't resolve this? - let contents = unsafe { guest_page(snap, scratch, regions, layout, mapping.phys_base) }?; - Some((mapping, contents)) - }) - .collect() + mappings + }; + + mappings_iter + .into_iter() + .filter_map(move |mapping| { + // the scratch map doesn't count + if mapping.virt_base >= scratch_base_gva(layout.get_scratch_size()) { + return None; + } + // neither does the mapping of the snapshot's own page tables + #[cfg(not(feature = "i686-guest"))] + if mapping.virt_base >= hyperlight_common::layout::SNAPSHOT_PT_GVA_MIN as u64 + && mapping.virt_base <= hyperlight_common::layout::SNAPSHOT_PT_GVA_MAX as u64 + { + return None; + } + let contents = + unsafe { guest_page(snap, scratch, regions, layout, mapping.phys_base) }?; + Some((mapping, contents)) + }) + .collect() } /// Find the contents of the page which starts at gpa in guest physical @@ -293,6 +747,7 @@ unsafe fn guest_page<'a>( Some(&resolved.as_ref()[..PAGE_SIZE]) } +#[cfg(not(feature = "i686-guest"))] fn map_specials(pt_buf: &GuestPageTableBuffer, scratch_size: usize) { // Map the scratch region let 
mapping = Mapping { @@ -342,7 +797,7 @@ impl Snapshot { let guest_blob_size = blob.as_ref().map(|b| b.data.len()).unwrap_or(0); let guest_blob_mem_flags = blob.as_ref().map(|b| b.permissions); - #[cfg_attr(feature = "nanvix-unstable", allow(unused_mut))] + #[cfg_attr(feature = "i686-guest", allow(unused_mut))] let mut layout = crate::mem::layout::SandboxMemoryLayout::new( cfg, exe_info.loaded_size(), @@ -351,7 +806,8 @@ impl Snapshot { )?; let load_addr = layout.get_guest_code_address() as u64; - let entrypoint_offset: u64 = exe_info.entrypoint().into(); + let base_va = exe_info.base_va(); + let entrypoint_va: u64 = exe_info.entrypoint().into(); let mut memory = vec![0; layout.get_memory_size()?]; @@ -365,7 +821,7 @@ impl Snapshot { blob.map(|x| layout.write_init_data(&mut memory, x.data)) .transpose()?; - #[cfg(not(feature = "nanvix-unstable"))] + #[cfg(not(feature = "i686-guest"))] { // Set up page table entries for the snapshot let pt_buf = GuestPageTableBuffer::new(layout.get_pt_base_gpa() as usize); @@ -405,6 +861,12 @@ impl Snapshot { layout.set_pt_size(pt_bytes.len())?; memory.extend(&pt_bytes); }; + #[cfg(feature = "i686-guest")] + { + let pt_bytes = build_initial_i686_page_tables(&layout)?; + layout.set_pt_size(pt_bytes.len())?; + memory.extend(&pt_bytes); + }; let exn_stack_top_gva = hyperlight_common::layout::MAX_GVA as u64 - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET @@ -422,7 +884,11 @@ impl Snapshot { hash, stack_top_gva: exn_stack_top_gva, sregs: None, - entrypoint: NextAction::Initialise(load_addr + entrypoint_offset), + #[cfg(feature = "i686-guest")] + separate_pt_bytes: Vec::new(), + #[cfg(feature = "i686-guest")] + n_pd_roots: 0, + entrypoint: NextAction::Initialise(load_addr + entrypoint_va - base_va), }) } @@ -442,7 +908,7 @@ impl Snapshot { mut layout: SandboxMemoryLayout, load_info: LoadInfo, regions: Vec, - root_pt_gpa: u64, + root_pt_gpas: &[u64], stack_top_gva: u64, sregs: CommonSpecialRegisters, entrypoint: NextAction, @@ 
-451,54 +917,99 @@ impl Snapshot { let mut phys_seen = HashMap::::new(); let memory = shared_mem.with_contents(|snap_c| { scratch_mem.with_contents(|scratch_c| { - // Pass 1: count how many pages need to live - let live_pages = - filtered_mappings(snap_c, scratch_c, ®ions, layout, root_pt_gpa); + // Build CoW resolution map (i686 only): maps original GPAs + // to their CoW'd scratch GPAs so the PT walker can read the + // actual page table data instead of stale snapshot copies. + #[cfg(feature = "i686-guest")] + let cow_map = { + let kernel_root = root_pt_gpas.first().copied().ok_or_else(|| { + crate::new_error!("snapshot requires at least one page directory root") + })?; + build_cow_map(snap_c, scratch_c, layout, kernel_root)? + }; + + // Pass 1: collect live pages + let live_pages = filtered_mappings( + snap_c, + scratch_c, + ®ions, + layout, + root_pt_gpas, + #[cfg(feature = "i686-guest")] + &cow_map, + ); - // Pass 2: copy them, and map them + // Pass 2: copy live pages and build new page tables // TODO: Look for opportunities to hugepage map - let pt_buf = GuestPageTableBuffer::new(layout.get_pt_base_gpa() as usize); - let mut snapshot_memory: Vec = Vec::new(); - for (mapping, contents) in live_pages { - let kind = match mapping.kind { - MappingKind::Cow(cm) => MappingKind::Cow(cm), - MappingKind::Basic(bm) if bm.writable => MappingKind::Cow(CowMapping { - readable: bm.readable, - executable: bm.executable, - }), - MappingKind::Basic(bm) => MappingKind::Basic(BasicMapping { - readable: bm.readable, - writable: false, - executable: bm.executable, - }), - MappingKind::Unmapped => continue, - }; - let new_gpa = phys_seen.entry(mapping.phys_base).or_insert_with(|| { - let new_offset = snapshot_memory.len(); - snapshot_memory.extend(contents); - new_offset + SandboxMemoryLayout::BASE_ADDRESS - }); - let mapping = Mapping { - phys_base: *new_gpa as u64, - virt_base: mapping.virt_base, - len: PAGE_SIZE as u64, - kind, - }; - unsafe { vmem::map(&pt_buf, mapping) }; - 
} - // Phase 3: Map the special mappings - map_specials(&pt_buf, layout.get_scratch_size()); - let pt_bytes = pt_buf.into_bytes(); - layout.set_pt_size(pt_bytes.len())?; - snapshot_memory.extend(&pt_bytes); - Ok::, crate::HyperlightError>(snapshot_memory) + #[cfg(not(feature = "i686-guest"))] + let (snapshot_memory, pt_bytes) = { + let mut snapshot_memory: Vec = Vec::new(); + let pt_buf = GuestPageTableBuffer::new(layout.get_pt_base_gpa() as usize); + for (mapping, contents) in live_pages { + let kind = match mapping.kind { + MappingKind::Cow(cm) => MappingKind::Cow(cm), + MappingKind::Basic(bm) if bm.writable => MappingKind::Cow(CowMapping { + readable: bm.readable, + executable: bm.executable, + }), + MappingKind::Basic(bm) => MappingKind::Basic(BasicMapping { + readable: bm.readable, + writable: false, + executable: bm.executable, + }), + MappingKind::Unmapped => continue, + }; + let new_gpa = phys_seen.entry(mapping.phys_base).or_insert_with(|| { + let new_offset = snapshot_memory.len(); + snapshot_memory.extend(contents); + new_offset + SandboxMemoryLayout::BASE_ADDRESS + }); + let mapping = Mapping { + phys_base: *new_gpa as u64, + virt_base: mapping.virt_base, + len: PAGE_SIZE as u64, + kind, + }; + unsafe { vmem::map(&pt_buf, mapping) }; + } + map_specials(&pt_buf, layout.get_scratch_size()); + let pt_data = pt_buf.into_bytes(); + layout.set_pt_size(pt_data.len())?; + snapshot_memory.extend(&pt_data); + (snapshot_memory, Vec::new()) + }; + + #[cfg(feature = "i686-guest")] + let (snapshot_memory, pt_bytes) = { + let (mem, pt) = compact_i686_snapshot( + snap_c, + scratch_c, + layout, + live_pages, + root_pt_gpas, + &cow_map, + &mut phys_seen, + )?; + layout.set_pt_size(pt.len())?; + (mem, pt) + }; + + Ok::<(Vec, Vec), crate::HyperlightError>((snapshot_memory, pt_bytes)) }) })???; + #[cfg(feature = "i686-guest")] + let (memory, separate_pt_bytes) = memory; + #[cfg(not(feature = "i686-guest"))] + let (memory, _) = memory; layout.set_snapshot_size(memory.len()); 
- // We do not need the original regions anymore, as any uses of - // them in the guest have been incorporated into the snapshot - // properly. + // For i686, keep the regions so the RAMFS and other map_file_cow + // mappings are accessible after restore. For x86_64, we do not + // need the original regions anymore, as any uses of them in the + // guest have been incorporated into the snapshot properly. + #[cfg(feature = "i686-guest")] + let regions = regions; + #[cfg(not(feature = "i686-guest"))] let regions = Vec::new(); let hash = hash(&memory, ®ions)?; @@ -511,6 +1022,10 @@ impl Snapshot { hash, stack_top_gva, sregs: Some(sregs), + #[cfg(feature = "i686-guest")] + separate_pt_bytes, + #[cfg(feature = "i686-guest")] + n_pd_roots: root_pt_gpas.len(), entrypoint, }) } @@ -557,6 +1072,20 @@ impl Snapshot { self.sregs.as_ref() } + #[cfg(feature = "i686-guest")] + pub(crate) fn separate_pt_bytes(&self) -> &[u8] { + &self.separate_pt_bytes + } + + /// Number of per-process page-directory roots captured in this + /// snapshot. Used by `restore_snapshot` to rewrite the scratch + /// PD-roots bookkeeping so a later `snapshot()` call doesn't + /// observe a stale zero count. 
+ #[cfg(feature = "i686-guest")] + pub(crate) fn n_pd_roots(&self) -> usize { + self.n_pd_roots + } + pub(crate) fn entrypoint(&self) -> NextAction { self.entrypoint } @@ -569,6 +1098,7 @@ impl PartialEq for Snapshot { } #[cfg(test)] +#[cfg(not(feature = "i686-guest"))] mod tests { use hyperlight_common::vmem::{self, BasicMapping, Mapping, MappingKind, PAGE_SIZE}; @@ -637,7 +1167,7 @@ mod tests { mgr.layout, LoadInfo::dummy(), Vec::new(), - pt_base, + &[pt_base], 0, default_sregs(), super::NextAction::None, @@ -653,7 +1183,7 @@ mod tests { mgr.layout, LoadInfo::dummy(), Vec::new(), - pt_base, + &[pt_base], 0, default_sregs(), super::NextAction::None, @@ -673,3 +1203,341 @@ mod tests { .unwrap(); } } + +#[cfg(test)] +#[cfg(feature = "i686-guest")] +mod tests { + use std::collections::HashMap; + + use hyperlight_common::vmem::i686_guest::{PAGE_ACCESSED, PAGE_PRESENT, PAGE_RW}; + use hyperlight_common::vmem::{BasicMapping, Mapping, MappingKind}; + + use super::i686_pt::{self, ADDR_MASK, PTE_COW, RW_FLAGS}; + use crate::mem::layout::SandboxMemoryLayout; + use crate::mem::memory_region::{GuestMemoryRegion, MemoryRegionFlags}; + use crate::sandbox::SandboxConfiguration; + + const PAGE_SIZE: usize = 4096; + + struct TestEnv { + layout: SandboxMemoryLayout, + snap: Vec<u8>, + scratch: Vec<u8>, + pt_base: u64, + } + + fn make_env(pt_bytes: &[u8]) -> TestEnv { + let mut cfg = SandboxConfiguration::default(); + cfg.set_heap_size(PAGE_SIZE as u64); + let layout = SandboxMemoryLayout::new(cfg, PAGE_SIZE, PAGE_SIZE, None).unwrap(); + let scratch_size = layout.get_scratch_size(); + let snapshot_size = layout.get_memory_size().unwrap(); + let snap = vec![0u8; snapshot_size]; + let mut scratch = vec![0u8; scratch_size]; + + let pt_scratch_offset = layout.get_pt_base_scratch_offset(); + assert!(pt_scratch_offset + pt_bytes.len() <= scratch.len(),); + + scratch[pt_scratch_offset..pt_scratch_offset + pt_bytes.len()].copy_from_slice(pt_bytes); + + TestEnv { + snap, + scratch, + layout, +
pt_base: layout.get_pt_base_gpa(), + } + } + + /// Decode a PTE from raw page table bytes at the given VA. + fn read_pte(pt_bytes: &[u8], pt_base_gpa: usize, va: u64) -> u32 { + let pdi = ((va >> 22) & 0x3FF) as usize; + let pti = ((va >> 12) & 0x3FF) as usize; + let pde = u32::from_le_bytes(pt_bytes[pdi * 4..pdi * 4 + 4].try_into().unwrap()); + assert_ne!( + pde & PAGE_PRESENT as u32, + 0, + "PDE for VA {va:#x} not present" + ); + let pt_offset = (pde & ADDR_MASK) as usize - pt_base_gpa; + u32::from_le_bytes( + pt_bytes[pt_offset + pti * 4..pt_offset + pti * 4 + 4] + .try_into() + .unwrap(), + ) + } + + #[test] + fn builder_map_page_writes_pde_and_pte() { + let pd_base = 0x10_0000; + let mut b = i686_pt::Builder::new(pd_base); + let va = 0x0040_0000u64; // PD index 1, PT index 0 + let pa = 0x0020_0000u64; + b.map_page(0, va, pa, RW_FLAGS); + + let pde = b.read_u32(4); + assert_ne!(pde & PAGE_PRESENT as u32, 0, "PDE should be present"); + assert_eq!((pde & ADDR_MASK) as usize, pd_base + PAGE_SIZE); + + let pte = b.read_u32(PAGE_SIZE); // PT index 0 + assert_eq!(pte & ADDR_MASK, pa as u32); + assert_eq!(pte & 0xFFF, RW_FLAGS); + + // Map a second page in the same 4MB region - PT must be reused + b.map_page(0, 0x0040_1000, 0x20_1000, RW_FLAGS); + assert_eq!(b.bytes.len(), 2 * PAGE_SIZE, "PT should be reused"); + let pte1 = b.read_u32(PAGE_SIZE + 4); + assert_eq!(pte1 & ADDR_MASK, 0x20_1000); + } + + #[test] + fn builder_map_range_crosses_pde_boundary() { + let pd_base = 0x10_0000; + let mut b = i686_pt::Builder::new(pd_base); + // Last page of PD[0] to first page of PD[1] + let va_start = 0x003F_F000u64; + let pa_start = 0x5_0000u64; + b.map_range(0, va_start, pa_start, 2 * PAGE_SIZE as u64, RW_FLAGS); + + assert_eq!(b.bytes.len(), 3 * PAGE_SIZE, "should allocate 2 PTs"); + + // Verify PTE contents across the boundary + let pt0_offset = (b.read_u32(0) & ADDR_MASK) as usize - pd_base; + let pte_last = b.read_u32(pt0_offset + 0x3FF * 4); // last entry in PT[0] + 
assert_eq!(pte_last & ADDR_MASK, pa_start as u32); + + let pt1_offset = (b.read_u32(4) & ADDR_MASK) as usize - pd_base; + let pte_first = b.read_u32(pt1_offset); // first entry in PT[1] + assert_eq!(pte_first & ADDR_MASK, (pa_start + PAGE_SIZE as u64) as u32); + } + + #[test] + fn builder_cow_flags_preserved_pde_stays_rw() { + let pd_base = 0x10_0000; + let mut b = i686_pt::Builder::new(pd_base); + let cow_flags = PAGE_PRESENT as u32 | PAGE_ACCESSED as u32 | PTE_COW; + b.map_page(0, 0x1000, 0x2000, cow_flags); + + let pti = ((0x1000u64 >> 12) & 0x3FF) as usize; + let pte = b.read_u32(PAGE_SIZE + pti * 4); + assert_ne!(pte & PTE_COW, 0, "CoW bit should be set on PTE"); + assert_eq!(pte & PAGE_RW as u32, 0, "RW should be clear for CoW PTE"); + + // PDE must remain RW so the CPU can walk the PT + let pde = b.read_u32(0); + assert_ne!( + pde & PAGE_RW as u32, + 0, + "PDE must stay RW even for CoW PTEs" + ); + } + + #[test] + fn builder_multiple_pds_independent() { + let pd_base = 0x10_0000; + let mut b = i686_pt::Builder::with_pds(pd_base, 2); + b.map_page(0, 0x1000, 0xA000, RW_FLAGS); + b.map_page(PAGE_SIZE, 0x1000, 0xB000, RW_FLAGS); + + // PTs start after the 2 PD pages + let pde0 = b.read_u32(0); + let pde1 = b.read_u32(PAGE_SIZE); + assert_eq!( + (pde0 & ADDR_MASK) as usize, + pd_base + 2 * PAGE_SIZE, + "PD[0] PT should be at first slot after PDs" + ); + assert_eq!( + (pde1 & ADDR_MASK) as usize, + pd_base + 3 * PAGE_SIZE, + "PD[1] PT should be at second slot after PDs" + ); + + // Verify the PTEs point to the correct PAs + let pti = ((0x1000u64 >> 12) & 0x3FF) as usize; + let pte0 = b.read_u32(2 * PAGE_SIZE + pti * 4); + let pte1 = b.read_u32(3 * PAGE_SIZE + pti * 4); + assert_eq!(pte0 & ADDR_MASK, 0xA000); + assert_eq!(pte1 & ADDR_MASK, 0xB000); + } + + #[test] + fn cow_map_finds_scratch_backed_pages() { + let cfg = SandboxConfiguration::default(); + let scratch_size = cfg.get_scratch_size(); + let scratch_base = 
hyperlight_common::layout::scratch_base_gpa(scratch_size); + let layout = SandboxMemoryLayout::new(cfg, PAGE_SIZE, PAGE_SIZE, None).unwrap(); + let pt_base = layout.get_pt_base_gpa() as usize; + + let mut b = i686_pt::Builder::new(pt_base); + let cow_frame = scratch_base + 0x5000; + let cow_va = 0x1000u64; + b.map_page(0, cow_va, cow_frame, RW_FLAGS); + + let TestEnv { + snap, + scratch, + layout, + pt_base, + } = make_env(&b.into_bytes()); + let cow_map = super::build_cow_map(&snap, &scratch, layout, pt_base).unwrap(); + + assert_eq!(cow_map.len(), 1); + assert_eq!(cow_map[&cow_va], cow_frame); + } + + #[test] + fn cow_map_filtering() { + let cfg = SandboxConfiguration::default(); + let scratch_size = cfg.get_scratch_size(); + let scratch_base = hyperlight_common::layout::scratch_base_gpa(scratch_size); + let layout = SandboxMemoryLayout::new(cfg, PAGE_SIZE, PAGE_SIZE, None).unwrap(); + let pt_base = layout.get_pt_base_gpa() as usize; + let mem_size = layout.get_memory_size().unwrap(); + + let mut b = i686_pt::Builder::new(pt_base); + b.map_page( + 0, + 0x1000, + SandboxMemoryLayout::BASE_ADDRESS as u64, + RW_FLAGS, + ); + let far_va = (mem_size as u64).next_multiple_of(0x0040_0000); + b.map_page(0, far_va, scratch_base + 0x1000, RW_FLAGS); + + let TestEnv { + snap, + scratch, + layout, + pt_base, + } = make_env(&b.into_bytes()); + let cow_map = super::build_cow_map(&snap, &scratch, layout, pt_base).unwrap(); + + assert!( + cow_map.is_empty(), + "neither non-scratch nor beyond-mem-size VAs should appear" + ); + } + + #[test] + fn cow_map_empty_pd() { + let cfg = SandboxConfiguration::default(); + let layout = SandboxMemoryLayout::new(cfg, PAGE_SIZE, PAGE_SIZE, None).unwrap(); + let pt_base = layout.get_pt_base_gpa() as usize; + let b = i686_pt::Builder::new(pt_base); + + let TestEnv { + snap, + scratch, + layout, + pt_base, + } = make_env(&b.into_bytes()); + let cow_map = super::build_cow_map(&snap, &scratch, layout, pt_base).unwrap(); + + 
assert!(cow_map.is_empty()); + } + + #[test] + fn initial_pt_scratch_rw_and_region_flags() { + let cfg = SandboxConfiguration::default(); + let layout = SandboxMemoryLayout::new(cfg, PAGE_SIZE, PAGE_SIZE, None).unwrap(); + + let pt_bytes = super::build_initial_i686_page_tables(&layout).unwrap(); + let pt_base = layout.get_pt_base_gpa() as usize; + + // Scratch must be mapped as RW without CoW + let scratch_size = layout.get_scratch_size(); + let scratch_gva = hyperlight_common::layout::scratch_base_gva(scratch_size); + let scratch_pte = read_pte(&pt_bytes, pt_base, scratch_gva); + assert_ne!(scratch_pte & PAGE_PRESENT as u32, 0); + assert_ne!(scratch_pte & PAGE_RW as u32, 0, "scratch must be writable"); + assert_eq!(scratch_pte & PTE_COW, 0, "scratch must not be CoW"); + + // Verify region permissions: writable -> CoW, read-only -> no CoW + let regions = layout.get_memory_regions_::<GuestMemoryRegion>(()).unwrap(); + + for rgn in &regions { + let is_writable = rgn.flags.contains(MemoryRegionFlags::WRITE); + let va = rgn.guest_region.start as u64; + let pte = read_pte(&pt_bytes, pt_base, va); + assert_ne!(pte & PAGE_PRESENT as u32, 0); + if is_writable { + assert_ne!(pte & PTE_COW, 0, "writable region at {va:#x} should be CoW"); + assert_eq!(pte & PAGE_RW as u32, 0, "CoW at {va:#x} must clear RW"); + } else { + assert_eq!(pte & PTE_COW, 0, "RO region at {va:#x} must not be CoW"); + } + } + } + + #[test] + fn compact_deduplicates_shared_physical_pages() { + let shared_phys = 0x2000u64; + let page_data = vec![0xAAu8; PAGE_SIZE]; + + let make_mapping = |virt_base: u64| Mapping { + phys_base: shared_phys, + virt_base, + len: PAGE_SIZE as u64, + kind: MappingKind::Basic(BasicMapping { + readable: true, + writable: true, + executable: false, + }), + }; + + let TestEnv { + snap, + scratch, + layout, + pt_base, + } = make_env(&[0u8; PAGE_SIZE]); + + let cow_map = HashMap::new(); + let mut phys_seen = HashMap::new(); + + let live_pages: Vec<(Mapping, &[u8])> = vec![ + (make_mapping(0x1000),
&page_data), + (make_mapping(0x5000), &page_data), + ]; + + let (snapshot_mem, _pt_bytes) = super::compact_i686_snapshot( + &snap, + &scratch, + layout, + live_pages, + &[pt_base], + &cow_map, + &mut phys_seen, + ) + .unwrap(); + + assert_eq!( + snapshot_mem.len(), + PAGE_SIZE, + "shared physical page should be deduplicated" + ); + } + + #[test] + fn compact_empty_roots_returns_error() { + let TestEnv { + snap, + scratch, + layout, + .. + } = make_env(&[0u8; PAGE_SIZE]); + let cow_map = HashMap::new(); + let mut phys_seen = HashMap::new(); + + let result = super::compact_i686_snapshot( + &snap, + &scratch, + layout, + Vec::new(), + &[], + &cow_map, + &mut phys_seen, + ); + assert!(result.is_err(), "empty root_pt_gpas should return an error"); + } +} diff --git a/src/hyperlight_host/src/sandbox/uninitialized.rs b/src/hyperlight_host/src/sandbox/uninitialized.rs index e737d08da..23c01be28 100644 --- a/src/hyperlight_host/src/sandbox/uninitialized.rs +++ b/src/hyperlight_host/src/sandbox/uninitialized.rs @@ -31,7 +31,7 @@ use crate::func::{ParameterTuple, SupportedReturnType}; use crate::log_build_details; use crate::mem::memory_region::{DEFAULT_GUEST_BLOB_MEM_FLAGS, MemoryRegionFlags}; use crate::mem::mgr::SandboxMemoryManager; -#[cfg(feature = "nanvix-unstable")] +#[cfg(feature = "guest-counter")] use crate::mem::shared_mem::HostSharedMemory; use crate::mem::shared_mem::{ExclusiveSharedMemory, SharedMemory}; use crate::sandbox::SandboxConfiguration; @@ -76,26 +76,26 @@ pub(crate) struct SandboxRuntimeConfig { /// /// Only one `GuestCounter` may be created per sandbox; a second call to /// [`UninitializedSandbox::guest_counter()`] returns an error. 
-#[cfg(feature = "nanvix-unstable")] +#[cfg(feature = "guest-counter")] pub struct GuestCounter { inner: Mutex<GuestCounterInner>, } -#[cfg(feature = "nanvix-unstable")] +#[cfg(feature = "guest-counter")] struct GuestCounterInner { deferred_hshm: Arc<Mutex<Option<HostSharedMemory>>>, offset: usize, value: u64, } -#[cfg(feature = "nanvix-unstable")] +#[cfg(feature = "guest-counter")] impl core::fmt::Debug for GuestCounter { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("GuestCounter").finish_non_exhaustive() } } -#[cfg(feature = "nanvix-unstable")] +#[cfg(feature = "guest-counter")] impl GuestCounter { /// Increments the counter by one and writes it to guest memory. pub fn increment(&self) -> Result<()> { @@ -174,12 +174,12 @@ pub struct UninitializedSandbox { /// view of scratch memory. Code that needs host-style volatile access /// before `evolve()` (e.g. `GuestCounter`) can clone this `Arc` and /// will see `Some` once `evolve()` completes. - #[cfg(feature = "nanvix-unstable")] + #[cfg(feature = "guest-counter")] pub(crate) deferred_hshm: Arc<Mutex<Option<HostSharedMemory>>>, /// Set to `true` once a [`GuestCounter`] has been handed out via /// [`guest_counter()`](Self::guest_counter). Prevents creating /// multiple counters that would have divergent cached values. - #[cfg(feature = "nanvix-unstable")] + #[cfg(feature = "guest-counter")] counter_taken: std::sync::atomic::AtomicBool, /// File mappings prepared by [`Self::map_file_cow`] that will be /// applied to the VM during [`Self::evolve`]. @@ -287,7 +287,7 @@ impl UninitializedSandbox { /// /// This method can only be called once; a second call returns an error /// because multiple counters would have divergent cached values.
- #[cfg(feature = "nanvix-unstable")] + #[cfg(feature = "guest-counter")] pub fn guest_counter(&mut self) -> Result<GuestCounter> { use std::sync::atomic::Ordering; @@ -376,9 +376,9 @@ impl UninitializedSandbox { rt_cfg, load_info: snapshot.load_info(), stack_top_gva: snapshot.stack_top_gva(), - #[cfg(feature = "nanvix-unstable")] + #[cfg(feature = "guest-counter")] deferred_hshm: Arc::new(Mutex::new(None)), - #[cfg(feature = "nanvix-unstable")] + #[cfg(feature = "guest-counter")] counter_taken: std::sync::atomic::AtomicBool::new(false), pending_file_mappings: Vec::new(), }; @@ -552,7 +552,7 @@ impl UninitializedSandbox { /// Populate the deferred `HostSharedMemory` slot without running /// the full `evolve()` pipeline. Used in tests where guest boot /// is not available. - #[cfg(all(test, feature = "nanvix-unstable"))] + #[cfg(all(test, feature = "guest-counter"))] fn simulate_build(&self) { let hshm = self.mgr.scratch_mem.as_host_shared_memory(); #[allow(clippy::unwrap_used)] @@ -1569,7 +1569,7 @@ mod tests { } } - #[cfg(feature = "nanvix-unstable")] + #[cfg(feature = "guest-counter")] mod guest_counter_tests { use hyperlight_testing::simple_guest_as_string; diff --git a/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs b/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs index 428594d37..7f0cc1c0d 100644 --- a/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs +++ b/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs @@ -41,7 +41,7 @@ pub(super) fn evolve_impl_multi_use(u_sbox: UninitializedSandbox) -> Result