diff --git a/Cargo.lock b/Cargo.lock index 7fefcf2..3955f89 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,56 @@ dependencies = [ "memchr", ] +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys", +] + [[package]] name = "anyhow" version = "1.0.101" @@ -28,18 +78,74 @@ name = "cargo-hyperlight" version = "0.1.7" dependencies = [ "anyhow", + "clap", "console", "const_format", "glob", "libc", + "object", "os_str_bytes", "regex", + "rustc-demangle", "semver", "serde", "serde_json", + "tempfile", "which", ] +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "clap" +version = 
"4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + [[package]] name = "console" version = "0.16.2" @@ -95,12 +201,42 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + [[package]] name = "glob" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" +[[package]] +name = "heck" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + [[package]] name = "itoa" version = "1.0.15" @@ -125,12 +261,27 @@ version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" +[[package]] +name = "object" +version = "0.36.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + [[package]] name = "os_str_bytes" version = "7.1.1" @@ -158,6 +309,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "regex" version = "1.12.3" @@ -187,6 +344,12 @@ version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +[[package]] +name = "rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + [[package]] name = "rustix" version = "1.1.2" @@ -253,6 
+416,12 @@ dependencies = [ "zmij", ] +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "syn" version = "2.0.111" @@ -264,6 +433,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tempfile" +version = "3.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" +dependencies = [ + "fastrand", + "getrandom", + "once_cell", + "rustix", + "windows-sys", +] + [[package]] name = "unicode-ident" version = "1.0.22" @@ -282,6 +464,21 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + [[package]] name = "which" version = "8.0.0" @@ -315,6 +512,12 @@ version = "0.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904" +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" + [[package]] name = "zmij" version = "1.0.12" diff --git a/Cargo.toml b/Cargo.toml index d4b0c28..4d19c21 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,13 +10,17 @@ description = "cargo subcommand to build hyperlight guest binaries" [dependencies] anyhow = 
"1.0" +clap = { version = "4", features = ["derive"] } console = "0.16" const_format = "0.2" glob = "0.3" libc = "0.2" +object = { version = "0.36", default-features = false, features = ["read", "elf"] } os_str_bytes = "7.1.1" regex = "1.12" +rustc-demangle = "0.1" semver = { version = "1.0", features = ["serde"] } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +tempfile = "3" which = { version = "8", features = ["regex"] } diff --git a/src/main.rs b/src/main.rs index c84d690..0fdff8a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,28 +2,36 @@ use std::env; use cargo_hyperlight::cargo; +mod perf; + const VERSION: &str = env!("CARGO_PKG_VERSION"); const GIT_HASH: &str = env!("GIT_HASH"); const GIT_DATE: &str = env!("GIT_DATE"); fn main() { - if env::args().any(|arg| arg == "--version" || arg == "-V") { - println!("cargo-hyperlight {} ({} {})", VERSION, GIT_HASH, GIT_DATE); - return; + // Skip binary name; when invoked as `cargo hyperlight`, cargo passes + // "hyperlight" as argv[1] — skip that too. 
+ let mut args = env::args_os().skip(1).peekable(); + if args.peek().is_some_and(|a| a == "hyperlight") { + args.next(); } - let args = env::args_os().enumerate().filter_map(|(i, arg)| { - // skip the binary name and the "hyperlight" subcommand if present - if i == 0 || (i == 1 && arg == "hyperlight") { - None - } else { - Some(arg) + match args.peek().map(|a| a.to_os_string()) { + Some(a) if a == "--version" || a == "-V" => { + println!("cargo-hyperlight {} ({} {})", VERSION, GIT_HASH, GIT_DATE); } - }); - - cargo() - .expect("Failed to create cargo command") - .args(args) - .status() - .expect("Failed to execute cargo") + Some(a) if a == "perf" => { + if let Err(e) = perf::run(args) { + eprintln!("{e:?}"); + std::process::exit(1); + } + } + _ => { + cargo() + .expect("Failed to create cargo command") + .args(args) + .status() + .expect("Failed to execute cargo"); + } + } } diff --git a/src/perf.rs b/src/perf.rs new file mode 100644 index 0000000..ac20454 --- /dev/null +++ b/src/perf.rs @@ -0,0 +1,628 @@ +// Copyright 2026 The Hyperlight Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! `cargo hyperlight perf` — Profile Hyperlight guest execution with `perf kvm`. +//! +//! This subcommand automates the workflow of generating guest symbol information +//! and running `perf kvm` to profile code executing inside Hyperlight micro-VMs. +//! +//! # How it works +//! +//! 
Hyperlight loads guest PIE ELF binaries at a configurable base address (default +//! `0x1000` with init-paging). `perf kvm` resolves guest samples using a +//! kallsyms-format text file (`--guestkallsyms`) containing symbol addresses +//! shifted to match the runtime guest layout (ELF VA + base address). +//! +//! This command: +//! 1. Reads the guest ELF binary using the `object` crate +//! 2. Generates a kallsyms file with addresses shifted by the base address +//! 3. Runs `perf kvm record` with the appropriate flags +//! 4. Displays a `perf kvm report` with demangled symbols +//! +//! To mitigate sample misattribution on pre-Ice Lake CPUs (see below), the +//! generated kallsyms includes synthetic `__gap__` symbols between functions +//! wherever there are inter-function regions (alignment padding, unused code). +//! This prevents perf's `symbols__fixup_end()` from stretching function ranges +//! across gaps, which would cause skidded NMI samples to be misattributed to +//! the preceding function. +//! +//! **Important:** The gap marker name must NOT use bracket characters (e.g. +//! `[gap]`), because perf's kallsyms parser interprets `[name]` as a kernel +//! module annotation (like `/proc/kallsyms` lines ending in `[module_name]`). +//! Using brackets corrupts the symbol table and causes addresses inside +//! nearby functions to become unresolvable (shown as raw hex in reports). +//! +//! # Why gap markers are needed (and when they matter) +//! +//! ## The kallsyms format has no size information +//! +//! `perf kvm --guestkallsyms` accepts a kallsyms-format file: lines of +//! `address type name` — nothing else. This format was designed for Linux +//! kernel profiling, where `/proc/kallsyms` lists kernel symbols that are +//! typically laid out contiguously with no gaps. Crucially, **kallsyms does +//! not carry `st_size`** — there is no way to express a symbol's extent. +//! +//! ## `symbols__fixup_end()` assumes contiguous layout +//! +//! 
Since kallsyms has no size field, perf's `symbols__fixup_end()` +//! (tools/perf/util/symbol.c) sets each symbol's end address to the start +//! of the next symbol. For contiguous kernel text this is correct, but for +//! a general ELF binary it's wrong: functions may have alignment padding +//! or gaps from linker section placement between them. Without gap markers, +//! `symbols__fixup_end()` would stretch each function's range to the next +//! function. +//! +//! ## Why `perf kvm` can't just read the ELF +//! +//! Normal userspace profiling (`perf record ./binary`) doesn't have this +//! problem — perf reads the ELF directly via `/proc/<pid>/maps` + the +//! binary's `.symtab`/`.dynsym`, which include `st_size`. But KVM guest +//! profiling goes through a completely different code path: the guest RIP +//! in samples is a guest virtual address with no associated host process +//! or `/proc` mapping. `perf kvm` resolves these addresses using the +//! kallsyms mechanism (designed for kernel symbol resolution), which has +//! no concept of ELF symbol sizes. There is no `--guest-elf` option. +//! +//! ## When gap markers matter +//! +//! **Pre-Ice Lake (no guest PEBS):** NMI skid causes the sampled guest RIP +//! to be tens to hundreds of instructions away from the true overflow point. +//! Skidded samples can land in gap regions (alignment padding, unused code). +//! Without gap markers, `symbols__fixup_end()` stretches the preceding +//! function's range to cover the gap, and these skidded samples are +//! misattributed to that function. Gap markers absorb these samples instead. +//! +//! **Ice Lake+ (guest PEBS, `precise_ip=3`):** PEBS records the exact +//! instruction that retired at counter overflow. Gap regions contain no +//! executable code (only alignment padding, `nop`/`int3` bytes), so no +//! instruction ever retires there and no sample will have an IP in a gap. +//! Whether `symbols__fixup_end()` stretches ranges across gaps or not has +//!
no effect on `perf report` output — the sample counts are identical +//! either way. Gap markers are harmless but have no practical impact. +//! +//! # Modes +//! +//! - **Guest-only** (default): `perf kvm record` captures only guest samples. +//! - **Combined** (`--host`): `perf kvm --host --guest` captures host and +//! guest samples scoped to the workload process tree. +//! +//! # Requirements +//! +//! The guest ELF binary must contain a `.symtab` section with function symbols. +//! Debug info (`.debug_*` sections) is **not** needed — only the symbol table +//! matters. Rust release builds (which omit debug info by default) work fine +//! since `.symtab` is preserved. For Rust, only `strip = "symbols"` or +//! `strip = true` in the Cargo profile will remove `.symtab` and break +//! profiling. For C/C++, `strip -s` / `--strip-all` has the same effect; +//! `strip --strip-debug` is safe. +//! +//! # Limitations +//! +//! Flat profiles only (no guest call stacks). `perf kvm` cannot unwind the +//! guest stack because guest virtual addresses are not resolvable through host +//! page tables. +//! +//! # Known issue: guest IP imprecision on pre-Ice Lake CPUs +//! +//! On pre-Ice Lake CPUs, `perf kvm` guest profiles are **unreliable for +//! function-level attribution**. Samples may appear in never-called functions. +//! This is a hardware limitation, not a software bug. +//! +//! ## Root cause +//! +//! Guest PEBS is only available on Ice Lake+. On older CPUs, `perf kvm` +//! falls back to NMI-based sampling (`precise_ip=0`). The PMU counter +//! overflows at instruction X, but the NMI is recognized many instructions +//! later (skid). The NMI triggers a VMEXIT, and KVM reads `GUEST_RIP` from +//! the VMCS—which reflects the skidded position, not the overflow point. +//! +//! The KVM path: `vmx_vcpu_enter_exit()` → NMI exit → `vmx_do_nmi_irqoff()` +//! → host NMI handler → `perf_instruction_pointer()` → `kvm_rip_read(vcpu)` +//! → `vmcs_readl(GUEST_RIP)`. +//! +//! 
## Consequences +//! +//! On Broadwell, empirical analysis showed the captured IPs are **byte-level +//! random** within hot code regions: +//! +//! - Most guest IPs land at non-instruction-boundary addresses, at a rate +//! matching random chance given the average x86 instruction length. +//! - IPs do cluster in genuinely hot ~KB-scale code regions (cold code +//! gets zero samples), but within those regions the byte position is +//! random. +//! - Function attribution is proportional to byte size, not execution +//! frequency. Large functions in hot regions attract disproportionate +//! samples even if never called. +//! +//! ## Workarounds +//! +//! - **Native profiling**: Build guest code as a native binary and profile +//! with `perf record -e cycles:pp` for PEBS-quality results. +//! - **Upgrade to Ice Lake+**: Enables guest PEBS with `precise_ip=3`. +//! - **Treat profiles as region-level heatmaps**: ~KB-scale region hotness +//! is valid; per-function percentages are not. + +use std::ffi::OsString; +use std::fmt::Write as _; +use std::fs; +use std::io::{BufRead, BufReader}; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; + +use anyhow::{Context, Result, bail}; +use clap::Parser; +use object::read::elf::ElfFile64; +use object::{Endianness, Object, ObjectSection, ObjectSymbol, SymbolKind}; + +/// Default base address where Hyperlight loads guest binaries (init-paging). +const DEFAULT_BASE_ADDRESS: u64 = 0x1000; + +/// Default sampling frequency in Hz (prime near 5 kHz to avoid aliasing). +const DEFAULT_FREQ: u32 = 4999; + +/// Default output file for perf data. +const DEFAULT_OUTPUT: &str = "/tmp/perf_guest.data"; + +/// Profile Hyperlight guest execution with perf kvm (Linux/KVM only). +/// +/// Records CPU cycle samples inside Hyperlight micro-VMs and displays a +/// flat profile with demangled Rust symbols. +#[derive(Parser, Debug)] +#[command(name = "perf")] +struct PerfArgs { + /// Path to the guest ELF binary. 
+ guest_binary: PathBuf, + + /// Command to run as the profiling workload. + #[arg(last = true, required = true, num_args = 1..)] + workload: Vec, + + /// Sampling frequency in Hz. + #[arg(long, default_value_t = DEFAULT_FREQ)] + freq: u32, + + /// Output perf.data path. + #[arg(long, default_value = DEFAULT_OUTPUT)] + output: PathBuf, + + /// Include host kernel and userspace samples alongside guest samples. + #[arg(long)] + host: bool, + + /// Group report by guest, kernel, and userspace (requires --host). + #[arg(long, requires = "host")] + group: bool, + + /// Guest load base address (hex with 0x prefix or decimal). + #[arg(long, default_value_t = DEFAULT_BASE_ADDRESS, value_parser = parse_hex_or_dec)] + base_address: u64, +} + +/// Main entry point for `cargo hyperlight perf`. +/// +/// The iterator should start with the subcommand name ("perf"), which +/// clap consumes as the binary name (argv\[0\]). +pub fn run(args: impl Iterator) -> Result<()> { + let args = PerfArgs::parse_from(args); + + // Verify prerequisites + check_prerequisites()?; + + // Step 1–2: Read guest ELF and generate kallsyms (with gap markers) + let kallsyms_content = generate_kallsyms(&args.guest_binary, args.base_address)?; + + let kallsyms_file = tempfile::Builder::new() + .suffix(".kallsyms") + .tempfile() + .context("Failed to create temp file for kallsyms")?; + fs::write(kallsyms_file.path(), &kallsyms_content).context("Failed to write kallsyms file")?; + + // Step 3: Record with perf kvm + let mode_label = if args.host { + "host+guest" + } else { + "guest-only" + }; + eprintln!( + "Recording {mode_label} cycles @ {} Hz -> {}", + args.freq, + args.output.display() + ); + + eprintln!( + "Workload: {}", + args.workload + .iter() + .map(|a| a.to_string_lossy()) + .collect::>() + .join(" ") + ); + eprintln!(); + + record_perf(&args, kallsyms_file.path())?; + eprintln!(); + + // Step 4: Report + report_perf(&args, kallsyms_file.path())?; + + eprintln!(); + eprintln!("Data saved to {}", 
args.output.display()); + + Ok(()) +} + +/// Check that we're on Linux with KVM and perf available. +fn check_prerequisites() -> Result<()> { + #[cfg(not(target_os = "linux"))] + { + bail!("cargo hyperlight perf requires Linux with KVM"); + } + + #[cfg(target_os = "linux")] + { + which("perf").context("perf not found (install linux-perf / perf-tools / linux-tools)")?; + + let kvm = Path::new("/dev/kvm"); + if !kvm.exists() { + bail!("No KVM device found at /dev/kvm"); + } + + // Check perf_event_paranoid + if let Ok(val) = fs::read_to_string("/proc/sys/kernel/perf_event_paranoid") + && let Ok(n) = val.trim().parse::() + && n > 1 + { + eprintln!( + "Warning: perf_event_paranoid={n} (need <=1). Run: sudo sysctl kernel.perf_event_paranoid=-1" + ); + } + + // Detect if we're running inside a VM or WSL2. The CPU sets the + // "hypervisor" flag (CPUID leaf 0x1, ECX bit 31) when running under + // a hypervisor. In a VM the host PMU is virtualized and may not + // support the events needed for `perf kvm` guest profiling — samples + // may be missing or empty. WSL2 runs in a Hyper-V VM and has the + // same limitation. + if let Ok(cpuinfo) = fs::read_to_string("/proc/cpuinfo") + && cpuinfo + .lines() + .any(|l| l.starts_with("flags") && l.contains(" hypervisor")) + { + eprintln!( + "Warning: running inside a VM (hypervisor CPU flag detected). \ + The virtualized PMU may not support KVM guest profiling — \ + you may get zero guest samples. For reliable results, run \ + on bare-metal hardware." + ); + } + + Ok(()) + } +} + +/// Generate a kallsyms-format string from the guest ELF binary. +/// +/// For each defined symbol with a nonzero address, the output line is: +/// `{address + base_address:016x} T {name}` +/// +/// Symbols are sorted by address ascending (as required by kallsyms format). +/// We also inject `_text` and `_stext` symbols at the `.text` section address +/// so that `perf kvm` can set up the guest kernel map. 
+/// +/// ## Why we can't just emit raw symbols +/// +/// The kallsyms format (`address type name`) carries no size information. +/// `perf kvm` processes these symbols through `symbols__fixup_end()` +/// (tools/perf/util/symbol.c), which extends each symbol's range to the +/// start of the next symbol — a heuristic designed for contiguous kernel +/// text. For general ELF binaries with gaps between functions (alignment +/// padding, dead code, linker-placed sections), this causes misattribution: +/// samples in gaps are credited to the preceding function. +/// +/// Unlike userspace profiling where perf reads the ELF's `.symtab` with +/// `st_size` via `/proc//maps`, KVM guest samples are guest virtual +/// addresses with no host-side process or memory mapping. `perf kvm` has +/// no `--guest-elf` option and cannot read symbol sizes from the binary. +/// +/// ## Gap markers +/// +/// To compensate, we read `st_size` from the ELF ourselves and inject +/// synthetic `__gap__` markers at each function's true end whenever a +/// gap exists before the next function. `symbols__fixup_end()` then clips +/// each real symbol at its true boundary. On pre-Ice Lake CPUs (no guest +/// PEBS), NMI skid causes samples to land in gap regions — the `__gap__` +/// markers absorb these instead of letting them inflate a neighboring +/// function. On Ice Lake+ with PEBS, no sample lands in gaps (no code +/// executes there), so the markers have no practical effect but are +/// harmless. +fn generate_kallsyms(guest_binary: &Path, base_address: u64) -> Result { + let data = fs::read(guest_binary) + .with_context(|| format!("Cannot read {}", guest_binary.display()))?; + + let elf = ElfFile64::::parse(&*data) + .with_context(|| format!("Failed to parse ELF: {}", guest_binary.display()))?; + + // Find .text section address for _text/_stext injection (after dedup). 
+ let text_addr = elf + .section_by_name(".text") + .map(|s| s.address() + base_address); + + // Collect (shifted_addr, size, name) for all defined function symbols. + let mut syms: Vec<(u64, u64, String)> = Vec::new(); + + for sym in elf.symbols() { + // Only include function symbols (STT_FUNC / STT_GNU_IFUNC). + // Data, section, and NOTYPE symbols have addresses in .rodata/.data + // that interleave with .text, causing wrong attribution and bad gaps. + if sym.kind() != SymbolKind::Text { + continue; + } + + let name = match sym.name() { + Ok(n) if !n.is_empty() => n.to_string(), + _ => continue, + }; + + let addr = sym.address(); + if addr == 0 { + continue; + } + + syms.push((addr + base_address, sym.size(), name)); + } + + // Sort by address, then by size descending (so largest-size symbol + // comes first at each address — this matters for ICF dedup below). + syms.sort_by(|a, b| a.0.cmp(&b.0).then(b.1.cmp(&a.1))); + + if syms.is_empty() { + bail!( + "No symbols found in {}. Is it a stripped binary?", + guest_binary.display() + ); + } + + // Deduplicate ICF (Identical Code Folding) symbols. + // + // ICF merges function bodies with identical machine code, leaving + // multiple symbol names at the same address with the same size. + // perf's symbols__fixup_end() would give all but the last symbol + // at the same address a zero-length range, so we keep only the + // first (largest-size) symbol at each address. + let total_before_dedup = syms.len(); + syms.dedup_by_key(|s| s.0); + let dedup_removed = total_before_dedup - syms.len(); + + // After dedup every address is unique, so gap computation below + // can assume each consecutive pair has distinct addresses. + + // Build the final symbol list, injecting gap markers between + // consecutive symbols wherever there are inter-function regions + // (alignment padding, unused code, linker-placed data). 
+ // + // For each symbol we compute an effective end: + // - st_size > 0: effective_end = min(addr + st_size, next_addr) + // The min() handles inflated st_size from ICF or over-estimated + // linker sizes that overlap the next symbol. + // - st_size == 0: the function's true size is unknown. We let + // symbols__fixup_end() extend it to the next symbol (correct + // for contiguous code). No gap is injected. + // + // A gap marker is placed at effective_end whenever it falls short + // of the next symbol's address, absorbing PMU samples that land + // in dead/unreachable code. + let mut final_syms: Vec<(u64, String)> = Vec::with_capacity(syms.len() * 2); + let mut gap_count: usize = 0; + + for (i, (addr, size, name)) in syms.iter().enumerate() { + final_syms.push((*addr, name.clone())); + + if let Some((next_addr, _, _)) = syms.get(i + 1) { + let effective_end = if *size > 0 { + (*addr + *size).min(*next_addr) + } else { + // Size unknown — let symbols__fixup_end() extend to + // next symbol (no gap injection). + continue; + }; + + if effective_end < *next_addr { + final_syms.push((effective_end, "__gap__".to_string())); + gap_count += 1; + } + } + } + + // Inject _text and _stext AFTER gap computation — perf kvm requires + // these markers at the .text section address to set up the guest + // kernel map. They are inserted into the final symbol list (not + // into `syms`) to avoid interfering with gap detection: if they + // were present during gap iteration, the same-address entries + // would prevent gap injection after the first .text function. + if let Some(addr) = text_addr { + final_syms.push((addr, "_text".to_string())); + final_syms.push((addr, "_stext".to_string())); + final_syms.sort_by_key(|s| s.0); + } + + let mut output = String::new(); + for (addr, name) in &final_syms { + let demangled = rustc_demangle::demangle(name); + writeln!(output, "{addr:016x} T {demangled:#}").unwrap(); + } + + // Dump kallsyms to a debug file for inspection. 
+ let debug_path = "/tmp/debug_kallsyms.txt"; + if let Err(e) = fs::write(debug_path, &output) { + eprintln!("Warning: could not write debug kallsyms to {debug_path}: {e}"); + } + + eprintln!( + "Prepared {} guest symbols ({} ICF duplicates removed), {} gap markers (base +{:#x})", + syms.len(), + dedup_removed, + gap_count, + base_address + ); + + Ok(output) +} + +/// Build the common `perf kvm` argument prefix used by both record and report. +fn perf_kvm_args(args: &PerfArgs, kallsyms: &Path) -> Vec { + let mut perf_args: Vec = vec!["kvm".into()]; + if args.host { + perf_args.push("--host".into()); + perf_args.push("--guest".into()); + } + perf_args.push(format!("--guestkallsyms={}", kallsyms.display()).into()); + perf_args +} + +/// Run `perf kvm record` scoped to the workload process tree. +fn record_perf(args: &PerfArgs, kallsyms: &Path) -> Result<()> { + let mut perf_args = perf_kvm_args(args, kallsyms); + perf_args.extend([ + "record".into(), + "-e".into(), // event selector + "cycles".into(), // hardware CPU cycle counter + "-F".into(), // sampling frequency in Hz + args.freq.to_string().into(), + "-o".into(), // output file path + args.output.as_os_str().to_owned(), + "--".into(), + ]); + perf_args.extend(args.workload.iter().cloned()); + + let status = Command::new("perf") + .args(&perf_args) + .status() + .context("Failed to execute perf")?; + + // perf kvm record passes through the workload's exit code, so any + // non-zero may just mean the workload itself returned non-zero (which + // is fine — data was still recorded). Only warn rather than bail. + if let Some(code) = status.code() + && code != 0 + { + eprintln!("Warning: perf kvm record exited with status {code} (workload may have failed)"); + } + + Ok(()) +} + +/// Run `perf kvm report` and format the output. +fn report_perf(args: &PerfArgs, kallsyms: &Path) -> Result<()> { + if args.host { + eprintln!("Host + guest profile:"); + if !args.group { + eprintln!(" [.] 
= userspace [k] = kernel [g] = guest"); + } + } else { + eprintln!("Guest profile:"); + } + + let mut perf_args = perf_kvm_args(args, kallsyms); + perf_args.extend([ + "report".into(), + "-i".into(), + args.output.as_os_str().to_owned(), + "--stdio".into(), + "--no-children".into(), + "-F".into(), + "overhead,sym".into(), + ]); + + let mut child = Command::new("perf") + .args(&perf_args) + .stderr(Stdio::piped()) + .stdout(Stdio::piped()) + .spawn() + .context("Failed to execute perf report")?; + + let stdout = child.stdout.take().expect("stdout piped"); + + let lines = collect_report_lines(stdout)?; + + if args.group { + print_grouped(&lines); + } else if args.host { + for line in &lines { + println!("{}", line.trim_end()); + } + } else { + for line in &lines { + println!("{}", line.replace("[g] ", "").trim_end()); + } + } + + let status = child.wait().context("Failed to wait for perf report")?; + if !status.success() { + bail!("perf report exited with status {status}"); + } + + Ok(()) +} + +/// Read perf report stdout, filtering out header/comment lines. +fn collect_report_lines(stdout: impl std::io::Read) -> Result> { + let mut lines = Vec::new(); + for line in BufReader::new(stdout).lines() { + let line = line.context("Failed to read perf output")?; + if line.starts_with('#') || line.is_empty() { + continue; + } + lines.push(line); + } + Ok(lines) +} + +/// Print report lines grouped by guest, kernel, and userspace. 
+fn print_grouped(lines: &[String]) { + let (mut guest, mut kernel, mut user) = (Vec::new(), Vec::new(), Vec::new()); + for line in lines { + if line.contains("[g]") { + guest.push(line); + } else if line.contains("[k]") { + kernel.push(line); + } else { + user.push(line); + } + } + + for (header, group) in [("Guest", &guest), ("Kernel", &kernel), ("Userspace", &user)] { + if group.is_empty() { + continue; + } + println!("\n {header}:"); + for line in group { + println!("{}", line.trim_end()); + } + } +} + +/// Parse a number as hex (0x prefix) or decimal. +fn parse_hex_or_dec(s: &str) -> Result { + if let Some(hex) = s.strip_prefix("0x").or_else(|| s.strip_prefix("0X")) { + u64::from_str_radix(hex, 16).map_err(|e| format!("invalid hex number '{s}': {e}")) + } else { + s.parse::() + .map_err(|e| format!("invalid number '{s}': {e}")) + } +} + +#[cfg(target_os = "linux")] +fn which(cmd: &str) -> Result { + which::which(cmd).with_context(|| format!("{cmd} not found on PATH")) +}