diff --git a/cmd/vminitd/main.go b/cmd/vminitd/main.go index bde10dd..ce2c195 100644 --- a/cmd/vminitd/main.go +++ b/cmd/vminitd/main.go @@ -16,6 +16,7 @@ import ( "github.com/containerd/log" "golang.org/x/sys/unix" + "github.com/spin-stack/spinbox/internal/guest/vminit/boottime" "github.com/spin-stack/spinbox/internal/guest/vminit/config" "github.com/spin-stack/spinbox/internal/guest/vminit/service" "github.com/spin-stack/spinbox/internal/guest/vminit/system" @@ -33,6 +34,10 @@ const ( ) func main() { + // Stamp PID1 entry against CLOCK_BOOTTIME as early as possible: this is the + // clean kernel-boot number (kernel start -> init exec). + boottime.LogReady(context.Background(), "pid1-entry") + cfg, setFlags, configFile, err := config.ParseFlags(os.Args[1:]) if err != nil { if errors.Is(err, config.ErrVersionRequested) { @@ -100,6 +105,7 @@ func run(ctx context.Context, cfg *config.ServiceConfig) error { if err := system.Initialize(ctx, prof); err != nil { return err } + boottime.LogReady(ctx, "system-init") if cfg.Debug { systools.DumpInfo(ctx) diff --git a/integration/vminitd_ready_test.go b/integration/vminitd_ready_test.go new file mode 100644 index 0000000..2c0f679 --- /dev/null +++ b/integration/vminitd_ready_test.go @@ -0,0 +1,151 @@ +//go:build linux && integration + +package integration + +import ( + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "testing" + "time" + + containerd "github.com/containerd/containerd/v2/client" + "github.com/containerd/containerd/v2/pkg/cio" + "github.com/containerd/containerd/v2/pkg/namespaces" + "github.com/containerd/containerd/v2/pkg/oci" +) + +// vminitdReadyRE matches a VMINITD_READY line, e.g.: +// +// VMINITD_READY uptime_us=118233 phase=pid1-entry +var vminitdReadyRE = regexp.MustCompile(`VMINITD_READY uptime_us=(\d+) phase=(\S+)`) + +// readyPhaseOrder is the expected emission order; used to print a decomposition. +var readyPhaseOrder = []string{"pid1-entry", "system-init", "vsock-listen", "serve", "first-accept"} + +// TestVminitdReady boots a VM on the normal path and reports the VMINITD_READY +// CLOCK_BOOTTIME milestones, which decompose the guest-boot half of cold-start: +// +// pid1-entry -> kernel boot (kernel start -> init exec) +// system-init - pid1-entry -> vminitd system.Initialize +// vsock-listen - system-init -> listener bind +// serve - vsock-listen-> plugin loading +// first-accept - serve -> host-readiness lag (guest ready -> host connects) +// +// This is the clean split BOOT_TIMELINE's guest_boot (tVsock - tQMP) could not +// give, since that lumps kernel + vminitd + handshake together. +func TestVminitdReady(t *testing.T) { + cfg := loadTestConfig() + + client := setupContainerdClient(t, cfg) + defer client.Close() + + ensureImagePulled(t, client, cfg) + + ctx := namespaces.WithNamespace(t.Context(), cfg.Namespace) + + image, err := client.GetImage(ctx, cfg.Image) + if err != nil { + t.Fatalf("get image %s: %v", cfg.Image, err) + } + + name := "qbx-ready-" + strings.ReplaceAll(time.Now().Format("150405.000"), ".", "") + + container, err := client.NewContainer(ctx, name, + containerd.WithSnapshotter(cfg.Snapshotter), + containerd.WithImage(image), + containerd.WithNewSnapshot(name+"-snapshot", image), + containerd.WithRuntime(cfg.Runtime, nil), + containerd.WithNewSpec( + oci.WithImageConfig(image), + oci.WithProcessArgs("/bin/echo", "BOOTED"), + ), + ) + if err != nil { + t.Fatalf("create container %s: %v", name, err) + } + defer func() { + if err := container.Delete(ctx, containerd.WithSnapshotCleanup); err != nil { + t.Logf("cleanup container %s: %v", name, err) + } + }() + + task, err := container.NewTask(ctx, cio.NullIO) + if err != nil { + t.Fatalf("create task for %s: %v", name, err) + } + defer func() { + if _, err := task.Delete(ctx, containerd.WithProcessKill); err != nil { + if !strings.Contains(err.Error(), "ttrpc: closed") { + t.Logf("cleanup task for %s: %v", name, err) + } + } + }() + + if _, err := task.Wait(ctx); err != nil { + t.Fatalf("wait for task %s: %v", name, err) + } + if err := task.Start(ctx); err != nil { + t.Fatalf("start task %s: %v", name, err) + } + waitForTaskStatus(t, ctx, task, containerd.Running) + + consolePath := filepath.Join(logDirBase(), name, "console.log") + phases := waitForVminitdReady(t, consolePath, 30*time.Second) + + for _, p := range readyPhaseOrder { + if us, ok := phases[p]; ok { + t.Logf("VMINITD_READY %-12s uptime=%6d us (%.1f ms)", p, us, float64(us)/1000) + } + } + // Deltas attribute each segment. + prev := "pid1-entry" + for _, p := range readyPhaseOrder[1:] { + cur, ok := phases[p] + pv, okPrev := phases[prev] + if ok && okPrev { + t.Logf("VMINITD_READY delta %-24s = %6d us (%.1f ms)", prev+"->"+p, cur-pv, float64(cur-pv)/1000) + prev = p + } + } +} + +// waitForVminitdReady polls the console log until the pid1-entry stamp appears +// and returns phase->uptime_us. +func waitForVminitdReady(t *testing.T, consolePath string, timeout time.Duration) map[string]int { + t.Helper() + deadline := time.Now().Add(timeout) + for { + phases := parseVminitdReady(consolePath) + if _, ok := phases["pid1-entry"]; ok { + return phases + } + if time.Now().After(deadline) { + data, _ := os.ReadFile(consolePath) //nolint:errcheck // best-effort diagnostics + t.Fatalf("no VMINITD_READY output in %s after %v; console tail:\n%s", + consolePath, timeout, lastLines(string(data), 20)) + } + time.Sleep(200 * time.Millisecond) + } +} + +// parseVminitdReady returns the first uptime seen per phase. +func parseVminitdReady(consolePath string) map[string]int { + data, err := os.ReadFile(consolePath) + if err != nil { + return nil + } + phases := make(map[string]int) + for _, m := range vminitdReadyRE.FindAllStringSubmatch(string(data), -1) { + us, err := strconv.Atoi(m[1]) + if err != nil { + continue + } + if _, seen := phases[m[2]]; !seen { + phases[m[2]] = us + } + } + return phases +} diff --git a/internal/guest/vminit/boottime/boottime.go b/internal/guest/vminit/boottime/boottime.go new file mode 100644 index 0000000..70d305e --- /dev/null +++ b/internal/guest/vminit/boottime/boottime.go @@ -0,0 +1,41 @@ +//go:build linux + +// Package boottime stamps guest boot milestones against CLOCK_BOOTTIME so the +// guest-boot half of cold-start can be decomposed into kernel vs vminitd vs +// host-readiness. +package boottime + +import ( + "context" + "time" + + "github.com/containerd/log" + "golang.org/x/sys/unix" +) + +// Uptime returns time since the kernel started (CLOCK_BOOTTIME). Read at PID1 +// entry it is the kernel boot duration (kernel start -> init exec); read later +// it also includes vminitd's own startup. +func Uptime() time.Duration { + var ts unix.Timespec + if err := unix.ClockGettime(unix.CLOCK_BOOTTIME, &ts); err != nil { + return 0 + } + return time.Duration(ts.Sec)*time.Second + time.Duration(ts.Nsec)*time.Nanosecond +} + +// LogReady emits a grep-able VMINITD_READY line stamping a boot phase against +// CLOCK_BOOTTIME (microseconds since kernel start). Always on - it is a handful +// of lines per boot and is the only clean view of where guest-boot time goes: +// +// pid1-entry - kernel boot: kernel start -> /sbin/vminitd exec +// system-init - + vminitd system.Initialize (mounts, devices, network) +// vsock-listen - + listener bound, before plugin loading +// serve - ttrpc serving (plugin loading done) +// first-accept - host connected (gap from serve = host-readiness lag) +// +// Deltas between phases attribute the cost; the absolute pid1-entry value is the +// kernel number that last_ts_us in the kmsg profile only approximated. +func LogReady(ctx context.Context, phase string) { + log.G(ctx).Infof("VMINITD_READY uptime_us=%d phase=%s", Uptime().Microseconds(), phase) +} diff --git a/internal/guest/vminit/service/service.go b/internal/guest/vminit/service/service.go index c03bcae..401c017 100644 --- a/internal/guest/vminit/service/service.go +++ b/internal/guest/vminit/service/service.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "net" + "sync" cplugins "github.com/containerd/containerd/v2/plugins" "github.com/containerd/log" @@ -17,6 +18,7 @@ import ( "github.com/mdlayher/vsock" "github.com/spin-stack/spinbox/internal/guest/vminit" + "github.com/spin-stack/spinbox/internal/guest/vminit/boottime" "github.com/spin-stack/spinbox/internal/guest/vminit/config" ) @@ -58,6 +60,7 @@ func New(ctx context.Context, cfg *config.ServiceConfig) (Runnable, error) { "cid": cfg.VSockContextID, "port": cfg.RPCPort, }).Info("listening on vsock for RPC connections") + boottime.LogReady(ctx, "vsock-listen") cfg.Shutdown.RegisterCallback(func(ctx context.Context) error { return l.Close() }) @@ -136,10 +139,33 @@ func New(ctx context.Context, cfg *config.ServiceConfig) (Runnable, error) { }, nil } +// firstAcceptListener fires onFirstAccept the first time a connection is +// accepted. Used to stamp VMINITD_READY phase=first-accept: the gap from the +// serve phase is how long the guest waited for the host to connect - the host +// readiness lag, measured guest-side. +type firstAcceptListener struct { + net.Listener + once sync.Once + onFirstAccept func() +} + +func (f *firstAcceptListener) Accept() (net.Conn, error) { + c, err := f.Listener.Accept() + if err == nil && f.onFirstAccept != nil { + f.once.Do(f.onFirstAccept) + } + return c, err +} + // Run starts the TTRPC server and blocks until it exits. func (s *Service) Run(ctx context.Context) error { log.G(ctx).Info("starting TTRPC server") - err := s.server.Serve(ctx, s.l) + boottime.LogReady(ctx, "serve") + l := &firstAcceptListener{ + Listener: s.l, + onFirstAccept: func() { boottime.LogReady(ctx, "first-accept") }, + } + err := s.server.Serve(ctx, l) if err != nil { log.G(ctx).WithError(err).Error("TTRPC server exited with error") } else { diff --git a/internal/guest/vminit/system/kmsg.go b/internal/guest/vminit/system/kmsg.go index b249b70..f9b6375 100644 --- a/internal/guest/vminit/system/kmsg.go +++ b/internal/guest/vminit/system/kmsg.go @@ -36,9 +36,12 @@ type kmsgInitcall struct { // that run before virtio-console registers and that the hvc0 console therefore // never replays. /dev/kmsg holds the full kernel log regardless of console, and // log_buf_len=4M (set in debug-boot) keeps it from wrapping. last_ts_us is the -// timestamp of the last kernel record (~ when the kernel handed off to init), -// i.e. the kernel-side boot duration; the gap between it and sum_us is the -// non-initcall kernel work (decompression, mm/SMP bring-up, ...). +// timestamp of the last kernel record seen at drain time; it is only a rough +// upper bound on kernel boot - driver/handoff messages emitted after PID1 +// starts push it past the real boundary. For the precise kernel-boot number use +// the VMINITD_READY pid1-entry stamp (CLOCK_BOOTTIME at init exec). The gap +// between last_ts_us and sum_us still indicates non-initcall kernel work +// (decompression, mm/SMP bring-up, ...). // // No-op unless boot profiling (spin.profile) is enabled. func DumpKernelBootProfile(ctx context.Context) {