diff --git a/cmd/vminitd/main.go b/cmd/vminitd/main.go index 5f47636..bde10dd 100644 --- a/cmd/vminitd/main.go +++ b/cmd/vminitd/main.go @@ -111,6 +111,11 @@ func run(ctx context.Context, cfg *config.ServiceConfig) error { } prof.Mark(ctx, "service-new") + // Boot profiling: dump the complete kernel initcall profile from /dev/kmsg + // (includes the early ACPI/PCI initcalls the hvc0 console hides). No-op + // unless spin.profile is set. + system.DumpKernelBootProfile(ctx) + log.G(ctx).WithField("t", time.Since(t1)).Debug("initialized vminitd") // Note: The supervisor agent is now started after container creation diff --git a/integration/kmsg_profile_test.go b/integration/kmsg_profile_test.go new file mode 100644 index 0000000..608a931 --- /dev/null +++ b/integration/kmsg_profile_test.go @@ -0,0 +1,155 @@ +//go:build linux && integration + +package integration + +import ( + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "testing" + "time" + + containerd "github.com/containerd/containerd/v2/client" + "github.com/containerd/containerd/v2/pkg/cio" + "github.com/containerd/containerd/v2/pkg/namespaces" + "github.com/containerd/containerd/v2/pkg/oci" +) + +// kmsgSummaryRE matches the KMSG_PROFILE header line vminitd emits, e.g.: +// +// KMSG_PROFILE initcalls=678 sum_us=83210 last_ts_us=152331 +var kmsgSummaryRE = regexp.MustCompile(`KMSG_PROFILE initcalls=(\d+) sum_us=(\d+) last_ts_us=(\d+)`) + +// kmsgEntryRE matches a per-initcall KMSG_PROFILE line, e.g.: +// +// KMSG_PROFILE 39000 us acpi_init+0x0/0x460 +var kmsgEntryRE = regexp.MustCompile(`KMSG_PROFILE\s+(\d+) us\s+(\S+)`) + +// TestKernelBootProfileComplete boots a VM with the debug-boot annotation and +// reads the COMPLETE kernel initcall profile that vminitd dumps from /dev/kmsg. +// Unlike TestKernelBootProfile (which parses the hvc0 console and so only sees +// the late initcalls), this includes the early core/subsys initcalls - the +// suspected ACPI/PCI cost in the guest-boot half of cold-start. last_ts_us is +// the kernel-side boot duration; sum_us is the initcall total, and the gap +// between them is the non-initcall kernel work. +func TestKernelBootProfileComplete(t *testing.T) { + cfg := loadTestConfig() + + client := setupContainerdClient(t, cfg) + defer client.Close() + + ensureImagePulled(t, client, cfg) + + ctx := namespaces.WithNamespace(t.Context(), cfg.Namespace) + + image, err := client.GetImage(ctx, cfg.Image) + if err != nil { + t.Fatalf("get image %s: %v", cfg.Image, err) + } + + name := "qbx-kmsg-" + strings.ReplaceAll(time.Now().Format("150405.000"), ".", "") + + container, err := client.NewContainer(ctx, name, + containerd.WithSnapshotter(cfg.Snapshotter), + containerd.WithImage(image), + containerd.WithNewSnapshot(name+"-snapshot", image), + containerd.WithRuntime(cfg.Runtime, nil), + containerd.WithNewSpec( + oci.WithImageConfig(image), + oci.WithAnnotations(map[string]string{annotationDebugBoot: "true"}), + oci.WithProcessArgs("/bin/echo", "BOOTED"), + ), + ) + if err != nil { + t.Fatalf("create container %s: %v", name, err) + } + defer func() { + if err := container.Delete(ctx, containerd.WithSnapshotCleanup); err != nil { + t.Logf("cleanup container %s: %v", name, err) + } + }() + + task, err := container.NewTask(ctx, cio.NullIO) + if err != nil { + t.Fatalf("create task for %s: %v", name, err) + } + defer func() { + if _, err := task.Delete(ctx, containerd.WithProcessKill); err != nil { + if !strings.Contains(err.Error(), "ttrpc: closed") { + t.Logf("cleanup task for %s: %v", name, err) + } + } + }() + + if _, err := task.Wait(ctx); err != nil { + t.Fatalf("wait for task %s: %v", name, err) + } + if err := task.Start(ctx); err != nil { + t.Fatalf("start task %s: %v", name, err) + } + waitForTaskStatus(t, ctx, task, containerd.Running) + + consolePath := filepath.Join(logDirBase(), name, "console.log") + header, entries := waitForKmsgProfile(t, consolePath, 30*time.Second) + + t.Logf("KMSG_PROFILE initcalls=%d sum_us=%d (%.1f ms) last_ts_us=%d (%.1f ms kernel-to-init)", + header.initcalls, header.sumUS, float64(header.sumUS)/1000, + header.lastTSUS, float64(header.lastTSUS)/1000) + gap := header.lastTSUS - header.sumUS + t.Logf("KMSG_PROFILE non-initcall kernel work ~= %d us (%.1f ms)", gap, float64(gap)/1000) + for _, e := range entries { + t.Logf("KMSG_PROFILE %6d us %s", e.usec, e.name) + } +} + +type kmsgHeader struct { + initcalls int + sumUS int + lastTSUS int +} + +type kmsgEntry struct { + usec int + name string +} + +// waitForKmsgProfile polls the console log until vminitd's KMSG_PROFILE summary +// appears and returns the header plus the per-initcall entries. +func waitForKmsgProfile(t *testing.T, consolePath string, timeout time.Duration) (kmsgHeader, []kmsgEntry) { + t.Helper() + deadline := time.Now().Add(timeout) + for { + data, _ := os.ReadFile(consolePath) //nolint:errcheck // best-effort poll + text := string(data) + if m := kmsgSummaryRE.FindStringSubmatch(text); m != nil { + h := kmsgHeader{ + initcalls: atoiOr0(m[1]), + sumUS: atoiOr0(m[2]), + lastTSUS: atoiOr0(m[3]), + } + var entries []kmsgEntry + for _, e := range kmsgEntryRE.FindAllStringSubmatch(text, -1) { + usec := atoiOr0(e[1]) + // The summary line also starts with KMSG_PROFILE but has no + // " us" entry shape, so the entry regex naturally skips it. + entries = append(entries, kmsgEntry{usec: usec, name: e[2]}) + } + return h, entries + } + if time.Now().After(deadline) { + t.Fatalf("no KMSG_PROFILE output in %s after %v (debug-boot honored? spin.profile set? initcall_debug on?); console tail:\n%s", + consolePath, timeout, lastLines(text, 20)) + } + time.Sleep(200 * time.Millisecond) + } +} + +func atoiOr0(s string) int { + n, err := strconv.Atoi(s) + if err != nil { + return 0 + } + return n +} diff --git a/internal/guest/vminit/system/kmsg.go b/internal/guest/vminit/system/kmsg.go new file mode 100644 index 0000000..b249b70 --- /dev/null +++ b/internal/guest/vminit/system/kmsg.go @@ -0,0 +1,164 @@ +//go:build linux + +package system + +import ( + "context" + "errors" + "regexp" + "sort" + "strconv" + "strings" + + "github.com/containerd/log" + "golang.org/x/sys/unix" +) + +// kmsgInitcallRE matches an initcall_debug completion line embedded in a +// /dev/kmsg record message, e.g.: +// +// initcall pci_subsys_init+0x0/0x40 returned 0 after 12345 usecs +var kmsgInitcallRE = regexp.MustCompile(`initcall (\S+) returned \S+ after (\d+) usecs`) + +// kmsgTopN bounds how many of the slowest initcalls we print. +const kmsgTopN = 25 + +type kmsgInitcall struct { + name string + usec int +} + +// DumpKernelBootProfile reads the kernel ring buffer (/dev/kmsg) and emits a +// compact KMSG_PROFILE summary of the boot's initcalls. +// +// Unlike the console-based kernel profile, this sees the *complete* set - +// including the early core/subsys initcalls (acpi_init, pci_subsys_init, ...) +// that run before virtio-console registers and that the hvc0 console therefore +// never replays. /dev/kmsg holds the full kernel log regardless of console, and +// log_buf_len=4M (set in debug-boot) keeps it from wrapping. last_ts_us is the +// timestamp of the last kernel record (~ when the kernel handed off to init), +// i.e. the kernel-side boot duration; the gap between it and sum_us is the +// non-initcall kernel work (decompression, mm/SMP bring-up, ...). +// +// No-op unless boot profiling (spin.profile) is enabled. +func DumpKernelBootProfile(ctx context.Context) { + if !profileEnabled() { + return + } + + calls, lastTS, err := readKmsgInitcalls() + if err != nil { + log.G(ctx).WithError(err).Warn("kmsg boot profile: read failed") + return + } + if len(calls) == 0 { + log.G(ctx).Warn("kmsg boot profile: no initcall_debug records (is initcall_debug set?)") + return + } + + sum := 0 + for _, c := range calls { + sum += c.usec + } + sort.Slice(calls, func(i, j int) bool { return calls[i].usec > calls[j].usec }) + + log.G(ctx).Infof("KMSG_PROFILE initcalls=%d sum_us=%d last_ts_us=%d", len(calls), sum, lastTS) + for i, c := range calls { + if i >= kmsgTopN { + break + } + log.G(ctx).Infof("KMSG_PROFILE %8d us %s", c.usec, c.name) + } +} + +// readKmsgInitcalls reads all currently-buffered /dev/kmsg records and returns +// the initcall timings plus the highest record timestamp seen (microseconds +// since boot). +func readKmsgInitcalls() ([]kmsgInitcall, int64, error) { + // O_NONBLOCK so Read returns EAGAIN once we have drained the buffer instead + // of blocking for future messages. Raw unix.Read (not os.File) avoids the Go + // runtime poller, which would otherwise wait on EAGAIN. + fd, err := unix.Open("/dev/kmsg", unix.O_RDONLY|unix.O_NONBLOCK|unix.O_CLOEXEC, 0) + if err != nil { + return nil, 0, err + } + defer func() { _ = unix.Close(fd) }() + + var ( + calls []kmsgInitcall + lastTS int64 + buf = make([]byte, 8192) // the kernel returns one record per read + ) + + for { + n, err := unix.Read(fd, buf) + if err != nil { + switch { + case errors.Is(err, unix.EAGAIN), errors.Is(err, unix.EWOULDBLOCK): + // Drained: caught up to the end of the buffer. + return calls, lastTS, nil + case errors.Is(err, unix.EINTR): + continue + case errors.Is(err, unix.EPIPE): + // A record was overwritten between reads; the position has moved + // on, so keep reading. + continue + default: + return calls, lastTS, err + } + } + if n == 0 { + return calls, lastTS, nil + } + + ts, msg, ok := parseKmsgRecord(string(buf[:n])) + if !ok { + continue + } + if ts > lastTS { + lastTS = ts + } + if name, usec, ok := extractInitcall(msg); ok { + calls = append(calls, kmsgInitcall{name: name, usec: usec}) + } + } +} + +// parseKmsgRecord splits one /dev/kmsg record into its timestamp (microseconds) +// and message text. The record format is: +// +// ,,,[,...];\n[ \t...] +// +// ok is false only when there is no ';' separating header from message. +func parseKmsgRecord(record string) (tsUS int64, msg string, ok bool) { + semi := strings.IndexByte(record, ';') + if semi < 0 { + return 0, "", false + } + + msg = record[semi+1:] + if nl := strings.IndexByte(msg, '\n'); nl >= 0 { + msg = msg[:nl] // drop the trailing newline and any continuation lines + } + + fields := strings.Split(record[:semi], ",") + if len(fields) >= 3 { + if ts, err := strconv.ParseInt(fields[2], 10, 64); err == nil { + tsUS = ts + } + } + return tsUS, msg, true +} + +// extractInitcall pulls the initcall name and duration from a record message. +func extractInitcall(msg string) (name string, usec int, ok bool) { + m := kmsgInitcallRE.FindStringSubmatch(msg) + if m == nil { + return "", 0, false + } + usec, err := strconv.Atoi(m[2]) + if err != nil { + return "", 0, false + } + return m[1], usec, true +} diff --git a/internal/guest/vminit/system/kmsg_test.go b/internal/guest/vminit/system/kmsg_test.go new file mode 100644 index 0000000..09b09b7 --- /dev/null +++ b/internal/guest/vminit/system/kmsg_test.go @@ -0,0 +1,76 @@ +//go:build linux + +package system + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestParseKmsgRecord(t *testing.T) { + tests := []struct { + name string + record string + wantTS int64 + wantMsg string + wantOK bool + }{ + { + name: "initcall record", + record: "6,123,187940,-;initcall pci_subsys_init+0x0/0x40 returned 0 after 12345 usecs", + wantTS: 187940, + wantMsg: "initcall pci_subsys_init+0x0/0x40 returned 0 after 12345 usecs", + wantOK: true, + }, + { + name: "trailing newline and continuation dropped", + record: "5,80,5085,-;EXT4-fs (vdb): mounted\n SUBSYSTEM=block\n DEVICE=b259:0", + wantTS: 5085, + wantMsg: "EXT4-fs (vdb): mounted", + wantOK: true, + }, + { + name: "no semicolon", + record: "garbage without separator", + wantOK: false, + }, + { + name: "header too short still yields message", + record: "6;some message", + wantTS: 0, + wantMsg: "some message", + wantOK: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ts, msg, ok := parseKmsgRecord(tt.record) + assert.Equal(t, tt.wantOK, ok) + if tt.wantOK { + assert.Equal(t, tt.wantTS, ts) + assert.Equal(t, tt.wantMsg, msg) + } + }) + } +} + +func TestExtractInitcall(t *testing.T) { + t.Run("match", func(t *testing.T) { + name, usec, ok := extractInitcall("initcall acpi_init+0x0/0x460 returned 0 after 39000 usecs") + assert.True(t, ok) + assert.Equal(t, "acpi_init+0x0/0x460", name) + assert.Equal(t, 39000, usec) + }) + + t.Run("non-initcall message", func(t *testing.T) { + _, _, ok := extractInitcall("erofs (device vda): mounted with root inode @ nid 60") + assert.False(t, ok) + }) + + t.Run("calling line is not a completion", func(t *testing.T) { + _, _, ok := extractInitcall("calling acpi_init+0x0/0x460 @ 1") + assert.False(t, ok) + }) +}