From 62fc02c15f47b49495733e7c0deb88118e5c0732 Mon Sep 17 00:00:00 2001 From: Shreyansh Sancheti Date: Wed, 22 Apr 2026 11:19:42 +0530 Subject: [PATCH 1/2] =?UTF-8?q?guest:=20unify=20pod=20model=20=E2=80=94=20?= =?UTF-8?q?replace=20VirtualPod=20with=20single=20pod=20abstraction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the separate VirtualPod tracking (dedicated type, 7 exported methods, parent cgroup manager, reverse-lookup map) with a unified uvmPod type and a single pods map on Host. All pod types (V1 sandbox, virtual pod, V2 shim) now go through the same code path: - createPodInUVM allocates a cgroup under /pods/{sandboxID} - addContainerToPod tracks container→pod membership - RemoveContainer handles cleanup uniformly Cgroup hierarchy changes from: /containers/{id} (V1 sandbox) /containers/virtual-pods/{virtualPodID} (virtual pod) to: /pods/{sandboxID} (all pod types) Workload containers nest under their pod: /pods/{sandboxID}/{containerID} Signed-off-by: Shreyansh Jain Signed-off-by: Shreyansh Sancheti --- cmd/gcs/main.go | 34 +- internal/guest/runtime/hcsv2/container.go | 3 +- .../runtime/hcsv2/container_stats_test.go | 13 - .../guest/runtime/hcsv2/sandbox_container.go | 10 +- .../runtime/hcsv2/standalone_container.go | 8 +- internal/guest/runtime/hcsv2/uvm.go | 352 +++++------------- .../guest/runtime/hcsv2/workload_container.go | 15 +- test/gcs/cgroup_test.go | 4 +- 8 files changed, 130 insertions(+), 309 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 68ab28d499..d71038a942 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -80,8 +80,8 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre count++ var msg string - if strings.HasPrefix(cgName, "/virtual-pods") { - msg = "memory usage for virtual pods cgroup exceeded threshold" + if strings.HasPrefix(cgName, "/pods") { + msg = "memory usage for pods cgroup exceeded threshold" } else { msg = "memory usage for cgroup exceeded threshold" } @@ -327,7 +327,7 @@ func main() { // Setup the UVM cgroups to protect against a workload taking all available // memory and causing the GCS to malfunction we create cgroups: gcs, - // containers, and virtual-pods for multi-pod support. + // containers, and pods for pod support. // // Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy @@ -359,17 +359,17 @@ func main() { } defer containersControl.Delete() //nolint:errcheck - // Create virtual-pods cgroup hierarchy for multi-pod support - // This will be the parent for all virtual pod cgroups: /containers/virtual-pods/{virtualSandboxID} - virtualPodsControl, err := cgroup.NewManager("/containers/virtual-pods", &oci.LinuxResources{ + // Create pods cgroup hierarchy for pod support + // This will be the parent for all pod cgroups: /pods/{sandboxID} + podsControl, err := cgroup.NewManager("/pods", &oci.LinuxResources{ Memory: &oci.LinuxMemory{ - Limit: &containersLimit, // Share the same limit as containers + Limit: &containersLimit, }, }) if err != nil { - logrus.WithError(err).Fatal("failed to create containers/virtual-pods cgroup") + logrus.WithError(err).Fatal("failed to create pods cgroup") } - defer virtualPodsControl.Delete() //nolint:errcheck + defer podsControl.Delete() //nolint:errcheck gcsControl, err := cgroup.NewManager("/gcs", &oci.LinuxResources{}) if err != nil { @@ -391,10 +391,6 @@ func main() { EnableV4: *v4, } h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter) - // Initialize virtual pod support in the host - if err := h.InitializeVirtualPodSupport(virtualPodsControl); err != nil { - logrus.WithError(err).Warn("Virtual pod support initialization failed") - } b.AssignHandlers(mux, h) var bridgeIn io.ReadCloser @@ -430,13 +426,13 @@ func main() { oomFile := os.NewFile(oom, "cefd") defer oomFile.Close() - // Setup OOM monitoring for virtual-pods cgroup - virtualPodsOom, err := virtualPodsControl.OOMEventFD() + // Setup OOM monitoring for pods cgroup + podsOom, err := podsControl.OOMEventFD() if err != nil { - logrus.WithError(err).Fatal("failed to retrieve the virtual-pods cgroups oom eventfd") + logrus.WithError(err).Fatal("failed to retrieve the pods cgroups oom eventfd") } - virtualPodsOomFile := os.NewFile(virtualPodsOom, "vp-oomfd") - defer virtualPodsOomFile.Close() + podsOomFile := os.NewFile(podsOom, "pods-oomfd") + defer podsOomFile.Close() // time synchronization service if !(*disableTimeSync) { @@ -447,7 +443,7 @@ func main() { go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl) go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl) - go readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, virtualPodsControl) + go readMemoryEvents(startTime, podsOomFile, "/pods", containersLimit, podsControl) err = b.ListenAndServe(bridgeIn, bridgeOut) if err != nil { logrus.WithFields(logrus.Fields{ diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 0ce930dd65..67d84549af 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -44,7 +44,8 @@ const ( ) type Container struct { - id string + id string + sandboxID string vsock transport.Transport logPath string // path to [logFile]. diff --git a/internal/guest/runtime/hcsv2/container_stats_test.go b/internal/guest/runtime/hcsv2/container_stats_test.go index c1986ca2b1..fa44a4451a 100644 --- a/internal/guest/runtime/hcsv2/container_stats_test.go +++ b/internal/guest/runtime/hcsv2/container_stats_test.go @@ -494,16 +494,3 @@ func TestConvertV2StatsToV1_NilInput(t *testing.T) { t.Error("ConvertV2StatsToV1(nil) should return empty metrics with all nil fields") } } - -func TestHost_InitializeVirtualPodSupport_ErrorCases(t *testing.T) { - host := &Host{} - - // Test with nil input - err := host.InitializeVirtualPodSupport(nil) - if err == nil { - t.Error("Expected error for nil input") - } - if err != nil && err.Error() != "no valid cgroup manager provided for virtual pod support" { - t.Errorf("Unexpected error message: %s", err.Error()) - } -} diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go index 12590f4dc0..37a68b1732 100644 --- a/internal/guest/runtime/hcsv2/sandbox_container.go +++ b/internal/guest/runtime/hcsv2/sandbox_container.go @@ -119,14 +119,8 @@ func setupSandboxContainerSpec(ctx context.Context, id, sandboxRoot string, spec // also has a concept of a sandbox/shm file when the IPC NamespaceMode != // NODE. - // Set cgroup path - check if this is a virtual pod - if virtualSandboxID != "" { - // Virtual pod sandbox gets its own cgroup under /containers/virtual-pods using the virtual pod ID - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID - } else { - // Traditional sandbox goes under /containers - spec.Linux.CgroupsPath = "/containers/" + id - } + // Set cgroup path + spec.Linux.CgroupsPath = "/pods/" + id // Clear the windows section as we dont want to forward to runc spec.Windows = nil diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go index 768adda344..742a3716cd 100644 --- a/internal/guest/runtime/hcsv2/standalone_container.go +++ b/internal/guest/runtime/hcsv2/standalone_container.go @@ -15,7 +15,6 @@ import ( "github.com/Microsoft/hcsshim/internal/guest/network" specGuest "github.com/Microsoft/hcsshim/internal/guest/spec" "github.com/Microsoft/hcsshim/internal/oc" - "github.com/Microsoft/hcsshim/pkg/annotations" ) func getStandaloneHostnamePath(rootDir string) string { @@ -119,12 +118,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id, rootDir string, spec } // Set cgroup path - virtualSandboxID := spec.Annotations[annotations.VirtualPodID] - if virtualSandboxID != "" { - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID + "/" + id - } else { - spec.Linux.CgroupsPath = "/containers/" + id - } + spec.Linux.CgroupsPath = "/pods/" + id // Clear the windows section as we dont want to forward to runc spec.Windows = nil diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index bbec0b7564..3f41bff5b0 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -73,15 +73,14 @@ func checkValidContainerID(id string, idType string) error { return errors.Errorf("invalid %s id: %s (must match %s)", idType, id, validContainerIDRegex.String()) } -// VirtualPod represents a virtual pod that shares a UVM/Sandbox with other pods -type VirtualPod struct { - VirtualSandboxID string - MasterSandboxID string - NetworkNamespace string - CgroupPath string - CgroupControl cgroup.Manager // Unified cgroup manager (v1 or v2) - Containers map[string]bool // containerID -> exists - CreatedAt time.Time +// uvmPod tracks pod-level state within the UVM. +type uvmPod struct { + sandboxID string + networkNamespace string + cgroupPath string + cgroupControl cgroup.Manager + containers map[string]bool + createdAt time.Time } // Host is the structure tracking all UVM host state including all containers @@ -93,16 +92,14 @@ type Host struct { externalProcessesMutex sync.Mutex externalProcesses map[int]*externalProcess - // Virtual pod support for multi-pod scenarios - virtualPodsMutex sync.Mutex - virtualPods map[string]*VirtualPod // virtualSandboxID -> VirtualPod - containerToVirtualPod map[string]string // containerID -> virtualSandboxID - virtualPodsCgroupManager cgroup.Manager // Parent cgroup for all virtual pods + // pods tracks all pod state. Guarded by podsMutex. + podsMutex sync.Mutex + pods map[string]*uvmPod // sandboxRoots maps sandboxID to the resolved sandbox root directory. // Populated via registerSandboxRoot during sandbox creation using // the host-provided OCIBundlePath as source of truth. - // Lock ordering: containersMutex -> sandboxRootsMutex (never reverse). + // Lock ordering: containersMutex -> podsMutex -> sandboxRootsMutex (never reverse). sandboxRootsMutex sync.RWMutex sandboxRoots map[string]string @@ -126,16 +123,15 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s logWriter, ) return &Host{ - containers: make(map[string]*Container), - externalProcesses: make(map[int]*externalProcess), - virtualPods: make(map[string]*VirtualPod), - containerToVirtualPod: make(map[string]string), - sandboxRoots: make(map[string]string), - rtime: rtime, - vsock: vsock, - devNullTransport: &transport.DevNullTransport{}, - hostMounts: newHostMounts(), - securityOptions: securityPolicyOptions, + containers: make(map[string]*Container), + externalProcesses: make(map[int]*externalProcess), + pods: make(map[string]*uvmPod), + sandboxRoots: make(map[string]string), + rtime: rtime, + vsock: vsock, + devNullTransport: &transport.DevNullTransport{}, + hostMounts: newHostMounts(), + securityOptions: securityPolicyOptions, } } @@ -214,26 +210,45 @@ func (h *Host) RemoveContainer(id string) { return } - // Check if this container is part of a virtual pod - virtualPodID, isVirtualPod := c.spec.Annotations[annotations.VirtualPodID] - if isVirtualPod { - // Remove from virtual pod tracking - h.RemoveContainerFromVirtualPod(id) - // Network namespace cleanup is handled in virtual pod cleanup when last container is removed. - logrus.WithFields(logrus.Fields{ - logfields.ContainerID: id, - logfields.VirtualSandboxID: virtualPodID, - }).Info("Container removed from virtual pod") - } else { - // delete the network namespace for standalone and sandbox containers - criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType] - if !isCRI || criType == "sandbox" { - _ = RemoveNetworkNamespace(context.Background(), id) + criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType] + + // Do NOT call RemoveNetworkNamespace for virtual pod sandbox containers. + // The host-driven teardown path (TearDownNetworking → RemoveNetNS → removeNIC) + // removes adapters first and then the namespace. Calling it here would fail + // with "contains adapters" because the host hasn't removed them yet. + virtualPodID := c.spec.Annotations[annotations.VirtualPodID] + isVirtualPodSandbox := virtualPodID != "" && id == virtualPodID + if !isVirtualPodSandbox && (!isCRI || criType == "sandbox") { + if err := RemoveNetworkNamespace(context.Background(), id); err != nil { + logrus.WithError(err).WithField(logfields.ContainerID, id).Warn("failed to remove network namespace") } } delete(h.containers, id) + // Extract pod cgroup manager under lock, delete cgroup outside lock to + // avoid holding podsMutex during filesystem I/O. + var cgToDelete cgroup.Manager + h.podsMutex.Lock() + if c.sandboxID != "" { + if pod, exists := h.pods[c.sandboxID]; exists { + delete(pod.containers, id) + if id == c.sandboxID { + cgToDelete = pod.cgroupControl + delete(h.pods, c.sandboxID) + } + } + } + h.podsMutex.Unlock() + + if cgToDelete != nil { + if err := cgToDelete.Delete(); err != nil { + logrus.WithFields(logrus.Fields{ + "sandboxID": c.sandboxID, + }).WithError(err).Warn("failed to delete pod cgroup") + } + } + // Clean up the sandbox root mapping for sandbox containers. if c.isSandbox { h.unregisterSandboxRoot(id) @@ -377,7 +392,8 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM criType, isCRI := settings.OCISpecification.Annotations[annotations.KubernetesContainerType] // Check for virtual pod annotation - virtualPodID, isVirtualPod := settings.OCISpecification.Annotations[annotations.VirtualPodID] + virtualPodID := settings.OCISpecification.Annotations[annotations.VirtualPodID] + isVirtualPod := virtualPodID != "" if h.HasSecurityPolicy() { if err = checkValidContainerID(id, "container"); err != nil { @@ -424,35 +440,18 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM } }() - // Handle virtual pod logic - if isVirtualPod && isCRI { - logrus.WithFields(logrus.Fields{ - logfields.ContainerID: id, - logfields.VirtualSandboxID: virtualPodID, - "criType": criType, - }).Info("Processing container for virtual pod") - - if criType == "sandbox" { - // This is a virtual pod sandbox - create the virtual pod if it doesn't exist - if _, exists := h.GetVirtualPod(virtualPodID); !exists { - // Use the network namespace ID from the current container spec - // Virtual pods share the same network namespace - networkNamespace := specGuest.GetNetworkNamespaceID(settings.OCISpecification) - if networkNamespace == "" { - networkNamespace = fmt.Sprintf("virtual-pod-%s", virtualPodID) - } - - if err := h.CreateVirtualPod(ctx, virtualPodID, virtualPodID, networkNamespace, settings.OCISpecification); err != nil { - return nil, errors.Wrapf(err, "failed to create virtual pod %s", virtualPodID) - } - } - } - - // Add this container to the virtual pod - if err := h.AddContainerToVirtualPod(id, virtualPodID); err != nil { - return nil, errors.Wrapf(err, "failed to add container %s to virtual pod %s", id, virtualPodID) + // Determine the sandboxID for this container. + sandboxID := id + if criType == "container" { + sid := settings.OCISpecification.Annotations[annotations.KubernetesSandboxID] + if sid == "" { + return nil, errors.Errorf("workload container missing sandbox ID annotation") } + sandboxID = sid + } else if virtualPodID != "" { + sandboxID = virtualPodID } + c.sandboxID = sandboxID // Normally we would be doing policy checking here at the start of our // "policy gated function". However, we can't for create container as we @@ -462,8 +461,6 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM // container id var namespaceID string - // for sandbox container sandboxID is same as container id - sandboxID := id if isCRI { switch criType { case "sandbox": @@ -503,9 +500,12 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM if err := securitypolicy.ExtendPolicyWithNetworkingMounts(id, h.securityOptions.PolicyEnforcer, settings.OCISpecification); err != nil { return nil, err } + + if err := h.createPodInUVM(sandboxID, settings.OCISpecification, namespaceID); err != nil { + return nil, err + } case "container": sid, ok := settings.OCISpecification.Annotations[annotations.KubernetesSandboxID] - sandboxID = sid if h.HasSecurityPolicy() { if err = checkValidContainerID(sid, "sandbox"); err != nil { return nil, err @@ -559,6 +559,11 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM } } + // Register container with its pod for CRI containers. + if isCRI { + h.addContainerToPod(sandboxID, id) + } + // don't specialize tee logs (both files and mounts) just for workload containers // add log directory mount before enforcing (mount) policy if logDirMount := settings.OCISpecification.Annotations[annotations.LCOWTeeLogDirMount]; logDirMount != "" { @@ -1489,194 +1494,43 @@ func isPrivilegedContainerCreationRequest(ctx context.Context, spec *specs.Spec) return oci.ParseAnnotationsBool(ctx, spec.Annotations, annotations.LCOWPrivileged, false) } -// Virtual Pod Management Methods - -// InitializeVirtualPodSupport sets up the parent cgroup for virtual pods -func (h *Host) InitializeVirtualPodSupport(mgr cgroup.Manager) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - if mgr == nil { - return errors.New("no valid cgroup manager provided for virtual pod support") - } - - h.virtualPodsCgroupManager = mgr - logrus.Info("Virtual pod support initialized") - return nil -} - -// CreateVirtualPod creates a new virtual pod with its own cgroup and network namespace -func (h *Host) CreateVirtualPod(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string, pSpec *specs.Spec) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - // Check if virtual pod already exists - if _, exists := h.virtualPods[virtualSandboxID]; exists { - return fmt.Errorf("virtual pod %s already exists", virtualSandboxID) - } - - // Extract resource limits if provided +// createPodInUVM allocates a cgroup for a pod and registers it in the host. +func (h *Host) createPodInUVM(sid string, pSpec *specs.Spec, nsID string) error { + cgroupPath := path.Join("/pods", sid) resources := &specs.LinuxResources{} if pSpec != nil && pSpec.Linux != nil && pSpec.Linux.Resources != nil { resources = pSpec.Linux.Resources - logrus.WithFields(logrus.Fields{ - logfields.VirtualSandboxID: virtualSandboxID, - }).Info("Creating virtual pod with specified resources") - } else { - logrus.WithField(logfields.VirtualSandboxID, virtualSandboxID).Info("Creating pod cgroup with default resources as none were specified") } - - if h.virtualPodsCgroupManager == nil { - return fmt.Errorf("no virtual pod cgroup manager available") - } - - cgroupPath := path.Join("/containers/virtual-pods", virtualSandboxID) cgroupControl, err := cgroup.NewManager(cgroupPath, resources) if err != nil { - return errors.Wrapf(err, "failed to create cgroup for virtual pod %s", virtualSandboxID) - } - logrus.WithField("cgroupPath", cgroupPath).Info("Created virtual pod cgroup") - - // Create virtual pod structure - virtualPod := &VirtualPod{ - VirtualSandboxID: virtualSandboxID, - MasterSandboxID: masterSandboxID, - NetworkNamespace: networkNamespace, - CgroupPath: cgroupPath, - CgroupControl: cgroupControl, - Containers: make(map[string]bool), - CreatedAt: time.Now(), - } - - h.virtualPods[virtualSandboxID] = virtualPod - - logrus.WithFields(logrus.Fields{ - logfields.VirtualSandboxID: virtualSandboxID, - "masterSandboxID": masterSandboxID, - "cgroupPath": cgroupPath, - "networkNamespace": networkNamespace, - }).Info("Virtual pod created successfully") - - return nil -} - -// CreateVirtualPodWithoutMemoryLimit creates a virtual pod without memory limits (backward compatibility) -func (h *Host) CreateVirtualPodWithoutMemoryLimit(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string) error { - return h.CreateVirtualPod(ctx, virtualSandboxID, masterSandboxID, networkNamespace, nil) -} - -// GetVirtualPod retrieves a virtual pod by its virtualSandboxID -func (h *Host) GetVirtualPod(virtualSandboxID string) (*VirtualPod, bool) { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - vp, exists := h.virtualPods[virtualSandboxID] - return vp, exists -} - -// AddContainerToVirtualPod associates a container with a virtual pod -func (h *Host) AddContainerToVirtualPod(containerID, virtualSandboxID string) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - // Check if virtual pod exists - vp, exists := h.virtualPods[virtualSandboxID] - if !exists { - return fmt.Errorf("virtual pod %s does not exist", virtualSandboxID) + return fmt.Errorf("failed to create cgroup for pod %s: %w", sid, err) + } + h.podsMutex.Lock() + defer h.podsMutex.Unlock() + if _, exists := h.pods[sid]; exists { + _ = cgroupControl.Delete() + return fmt.Errorf("pod %s already exists", sid) + } + h.pods[sid] = &uvmPod{ + sandboxID: sid, + networkNamespace: nsID, + cgroupPath: cgroupPath, + cgroupControl: cgroupControl, + containers: make(map[string]bool), + createdAt: time.Now(), } - - // Add container to virtual pod - vp.Containers[containerID] = true - h.containerToVirtualPod[containerID] = virtualSandboxID - logrus.WithFields(logrus.Fields{ - logfields.ContainerID: containerID, - logfields.VirtualSandboxID: virtualSandboxID, - }).Info("Container added to virtual pod") - + "sandboxID": sid, + "cgroupPath": cgroupPath, + }).Info("pod created in UVM") return nil } -// RemoveContainerFromVirtualPod removes a container from a virtual pod -func (h *Host) RemoveContainerFromVirtualPod(containerID string) { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - virtualSandboxID, exists := h.containerToVirtualPod[containerID] - if !exists { - return // Container not in any virtual pod - } - - ctx, entry := log.SetEntry(context.Background(), logrus.Fields{ - logfields.VirtualSandboxID: virtualSandboxID, - logfields.ContainerID: containerID, - }) - - // Remove from virtual pod - if vp, vpExists := h.virtualPods[virtualSandboxID]; vpExists { - delete(vp.Containers, containerID) - - // If this is the sandbox container, clean up the network namespace adapters first, - // then remove the namespace. - - // Do NOT call [RemoveNetworkNamespace] here. - // When cleaning up the virtual pod's sandbox container on the host, - // [UtilityVM.TearDownNetworking] calls [UtilityVM.RemoveNetNS], - // which, via [UtilityVM.removeNIC], removes the NICs from the uVM and - // sends [guestresource.ResourceTypeNetwork] RPCs that ultimately cal [modifyNetwork] - // to remove adapters from guest tracking. - // However, that happens AFTER this function runs. - // Calling [RemoveNetworkNamespace] here would - // fail with "contains adapters" since the host hasn't removed them yet. - // - // The host-driven path handles cleanup in the correct order. - if containerID == virtualSandboxID && vp.NetworkNamespace != "" { - entry.WithField(logfields.NamespaceID, vp.NetworkNamespace).Debug("skipping virtual pod network namespace removal") - } - - // If this was the last container, cleanup the virtual pod - if len(vp.Containers) == 0 { - h.cleanupVirtualPod(ctx, virtualSandboxID) - } - } - - delete(h.containerToVirtualPod, containerID) - - entry.Info("Container removed from virtual pod") -} - -func (h *Host) cleanupVirtualPod(ctx context.Context, virtualSandboxID string) { - // logfields set on [Host.RemoveContainerFromVirtualPod] - entry := log.G(ctx) - - vp, exists := h.virtualPods[virtualSandboxID] - if !exists { - entry.Warn("attempted to cleanup non-existent virtual sandbox pod") - return // virtual pod does not exist - } - - // Delete the cgroup - if vp.CgroupControl != nil { - if err := vp.CgroupControl.Delete(); err != nil { - entry.WithError(err).Warn("failed to delete virtual pod cgroup") - } - } - - // NOTE: Do NOT call [RemoveNetworkNamespace] here. - // See comment in [Host.RemoveContainerFromVirtualPod] for more info. - - // Clean up the virtual pod root directory which contains hostname, hosts, - // resolv.conf, and logs/. These are NOT cleaned by Container.Delete() because - // they live under /run/gcs/c/virtual-pods// which is separate from the - // container's ociBundlePath (/run/gcs/c//). - if vpRootDir := specGuest.VirtualPodRootDir(virtualSandboxID); vpRootDir != "" { - if err := os.RemoveAll(vpRootDir); err != nil { - entry.WithField("path", vpRootDir).WithError(err).Warn("Failed to remove virtual pod root directory") - } else { - entry.WithField("path", vpRootDir).Debug("Removed virtual pod root directory") - } +// addContainerToPod registers a container as belonging to a pod. +func (h *Host) addContainerToPod(sandboxID, containerID string) { + h.podsMutex.Lock() + defer h.podsMutex.Unlock() + if pod, exists := h.pods[sandboxID]; exists { + pod.containers[containerID] = true } - - delete(h.virtualPods, virtualSandboxID) - entry.Info("Virtual pod cleaned up") } diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go index d118a4d35c..633d8b2498 100644 --- a/internal/guest/runtime/hcsv2/workload_container.go +++ b/internal/guest/runtime/hcsv2/workload_container.go @@ -224,17 +224,12 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id, sandboxRoot strin } } - // Check if this is a virtual pod container - virtualPodID := spec.Annotations[annotations.VirtualPodID] - - // Set cgroup path - check if this is a virtual pod container - if virtualPodID != "" { - // Virtual pod containers go under /containers/virtual-pods/virtualPodID/containerID - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualPodID + "/" + id - } else { - // Regular containers go under /containers - spec.Linux.CgroupsPath = "/containers/" + id + // Set cgroup path under the sandbox's pod cgroup. + sandboxID := spec.Annotations[annotations.KubernetesSandboxID] + if sandboxID == "" { + sandboxID = id } + spec.Linux.CgroupsPath = "/pods/" + sandboxID + "/" + id if spec.Windows != nil { // we only support Nvidia gpus right now diff --git a/test/gcs/cgroup_test.go b/test/gcs/cgroup_test.go index 72b84808bf..71ee644a01 100644 --- a/test/gcs/cgroup_test.go +++ b/test/gcs/cgroup_test.go @@ -435,8 +435,8 @@ func TestOOMEventFD_ContainerKill(t *testing.T) { // --- Set up OOMEventFD on the container's cgroup --- // - // Standalone containers get cgroup path "/containers/". - cgroupPath := "/containers/" + id + // Standalone containers get cgroup path "/pods/". + cgroupPath := "/pods/" + id // LoadManager works for both v1 and v2 — the cgroup already exists (created by runc). mgr, err := cgroup.LoadManager(cgroupPath) From ca79d20fa8a3e231ac94594ef64f43a21d870b1a Mon Sep 17 00:00:00 2001 From: Shreyansh Sancheti Date: Wed, 22 Apr 2026 12:43:30 +0530 Subject: [PATCH 2/2] guest/spec: remove VirtualPod path helpers and dead code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove 13 VirtualPod-specific path functions from spec.go that became dead code after the pod unification in the parent commit. All callers now use the *FromRoot variants introduced in #2653. Also removes SandboxLogsDir and SandboxLogPath — both were only reachable through VirtualPodAwareSandboxRootDir which is now gone. Updates ExtendPolicyWithNetworkingMounts in pkg/securitypolicy to accept a sandboxRoot string and use GenerateWorkloadContainerNetworkMountsFromRoot. Signed-off-by: Shreyansh Jain Signed-off-by: Shreyansh Sancheti --- .../guest/runtime/hcsv2/sandbox_root_test.go | 32 ++-- internal/guest/runtime/hcsv2/uvm.go | 6 +- internal/guest/spec/spec.go | 142 ------------------ pkg/securitypolicy/securitypolicy_linux.go | 8 +- 4 files changed, 21 insertions(+), 167 deletions(-) diff --git a/internal/guest/runtime/hcsv2/sandbox_root_test.go b/internal/guest/runtime/hcsv2/sandbox_root_test.go index 0c22321d74..b60ae3bff5 100644 --- a/internal/guest/runtime/hcsv2/sandbox_root_test.go +++ b/internal/guest/runtime/hcsv2/sandbox_root_test.go @@ -102,15 +102,15 @@ func TestVirtualPodRootFromOCIBundlePath(t *testing.T) { func TestVirtualPodRootMatchesLegacy(t *testing.T) { // When OCIBundlePath uses the legacy prefix, the derived virtual pod root - // must match what VirtualPodRootDir() would have produced. + // must match the expected path under the legacy prefix. ociBundlePath := "/run/gcs/c/container-id" virtualPodID := "vpod-abc" derived := filepath.Join(filepath.Dir(ociBundlePath), "virtual-pods", virtualPodID) - legacy := specGuest.VirtualPodRootDir(virtualPodID) + expected := "/run/gcs/c/virtual-pods/vpod-abc" - if derived != legacy { - t.Fatalf("derived %q != legacy %q — backwards compatibility broken", derived, legacy) + if derived != expected { + t.Fatalf("derived %q != expected %q — backwards compatibility broken", derived, expected) } } @@ -127,8 +127,8 @@ func TestSubdirectoryPaths(t *testing.T) { if specGuest.SandboxHugePagesMountsDirFromRoot(sandboxRoot) != specGuest.HugePagesMountsDir("sandbox-xyz") { t.Fatal("SandboxHugePagesMountsDirFromRoot doesn't match legacy HugePagesMountsDir") } - if specGuest.SandboxLogsDirFromRoot(sandboxRoot) != specGuest.SandboxLogsDir("sandbox-xyz", "") { - t.Fatal("SandboxLogsDirFromRoot doesn't match legacy SandboxLogsDir") + if specGuest.SandboxLogsDirFromRoot(sandboxRoot) != filepath.Join(specGuest.SandboxRootDir("sandbox-xyz"), "logs") { + t.Fatal("SandboxLogsDirFromRoot doesn't match expected logs directory") } } @@ -171,12 +171,12 @@ func TestOldVsNewPathParity(t *testing.T) { }, { name: "logs dir", - oldPath: specGuest.SandboxLogsDir(sandboxID, ""), + oldPath: filepath.Join(specGuest.SandboxRootDir(sandboxID), "logs"), newPath: filepath.Join(sandboxRoot, "logs"), }, { name: "log file path", - oldPath: specGuest.SandboxLogPath(sandboxID, "", "container.log"), + oldPath: filepath.Join(specGuest.SandboxRootDir(sandboxID), "logs", "container.log"), newPath: filepath.Join(sandboxRoot, "logs", "container.log"), }, { @@ -213,42 +213,42 @@ func TestOldVsNewPathParity(t *testing.T) { vpodCases := []pathCase{ { name: "virtual pod root", - oldPath: specGuest.VirtualPodRootDir(vpodID), + oldPath: filepath.Join("/run/gcs/c", "virtual-pods", vpodID), newPath: vpodSandboxRoot, }, { name: "virtual pod sandboxMounts", - oldPath: specGuest.VirtualPodMountsDir(vpodID), + oldPath: filepath.Join("/run/gcs/c", "virtual-pods", vpodID, "sandboxMounts"), newPath: filepath.Join(vpodSandboxRoot, "sandboxMounts"), }, { name: "virtual pod tmpfs mounts", - oldPath: specGuest.VirtualPodTmpfsMountsDir(vpodID), + oldPath: filepath.Join("/run/gcs/c", "virtual-pods", vpodID, "sandboxTmpfsMounts"), newPath: filepath.Join(vpodSandboxRoot, "sandboxTmpfsMounts"), }, { name: "virtual pod hugepages", - oldPath: specGuest.VirtualPodHugePagesMountsDir(vpodID), + oldPath: filepath.Join("/run/gcs/c", "virtual-pods", vpodID, "hugepages"), newPath: filepath.Join(vpodSandboxRoot, "hugepages"), }, { name: "virtual pod logs", - oldPath: specGuest.SandboxLogsDir(sandboxID, vpodID), + oldPath: filepath.Join("/run/gcs/c", "virtual-pods", vpodID, "logs"), newPath: filepath.Join(vpodSandboxRoot, "logs"), }, { name: "virtual pod hostname", - oldPath: filepath.Join(specGuest.VirtualPodRootDir(vpodID), "hostname"), + oldPath: filepath.Join("/run/gcs/c", "virtual-pods", vpodID, "hostname"), newPath: filepath.Join(vpodSandboxRoot, "hostname"), }, { name: "virtual pod hosts", - oldPath: filepath.Join(specGuest.VirtualPodRootDir(vpodID), "hosts"), + oldPath: filepath.Join("/run/gcs/c", "virtual-pods", vpodID, "hosts"), newPath: filepath.Join(vpodSandboxRoot, "hosts"), }, { name: "virtual pod resolv.conf", - oldPath: filepath.Join(specGuest.VirtualPodRootDir(vpodID), "resolv.conf"), + oldPath: filepath.Join("/run/gcs/c", "virtual-pods", vpodID, "resolv.conf"), newPath: filepath.Join(vpodSandboxRoot, "resolv.conf"), }, } diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 3f41bff5b0..846ac60eab 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -497,7 +497,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM return nil, err } - if err := securitypolicy.ExtendPolicyWithNetworkingMounts(id, h.securityOptions.PolicyEnforcer, settings.OCISpecification); err != nil { + if err := securitypolicy.ExtendPolicyWithNetworkingMounts(sandboxRoot, h.securityOptions.PolicyEnforcer, settings.OCISpecification); err != nil { return nil, err } @@ -534,7 +534,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM _ = os.RemoveAll(settings.OCIBundlePath) } }() - if err := securitypolicy.ExtendPolicyWithNetworkingMounts(sandboxID, h.securityOptions.PolicyEnforcer, settings.OCISpecification); err != nil { + if err := securitypolicy.ExtendPolicyWithNetworkingMounts(sandboxRoot, h.securityOptions.PolicyEnforcer, settings.OCISpecification); err != nil { return nil, err } default: @@ -553,7 +553,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM _ = os.RemoveAll(settings.OCIBundlePath) } }() - if err := securitypolicy.ExtendPolicyWithNetworkingMounts(id, h.securityOptions.PolicyEnforcer, + if err := securitypolicy.ExtendPolicyWithNetworkingMounts(c.sandboxRoot, h.securityOptions.PolicyEnforcer, settings.OCISpecification); err != nil { return nil, err } diff --git a/internal/guest/spec/spec.go b/internal/guest/spec/spec.go index b4b9bebf0f..abab02ef59 100644 --- a/internal/guest/spec/spec.go +++ b/internal/guest/spec/spec.go @@ -37,45 +37,6 @@ func networkingMountPaths() []string { } } -// GenerateWorkloadContainerNetworkMounts generates an array of specs.Mount -// required for container networking. Original spec is left untouched and -// it's the responsibility of a caller to update it. -func GenerateWorkloadContainerNetworkMounts(sandboxID string, spec *oci.Spec) []oci.Mount { - var nMounts []oci.Mount - - // In multipod mode, the sandbox writes networking files (resolv.conf, hostname, hosts) - // under the virtual pod root directory. Use VirtualPodAwareSandboxRootDir to ensure - // workload containers mount from the correct path. - virtualSandboxID := spec.Annotations[annotations.VirtualPodID] - rootDir := VirtualPodAwareSandboxRootDir(sandboxID, virtualSandboxID) - - logrus.WithFields(logrus.Fields{ - logfields.SandboxID: sandboxID, - logfields.VirtualSandboxID: virtualSandboxID, - "rootDir": rootDir, - }).Info("GenerateWorkloadContainerNetworkMounts: resolved mount source root directory") - - for _, mountPath := range networkingMountPaths() { - // Don't override if the mount is present in the spec - if MountPresent(mountPath, spec.Mounts) { - continue - } - options := []string{"bind"} - if spec.Root != nil && spec.Root.Readonly { - options = append(options, "ro") - } - trimmedMountPath := strings.TrimPrefix(mountPath, "/etc/") - mt := oci.Mount{ - Destination: mountPath, - Type: "bind", - Source: filepath.Join(rootDir, trimmedMountPath), - Options: options, - } - nMounts = append(nMounts, mt) - } - return nMounts -} - // MountPresent checks if mountPath is present in the specMounts array. func MountPresent(mountPath string, specMounts []oci.Mount) bool { for _, m := range specMounts { @@ -91,80 +52,21 @@ func SandboxRootDir(sandboxID string) string { return filepath.Join(guestpath.LCOWRootPrefixInUVM, sandboxID) } -// VirtualPodRootDir returns the virtual pod root directory inside UVM/host. -// This is used when multiple pods share a UVM via virtualSandboxID. -func VirtualPodRootDir(virtualSandboxID string) string { - // Ensure virtualSandboxID is a relative path to prevent directory traversal - sanitizedID := filepath.Clean(virtualSandboxID) - if filepath.IsAbs(sanitizedID) || strings.Contains(sanitizedID, "..") { - return "" - } - return filepath.Join(guestpath.LCOWRootPrefixInUVM, "virtual-pods", sanitizedID) -} - -// VirtualPodAwareSandboxRootDir returns the appropriate root directory based on whether -// the sandbox is part of a virtual pod or traditional single-pod setup. -func VirtualPodAwareSandboxRootDir(sandboxID, virtualSandboxID string) string { - if virtualSandboxID != "" { - return VirtualPodRootDir(virtualSandboxID) - } - return SandboxRootDir(sandboxID) -} - // SandboxMountsDir returns sandbox mounts directory inside UVM/host. func SandboxMountsDir(sandboxID string) string { return filepath.Join(SandboxRootDir(sandboxID), "sandboxMounts") } -// VirtualPodMountsDir returns virtual pod mounts directory inside UVM/host. -func VirtualPodMountsDir(virtualSandboxID string) string { - return filepath.Join(VirtualPodRootDir(virtualSandboxID), "sandboxMounts") -} - -// VirtualPodAwareSandboxMountsDir returns the appropriate mounts directory. -func VirtualPodAwareSandboxMountsDir(sandboxID, virtualSandboxID string) string { - if virtualSandboxID != "" { - return VirtualPodMountsDir(virtualSandboxID) - } - return SandboxMountsDir(sandboxID) -} - // SandboxTmpfsMountsDir returns sandbox tmpfs mounts directory inside UVM. func SandboxTmpfsMountsDir(sandboxID string) string { return filepath.Join(SandboxRootDir(sandboxID), "sandboxTmpfsMounts") } -// VirtualPodTmpfsMountsDir returns virtual pod tmpfs mounts directory inside UVM/host. -func VirtualPodTmpfsMountsDir(virtualSandboxID string) string { - return filepath.Join(VirtualPodRootDir(virtualSandboxID), "sandboxTmpfsMounts") -} - -// VirtualPodAwareSandboxTmpfsMountsDir returns the appropriate tmpfs mounts directory. -func VirtualPodAwareSandboxTmpfsMountsDir(sandboxID, virtualSandboxID string) string { - if virtualSandboxID != "" { - return VirtualPodTmpfsMountsDir(virtualSandboxID) - } - return SandboxTmpfsMountsDir(sandboxID) -} - // HugePagesMountsDir returns hugepages mounts directory inside UVM. func HugePagesMountsDir(sandboxID string) string { return filepath.Join(SandboxRootDir(sandboxID), "hugepages") } -// VirtualPodHugePagesMountsDir returns virtual pod hugepages mounts directory. -func VirtualPodHugePagesMountsDir(virtualSandboxID string) string { - return filepath.Join(VirtualPodRootDir(virtualSandboxID), "hugepages") -} - -// VirtualPodAwareHugePagesMountsDir returns the appropriate hugepages directory. -func VirtualPodAwareHugePagesMountsDir(sandboxID, virtualSandboxID string) string { - if virtualSandboxID != "" { - return VirtualPodHugePagesMountsDir(virtualSandboxID) - } - return HugePagesMountsDir(sandboxID) -} - // SandboxMountSource returns sandbox mount path inside UVM. func SandboxMountSource(sandboxID, path string) string { mountsDir := SandboxMountsDir(sandboxID) @@ -172,16 +74,6 @@ func SandboxMountSource(sandboxID, path string) string { return filepath.Join(mountsDir, subPath) } -// VirtualPodAwareSandboxMountSource returns mount source path for virtual pod aware containers. -func VirtualPodAwareSandboxMountSource(sandboxID, virtualSandboxID, path string) string { - if virtualSandboxID != "" { - mountsDir := VirtualPodMountsDir(virtualSandboxID) - subPath := strings.TrimPrefix(path, guestpath.SandboxMountPrefix) - return filepath.Join(mountsDir, subPath) - } - return SandboxMountSource(sandboxID, path) -} - // SandboxTmpfsMountSource returns sandbox tmpfs mount path inside UVM. func SandboxTmpfsMountSource(sandboxID, path string) string { tmpfsMountDir := SandboxTmpfsMountsDir(sandboxID) @@ -189,16 +81,6 @@ func SandboxTmpfsMountSource(sandboxID, path string) string { return filepath.Join(tmpfsMountDir, subPath) } -// VirtualPodAwareSandboxTmpfsMountSource returns tmpfs mount source path for virtual pod aware containers. -func VirtualPodAwareSandboxTmpfsMountSource(sandboxID, virtualSandboxID, path string) string { - if virtualSandboxID != "" { - mountsDir := VirtualPodTmpfsMountsDir(virtualSandboxID) - subPath := strings.TrimPrefix(path, guestpath.SandboxTmpfsMountPrefix) - return filepath.Join(mountsDir, subPath) - } - return SandboxTmpfsMountSource(sandboxID, path) -} - // HugePagesMountSource returns hugepages mount path inside UVM. func HugePagesMountSource(sandboxID, path string) string { mountsDir := HugePagesMountsDir(sandboxID) @@ -206,30 +88,6 @@ func HugePagesMountSource(sandboxID, path string) string { return filepath.Join(mountsDir, subPath) } -// VirtualPodAwareHugePagesMountSource returns hugepages mount source for virtual pod aware containers. -func VirtualPodAwareHugePagesMountSource(sandboxID, virtualSandboxID, path string) string { - if virtualSandboxID != "" { - mountsDir := VirtualPodHugePagesMountsDir(virtualSandboxID) - subPath := strings.TrimPrefix(path, guestpath.HugePagesMountPrefix) - return filepath.Join(mountsDir, subPath) - } - return HugePagesMountSource(sandboxID, path) -} - -// SandboxLogsDir returns the logs directory inside the UVM for forwarding container stdio to. -// -// Virtual pod aware. -func SandboxLogsDir(sandboxID, virtualSandboxID string) string { - return filepath.Join(VirtualPodAwareSandboxRootDir(sandboxID, virtualSandboxID), "logs") -} - -// SandboxLogPath returns the log path inside the UVM. -// -// Virtual pod aware. -func SandboxLogPath(sandboxID, virtualSandboxID, path string) string { - return filepath.Join(SandboxLogsDir(sandboxID, virtualSandboxID), path) -} - // Root-based path helpers. These accept a pre-resolved sandbox root directory // and are used by the guest-side sandbox root mapping for Shim v2 / multi-pod support. diff --git a/pkg/securitypolicy/securitypolicy_linux.go b/pkg/securitypolicy/securitypolicy_linux.go index 6223522ebe..af85665aef 100644 --- a/pkg/securitypolicy/securitypolicy_linux.go +++ b/pkg/securitypolicy/securitypolicy_linux.go @@ -18,12 +18,8 @@ import ( //nolint:unused const osType = "linux" -func ExtendPolicyWithNetworkingMounts(sandboxID string, enforcer SecurityPolicyEnforcer, spec *oci.Spec) error { - roSpec := &oci.Spec{ - Root: spec.Root, - Annotations: spec.Annotations, - } - networkingMounts := specInternal.GenerateWorkloadContainerNetworkMounts(sandboxID, roSpec) +func ExtendPolicyWithNetworkingMounts(sandboxRoot string, enforcer SecurityPolicyEnforcer, spec *oci.Spec) error { + networkingMounts := specInternal.GenerateWorkloadContainerNetworkMountsFromRoot(sandboxRoot, spec) if err := enforcer.ExtendDefaultMounts(networkingMounts); err != nil { return err }