From 62fc02c15f47b49495733e7c0deb88118e5c0732 Mon Sep 17 00:00:00 2001 From: Shreyansh Sancheti Date: Wed, 22 Apr 2026 11:19:42 +0530 Subject: [PATCH] =?UTF-8?q?guest:=20unify=20pod=20model=20=E2=80=94=20repl?= =?UTF-8?q?ace=20VirtualPod=20with=20single=20pod=20abstraction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the separate VirtualPod tracking (dedicated type, 7 exported methods, parent cgroup manager, reverse-lookup map) with a unified uvmPod type and a single pods map on Host. All pod types (V1 sandbox, virtual pod, V2 shim) now go through the same code path: - createPodInUVM allocates a cgroup under /pods/{sandboxID} - addContainerToPod tracks container→pod membership - RemoveContainer handles cleanup uniformly Cgroup hierarchy changes from: /containers/{id} (V1 sandbox) /containers/virtual-pods/{virtualPodID} (virtual pod) to: /pods/{sandboxID} (all pod types) Workload containers nest under their pod: /pods/{sandboxID}/{containerID} Signed-off-by: Shreyansh Jain Signed-off-by: Shreyansh Sancheti --- cmd/gcs/main.go | 34 +- internal/guest/runtime/hcsv2/container.go | 3 +- .../runtime/hcsv2/container_stats_test.go | 13 - .../guest/runtime/hcsv2/sandbox_container.go | 10 +- .../runtime/hcsv2/standalone_container.go | 8 +- internal/guest/runtime/hcsv2/uvm.go | 352 +++++------------- .../guest/runtime/hcsv2/workload_container.go | 15 +- test/gcs/cgroup_test.go | 4 +- 8 files changed, 130 insertions(+), 309 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 68ab28d499..d71038a942 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -80,8 +80,8 @@ func readMemoryEvents(startTime time.Time, efdFile *os.File, cgName string, thre count++ var msg string - if strings.HasPrefix(cgName, "/virtual-pods") { - msg = "memory usage for virtual pods cgroup exceeded threshold" + if strings.HasPrefix(cgName, "/pods") { + msg = "memory usage for pods cgroup exceeded threshold" } else { msg = "memory usage for cgroup exceeded threshold" } @@ -327,7 +327,7 @@ func main() { // Setup the UVM cgroups to protect against a workload taking all available // memory and causing the GCS to malfunction we create cgroups: gcs, - // containers, and virtual-pods for multi-pod support. + // containers, and pods for pod support. // // Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy @@ -359,17 +359,17 @@ func main() { } defer containersControl.Delete() //nolint:errcheck - // Create virtual-pods cgroup hierarchy for multi-pod support - // This will be the parent for all virtual pod cgroups: /containers/virtual-pods/{virtualSandboxID} - virtualPodsControl, err := cgroup.NewManager("/containers/virtual-pods", &oci.LinuxResources{ + // Create pods cgroup hierarchy for pod support + // This will be the parent for all pod cgroups: /pods/{sandboxID} + podsControl, err := cgroup.NewManager("/pods", &oci.LinuxResources{ Memory: &oci.LinuxMemory{ - Limit: &containersLimit, // Share the same limit as containers + Limit: &containersLimit, }, }) if err != nil { - logrus.WithError(err).Fatal("failed to create containers/virtual-pods cgroup") + logrus.WithError(err).Fatal("failed to create pods cgroup") } - defer virtualPodsControl.Delete() //nolint:errcheck + defer podsControl.Delete() //nolint:errcheck gcsControl, err := cgroup.NewManager("/gcs", &oci.LinuxResources{}) if err != nil { @@ -391,10 +391,6 @@ func main() { EnableV4: *v4, } h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter) - // Initialize virtual pod support in the host - if err := h.InitializeVirtualPodSupport(virtualPodsControl); err != nil { - logrus.WithError(err).Warn("Virtual pod support initialization failed") - } b.AssignHandlers(mux, h) var bridgeIn io.ReadCloser @@ -430,13 +426,13 @@ func main() { oomFile := os.NewFile(oom, "cefd") defer oomFile.Close() - // Setup OOM monitoring for virtual-pods cgroup - virtualPodsOom, err := virtualPodsControl.OOMEventFD() + // Setup OOM monitoring for pods cgroup + podsOom, err := podsControl.OOMEventFD() if err != nil { - logrus.WithError(err).Fatal("failed to retrieve the virtual-pods cgroups oom eventfd") + logrus.WithError(err).Fatal("failed to retrieve the pods cgroups oom eventfd") } - virtualPodsOomFile := os.NewFile(virtualPodsOom, "vp-oomfd") - defer virtualPodsOomFile.Close() + podsOomFile := os.NewFile(podsOom, "pods-oomfd") + defer podsOomFile.Close() // time synchronization service if !(*disableTimeSync) { @@ -447,7 +443,7 @@ func main() { go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl) go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl) - go readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, virtualPodsControl) + go readMemoryEvents(startTime, podsOomFile, "/pods", containersLimit, podsControl) err = b.ListenAndServe(bridgeIn, bridgeOut) if err != nil { logrus.WithFields(logrus.Fields{ diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 0ce930dd65..67d84549af 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -44,7 +44,8 @@ const ( ) type Container struct { - id string + id string + sandboxID string vsock transport.Transport logPath string // path to [logFile]. diff --git a/internal/guest/runtime/hcsv2/container_stats_test.go b/internal/guest/runtime/hcsv2/container_stats_test.go index c1986ca2b1..fa44a4451a 100644 --- a/internal/guest/runtime/hcsv2/container_stats_test.go +++ b/internal/guest/runtime/hcsv2/container_stats_test.go @@ -494,16 +494,3 @@ func TestConvertV2StatsToV1_NilInput(t *testing.T) { t.Error("ConvertV2StatsToV1(nil) should return empty metrics with all nil fields") } } - -func TestHost_InitializeVirtualPodSupport_ErrorCases(t *testing.T) { - host := &Host{} - - // Test with nil input - err := host.InitializeVirtualPodSupport(nil) - if err == nil { - t.Error("Expected error for nil input") - } - if err != nil && err.Error() != "no valid cgroup manager provided for virtual pod support" { - t.Errorf("Unexpected error message: %s", err.Error()) - } -} diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go index 12590f4dc0..37a68b1732 100644 --- a/internal/guest/runtime/hcsv2/sandbox_container.go +++ b/internal/guest/runtime/hcsv2/sandbox_container.go @@ -119,14 +119,8 @@ func setupSandboxContainerSpec(ctx context.Context, id, sandboxRoot string, spec // also has a concept of a sandbox/shm file when the IPC NamespaceMode != // NODE. - // Set cgroup path - check if this is a virtual pod - if virtualSandboxID != "" { - // Virtual pod sandbox gets its own cgroup under /containers/virtual-pods using the virtual pod ID - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID - } else { - // Traditional sandbox goes under /containers - spec.Linux.CgroupsPath = "/containers/" + id - } + // Set cgroup path + spec.Linux.CgroupsPath = "/pods/" + id // Clear the windows section as we dont want to forward to runc spec.Windows = nil diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go index 768adda344..742a3716cd 100644 --- a/internal/guest/runtime/hcsv2/standalone_container.go +++ b/internal/guest/runtime/hcsv2/standalone_container.go @@ -15,7 +15,6 @@ import ( "github.com/Microsoft/hcsshim/internal/guest/network" specGuest "github.com/Microsoft/hcsshim/internal/guest/spec" "github.com/Microsoft/hcsshim/internal/oc" - "github.com/Microsoft/hcsshim/pkg/annotations" ) func getStandaloneHostnamePath(rootDir string) string { @@ -119,12 +118,7 @@ func setupStandaloneContainerSpec(ctx context.Context, id, rootDir string, spec } // Set cgroup path - virtualSandboxID := spec.Annotations[annotations.VirtualPodID] - if virtualSandboxID != "" { - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID + "/" + id - } else { - spec.Linux.CgroupsPath = "/containers/" + id - } + spec.Linux.CgroupsPath = "/pods/" + id // Clear the windows section as we dont want to forward to runc spec.Windows = nil diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index bbec0b7564..3f41bff5b0 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -73,15 +73,14 @@ func checkValidContainerID(id string, idType string) error { return errors.Errorf("invalid %s id: %s (must match %s)", idType, id, validContainerIDRegex.String()) } -// VirtualPod represents a virtual pod that shares a UVM/Sandbox with other pods -type VirtualPod struct { - VirtualSandboxID string - MasterSandboxID string - NetworkNamespace string - CgroupPath string - CgroupControl cgroup.Manager // Unified cgroup manager (v1 or v2) - Containers map[string]bool // containerID -> exists - CreatedAt time.Time +// uvmPod tracks pod-level state within the UVM. +type uvmPod struct { + sandboxID string + networkNamespace string + cgroupPath string + cgroupControl cgroup.Manager + containers map[string]bool + createdAt time.Time } // Host is the structure tracking all UVM host state including all containers @@ -93,16 +92,14 @@ type Host struct { externalProcessesMutex sync.Mutex externalProcesses map[int]*externalProcess - // Virtual pod support for multi-pod scenarios - virtualPodsMutex sync.Mutex - virtualPods map[string]*VirtualPod // virtualSandboxID -> VirtualPod - containerToVirtualPod map[string]string // containerID -> virtualSandboxID - virtualPodsCgroupManager cgroup.Manager // Parent cgroup for all virtual pods + // pods tracks all pod state. Guarded by podsMutex. + podsMutex sync.Mutex + pods map[string]*uvmPod // sandboxRoots maps sandboxID to the resolved sandbox root directory. // Populated via registerSandboxRoot during sandbox creation using // the host-provided OCIBundlePath as source of truth. - // Lock ordering: containersMutex -> sandboxRootsMutex (never reverse). + // Lock ordering: containersMutex -> podsMutex -> sandboxRootsMutex (never reverse). sandboxRootsMutex sync.RWMutex sandboxRoots map[string]string @@ -126,16 +123,15 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s logWriter, ) return &Host{ - containers: make(map[string]*Container), - externalProcesses: make(map[int]*externalProcess), - virtualPods: make(map[string]*VirtualPod), - containerToVirtualPod: make(map[string]string), - sandboxRoots: make(map[string]string), - rtime: rtime, - vsock: vsock, - devNullTransport: &transport.DevNullTransport{}, - hostMounts: newHostMounts(), - securityOptions: securityPolicyOptions, + containers: make(map[string]*Container), + externalProcesses: make(map[int]*externalProcess), + pods: make(map[string]*uvmPod), + sandboxRoots: make(map[string]string), + rtime: rtime, + vsock: vsock, + devNullTransport: &transport.DevNullTransport{}, + hostMounts: newHostMounts(), + securityOptions: securityPolicyOptions, } } @@ -214,26 +210,45 @@ func (h *Host) RemoveContainer(id string) { return } - // Check if this container is part of a virtual pod - virtualPodID, isVirtualPod := c.spec.Annotations[annotations.VirtualPodID] - if isVirtualPod { - // Remove from virtual pod tracking - h.RemoveContainerFromVirtualPod(id) - // Network namespace cleanup is handled in virtual pod cleanup when last container is removed. - logrus.WithFields(logrus.Fields{ - logfields.ContainerID: id, - logfields.VirtualSandboxID: virtualPodID, - }).Info("Container removed from virtual pod") - } else { - // delete the network namespace for standalone and sandbox containers - criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType] - if !isCRI || criType == "sandbox" { - _ = RemoveNetworkNamespace(context.Background(), id) + criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType] + + // Do NOT call RemoveNetworkNamespace for virtual pod sandbox containers. + // The host-driven teardown path (TearDownNetworking → RemoveNetNS → removeNIC) + // removes adapters first and then the namespace. Calling it here would fail + // with "contains adapters" because the host hasn't removed them yet. + virtualPodID := c.spec.Annotations[annotations.VirtualPodID] + isVirtualPodSandbox := virtualPodID != "" && id == virtualPodID + if !isVirtualPodSandbox && (!isCRI || criType == "sandbox") { + if err := RemoveNetworkNamespace(context.Background(), id); err != nil { + logrus.WithError(err).WithField(logfields.ContainerID, id).Warn("failed to remove network namespace") } } delete(h.containers, id) + // Extract pod cgroup manager under lock, delete cgroup outside lock to + // avoid holding podsMutex during filesystem I/O. + var cgToDelete cgroup.Manager + h.podsMutex.Lock() + if c.sandboxID != "" { + if pod, exists := h.pods[c.sandboxID]; exists { + delete(pod.containers, id) + if id == c.sandboxID { + cgToDelete = pod.cgroupControl + delete(h.pods, c.sandboxID) + } + } + } + h.podsMutex.Unlock() + + if cgToDelete != nil { + if err := cgToDelete.Delete(); err != nil { + logrus.WithFields(logrus.Fields{ + "sandboxID": c.sandboxID, + }).WithError(err).Warn("failed to delete pod cgroup") + } + } + // Clean up the sandbox root mapping for sandbox containers. if c.isSandbox { h.unregisterSandboxRoot(id) @@ -377,7 +392,8 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM criType, isCRI := settings.OCISpecification.Annotations[annotations.KubernetesContainerType] // Check for virtual pod annotation - virtualPodID, isVirtualPod := settings.OCISpecification.Annotations[annotations.VirtualPodID] + virtualPodID := settings.OCISpecification.Annotations[annotations.VirtualPodID] + isVirtualPod := virtualPodID != "" if h.HasSecurityPolicy() { if err = checkValidContainerID(id, "container"); err != nil { @@ -424,35 +440,18 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM } }() - // Handle virtual pod logic - if isVirtualPod && isCRI { - logrus.WithFields(logrus.Fields{ - logfields.ContainerID: id, - logfields.VirtualSandboxID: virtualPodID, - "criType": criType, - }).Info("Processing container for virtual pod") - - if criType == "sandbox" { - // This is a virtual pod sandbox - create the virtual pod if it doesn't exist - if _, exists := h.GetVirtualPod(virtualPodID); !exists { - // Use the network namespace ID from the current container spec - // Virtual pods share the same network namespace - networkNamespace := specGuest.GetNetworkNamespaceID(settings.OCISpecification) - if networkNamespace == "" { - networkNamespace = fmt.Sprintf("virtual-pod-%s", virtualPodID) - } - - if err := h.CreateVirtualPod(ctx, virtualPodID, virtualPodID, networkNamespace, settings.OCISpecification); err != nil { - return nil, errors.Wrapf(err, "failed to create virtual pod %s", virtualPodID) - } - } - } - - // Add this container to the virtual pod - if err := h.AddContainerToVirtualPod(id, virtualPodID); err != nil { - return nil, errors.Wrapf(err, "failed to add container %s to virtual pod %s", id, virtualPodID) + // Determine the sandboxID for this container. + sandboxID := id + if criType == "container" { + sid := settings.OCISpecification.Annotations[annotations.KubernetesSandboxID] + if sid == "" { + return nil, errors.Errorf("workload container missing sandbox ID annotation") } + sandboxID = sid + } else if virtualPodID != "" { + sandboxID = virtualPodID } + c.sandboxID = sandboxID // Normally we would be doing policy checking here at the start of our // "policy gated function". However, we can't for create container as we @@ -462,8 +461,6 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM // container id var namespaceID string - // for sandbox container sandboxID is same as container id - sandboxID := id if isCRI { switch criType { case "sandbox": @@ -503,9 +500,12 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM if err := securitypolicy.ExtendPolicyWithNetworkingMounts(id, h.securityOptions.PolicyEnforcer, settings.OCISpecification); err != nil { return nil, err } + + if err := h.createPodInUVM(sandboxID, settings.OCISpecification, namespaceID); err != nil { + return nil, err + } case "container": sid, ok := settings.OCISpecification.Annotations[annotations.KubernetesSandboxID] - sandboxID = sid if h.HasSecurityPolicy() { if err = checkValidContainerID(sid, "sandbox"); err != nil { return nil, err @@ -559,6 +559,11 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM } } + // Register container with its pod for CRI containers. + if isCRI { + h.addContainerToPod(sandboxID, id) + } + // don't specialize tee logs (both files and mounts) just for workload containers // add log directory mount before enforcing (mount) policy if logDirMount := settings.OCISpecification.Annotations[annotations.LCOWTeeLogDirMount]; logDirMount != "" { @@ -1489,194 +1494,43 @@ func isPrivilegedContainerCreationRequest(ctx context.Context, spec *specs.Spec) return oci.ParseAnnotationsBool(ctx, spec.Annotations, annotations.LCOWPrivileged, false) } -// Virtual Pod Management Methods - -// InitializeVirtualPodSupport sets up the parent cgroup for virtual pods -func (h *Host) InitializeVirtualPodSupport(mgr cgroup.Manager) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - if mgr == nil { - return errors.New("no valid cgroup manager provided for virtual pod support") - } - - h.virtualPodsCgroupManager = mgr - logrus.Info("Virtual pod support initialized") - return nil -} - -// CreateVirtualPod creates a new virtual pod with its own cgroup and network namespace -func (h *Host) CreateVirtualPod(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string, pSpec *specs.Spec) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - // Check if virtual pod already exists - if _, exists := h.virtualPods[virtualSandboxID]; exists { - return fmt.Errorf("virtual pod %s already exists", virtualSandboxID) - } - - // Extract resource limits if provided +// createPodInUVM allocates a cgroup for a pod and registers it in the host. +func (h *Host) createPodInUVM(sid string, pSpec *specs.Spec, nsID string) error { + cgroupPath := path.Join("/pods", sid) resources := &specs.LinuxResources{} if pSpec != nil && pSpec.Linux != nil && pSpec.Linux.Resources != nil { resources = pSpec.Linux.Resources - logrus.WithFields(logrus.Fields{ - logfields.VirtualSandboxID: virtualSandboxID, - }).Info("Creating virtual pod with specified resources") - } else { - logrus.WithField(logfields.VirtualSandboxID, virtualSandboxID).Info("Creating pod cgroup with default resources as none were specified") } - - if h.virtualPodsCgroupManager == nil { - return fmt.Errorf("no virtual pod cgroup manager available") - } - - cgroupPath := path.Join("/containers/virtual-pods", virtualSandboxID) cgroupControl, err := cgroup.NewManager(cgroupPath, resources) if err != nil { - return errors.Wrapf(err, "failed to create cgroup for virtual pod %s", virtualSandboxID) - } - logrus.WithField("cgroupPath", cgroupPath).Info("Created virtual pod cgroup") - - // Create virtual pod structure - virtualPod := &VirtualPod{ - VirtualSandboxID: virtualSandboxID, - MasterSandboxID: masterSandboxID, - NetworkNamespace: networkNamespace, - CgroupPath: cgroupPath, - CgroupControl: cgroupControl, - Containers: make(map[string]bool), - CreatedAt: time.Now(), - } - - h.virtualPods[virtualSandboxID] = virtualPod - - logrus.WithFields(logrus.Fields{ - logfields.VirtualSandboxID: virtualSandboxID, - "masterSandboxID": masterSandboxID, - "cgroupPath": cgroupPath, - "networkNamespace": networkNamespace, - }).Info("Virtual pod created successfully") - - return nil -} - -// CreateVirtualPodWithoutMemoryLimit creates a virtual pod without memory limits (backward compatibility) -func (h *Host) CreateVirtualPodWithoutMemoryLimit(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string) error { - return h.CreateVirtualPod(ctx, virtualSandboxID, masterSandboxID, networkNamespace, nil) -} - -// GetVirtualPod retrieves a virtual pod by its virtualSandboxID -func (h *Host) GetVirtualPod(virtualSandboxID string) (*VirtualPod, bool) { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - vp, exists := h.virtualPods[virtualSandboxID] - return vp, exists -} - -// AddContainerToVirtualPod associates a container with a virtual pod -func (h *Host) AddContainerToVirtualPod(containerID, virtualSandboxID string) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - // Check if virtual pod exists - vp, exists := h.virtualPods[virtualSandboxID] - if !exists { - return fmt.Errorf("virtual pod %s does not exist", virtualSandboxID) + return fmt.Errorf("failed to create cgroup for pod %s: %w", sid, err) + } + h.podsMutex.Lock() + defer h.podsMutex.Unlock() + if _, exists := h.pods[sid]; exists { + _ = cgroupControl.Delete() + return fmt.Errorf("pod %s already exists", sid) + } + h.pods[sid] = &uvmPod{ + sandboxID: sid, + networkNamespace: nsID, + cgroupPath: cgroupPath, + cgroupControl: cgroupControl, + containers: make(map[string]bool), + createdAt: time.Now(), } - - // Add container to virtual pod - vp.Containers[containerID] = true - h.containerToVirtualPod[containerID] = virtualSandboxID - logrus.WithFields(logrus.Fields{ - logfields.ContainerID: containerID, - logfields.VirtualSandboxID: virtualSandboxID, - }).Info("Container added to virtual pod") - + "sandboxID": sid, + "cgroupPath": cgroupPath, + }).Info("pod created in UVM") return nil } -// RemoveContainerFromVirtualPod removes a container from a virtual pod -func (h *Host) RemoveContainerFromVirtualPod(containerID string) { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - virtualSandboxID, exists := h.containerToVirtualPod[containerID] - if !exists { - return // Container not in any virtual pod - } - - ctx, entry := log.SetEntry(context.Background(), logrus.Fields{ - logfields.VirtualSandboxID: virtualSandboxID, - logfields.ContainerID: containerID, - }) - - // Remove from virtual pod - if vp, vpExists := h.virtualPods[virtualSandboxID]; vpExists { - delete(vp.Containers, containerID) - - // If this is the sandbox container, clean up the network namespace adapters first, - // then remove the namespace. - - // Do NOT call [RemoveNetworkNamespace] here. - // When cleaning up the virtual pod's sandbox container on the host, - // [UtilityVM.TearDownNetworking] calls [UtilityVM.RemoveNetNS], - // which, via [UtilityVM.removeNIC], removes the NICs from the uVM and - // sends [guestresource.ResourceTypeNetwork] RPCs that ultimately cal [modifyNetwork] - // to remove adapters from guest tracking. - // However, that happens AFTER this function runs. - // Calling [RemoveNetworkNamespace] here would - // fail with "contains adapters" since the host hasn't removed them yet. - // - // The host-driven path handles cleanup in the correct order. - if containerID == virtualSandboxID && vp.NetworkNamespace != "" { - entry.WithField(logfields.NamespaceID, vp.NetworkNamespace).Debug("skipping virtual pod network namespace removal") - } - - // If this was the last container, cleanup the virtual pod - if len(vp.Containers) == 0 { - h.cleanupVirtualPod(ctx, virtualSandboxID) - } - } - - delete(h.containerToVirtualPod, containerID) - - entry.Info("Container removed from virtual pod") -} - -func (h *Host) cleanupVirtualPod(ctx context.Context, virtualSandboxID string) { - // logfields set on [Host.RemoveContainerFromVirtualPod] - entry := log.G(ctx) - - vp, exists := h.virtualPods[virtualSandboxID] - if !exists { - entry.Warn("attempted to cleanup non-existent virtual sandbox pod") - return // virtual pod does not exist - } - - // Delete the cgroup - if vp.CgroupControl != nil { - if err := vp.CgroupControl.Delete(); err != nil { - entry.WithError(err).Warn("failed to delete virtual pod cgroup") - } - } - - // NOTE: Do NOT call [RemoveNetworkNamespace] here. - // See comment in [Host.RemoveContainerFromVirtualPod] for more info. - - // Clean up the virtual pod root directory which contains hostname, hosts, - // resolv.conf, and logs/. These are NOT cleaned by Container.Delete() because - // they live under /run/gcs/c/virtual-pods// which is separate from the - // container's ociBundlePath (/run/gcs/c//). - if vpRootDir := specGuest.VirtualPodRootDir(virtualSandboxID); vpRootDir != "" { - if err := os.RemoveAll(vpRootDir); err != nil { - entry.WithField("path", vpRootDir).WithError(err).Warn("Failed to remove virtual pod root directory") - } else { - entry.WithField("path", vpRootDir).Debug("Removed virtual pod root directory") - } +// addContainerToPod registers a container as belonging to a pod. +func (h *Host) addContainerToPod(sandboxID, containerID string) { + h.podsMutex.Lock() + defer h.podsMutex.Unlock() + if pod, exists := h.pods[sandboxID]; exists { + pod.containers[containerID] = true } - - delete(h.virtualPods, virtualSandboxID) - entry.Info("Virtual pod cleaned up") } diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go index d118a4d35c..633d8b2498 100644 --- a/internal/guest/runtime/hcsv2/workload_container.go +++ b/internal/guest/runtime/hcsv2/workload_container.go @@ -224,17 +224,12 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id, sandboxRoot strin } } - // Check if this is a virtual pod container - virtualPodID := spec.Annotations[annotations.VirtualPodID] - - // Set cgroup path - check if this is a virtual pod container - if virtualPodID != "" { - // Virtual pod containers go under /containers/virtual-pods/virtualPodID/containerID - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualPodID + "/" + id - } else { - // Regular containers go under /containers - spec.Linux.CgroupsPath = "/containers/" + id + // Set cgroup path under the sandbox's pod cgroup. + sandboxID := spec.Annotations[annotations.KubernetesSandboxID] + if sandboxID == "" { + sandboxID = id } + spec.Linux.CgroupsPath = "/pods/" + sandboxID + "/" + id if spec.Windows != nil { // we only support Nvidia gpus right now diff --git a/test/gcs/cgroup_test.go b/test/gcs/cgroup_test.go index 72b84808bf..71ee644a01 100644 --- a/test/gcs/cgroup_test.go +++ b/test/gcs/cgroup_test.go @@ -435,8 +435,8 @@ func TestOOMEventFD_ContainerKill(t *testing.T) { // --- Set up OOMEventFD on the container's cgroup --- // - // Standalone containers get cgroup path "/containers/". - cgroupPath := "/containers/" + id + // Standalone containers get cgroup path "/pods/". + cgroupPath := "/pods/" + id // LoadManager works for both v1 and v2 — the cgroup already exists (created by runc). mgr, err := cgroup.LoadManager(cgroupPath)