Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions cmd/api/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,9 +131,17 @@ type ImagesAutoDeleteConfig struct {
Allowed []string `koanf:"allowed"`
}

// OCICacheGCConfig holds settings for the OCI blob cache garbage collector.
// Interval and MinBlobAge are stored as strings and parsed with
// time.ParseDuration by Config.Validate.
type OCICacheGCConfig struct {
// Enabled switches the collector on; defaultConfig sets it to false.
Enabled bool `koanf:"enabled"`
// Interval is a duration string (default "1h"). Validate requires it to
// parse and to be strictly positive.
Interval string `koanf:"interval"`
// MinBlobAge is a duration string (default "1h"). Validate requires it to
// parse and rejects negative values; zero is accepted.
MinBlobAge string `koanf:"min_blob_age"`
}

// ImagesConfig holds image-management settings.
type ImagesConfig struct {
// AutoDelete holds the image auto-delete settings (config key "auto_delete").
AutoDelete ImagesAutoDeleteConfig `koanf:"auto_delete"`
// OCICacheGC holds the OCI blob cache garbage collector settings
// (config key "oci_cache_gc").
OCICacheGC OCICacheGCConfig `koanf:"oci_cache_gc"`
}

// BuildConfig holds source-to-image build system settings.
Expand Down Expand Up @@ -346,6 +354,11 @@ func defaultConfig() *Config {
UnusedFor: "720h",
Allowed: []string{},
},
OCICacheGC: OCICacheGCConfig{
Enabled: false,
Interval: "1h",
MinBlobAge: "1h",
},
},

Build: BuildConfig{
Expand Down Expand Up @@ -563,6 +576,20 @@ func (c *Config) Validate() error {
for i, pattern := range c.Images.AutoDelete.Allowed {
c.Images.AutoDelete.Allowed[i] = strings.TrimSpace(pattern)
}
ociCacheGCInterval, err := time.ParseDuration(c.Images.OCICacheGC.Interval)
if err != nil {
return fmt.Errorf("images.oci_cache_gc.interval must be a valid duration, got %q: %w", c.Images.OCICacheGC.Interval, err)
}
if ociCacheGCInterval <= 0 {
return fmt.Errorf("images.oci_cache_gc.interval must be positive, got %q", c.Images.OCICacheGC.Interval)
}
ociCacheGCMinBlobAge, err := time.ParseDuration(c.Images.OCICacheGC.MinBlobAge)
if err != nil {
return fmt.Errorf("images.oci_cache_gc.min_blob_age must be a valid duration, got %q: %w", c.Images.OCICacheGC.MinBlobAge, err)
}
if ociCacheGCMinBlobAge < 0 {
return fmt.Errorf("images.oci_cache_gc.min_blob_age cannot be negative, got %q", c.Images.OCICacheGC.MinBlobAge)
}
algorithm := strings.ToLower(c.Snapshot.CompressionDefault.Algorithm)
c.Snapshot.CompressionDefault.Algorithm = algorithm
if c.Snapshot.CompressionDefault.Enabled {
Expand Down
60 changes: 60 additions & 0 deletions cmd/api/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,15 @@ func TestDefaultConfigIncludesMetricsSettings(t *testing.T) {
if len(cfg.Images.AutoDelete.Allowed) != 0 {
t.Fatalf("expected default images.auto_delete.allowed to be empty, got %v", cfg.Images.AutoDelete.Allowed)
}
if cfg.Images.OCICacheGC.Enabled {
t.Fatalf("expected default images.oci_cache_gc.enabled to be false")
}
if cfg.Images.OCICacheGC.Interval != "1h" {
t.Fatalf("expected default images.oci_cache_gc.interval to be 1h, got %q", cfg.Images.OCICacheGC.Interval)
}
if cfg.Images.OCICacheGC.MinBlobAge != "1h" {
t.Fatalf("expected default images.oci_cache_gc.min_blob_age to be 1h, got %q", cfg.Images.OCICacheGC.MinBlobAge)
}
if cfg.Instances.LifecycleEventBufferSize != 256 {
t.Fatalf("expected default instances.lifecycle_event_buffer_size to be 256, got %d", cfg.Instances.LifecycleEventBufferSize)
}
Expand Down Expand Up @@ -247,6 +256,57 @@ func TestValidateRejectsInvalidImageAutoDeleteUnusedFor(t *testing.T) {
}
}

// TestLoadUsesDefaultOCICacheGCSettingsWhenEnabledOnly checks that a config
// file which only sets images.oci_cache_gc.enabled keeps the default interval
// and min_blob_age values.
func TestLoadUsesDefaultOCICacheGCSettingsWhenEnabledOnly(t *testing.T) {
	// Nesting matters in YAML: "enabled" must be a child of "oci_cache_gc",
	// which must be a child of "images". With equal indentation the override
	// would land on images.enabled and never reach the GC settings.
	const doc = "images:\n  oci_cache_gc:\n    enabled: true\n"

	tmp := t.TempDir()
	cfgPath := filepath.Join(tmp, "config.yaml")
	if err := os.WriteFile(cfgPath, []byte(doc), 0o600); err != nil {
		t.Fatalf("write temp config: %v", err)
	}

	cfg, err := Load(cfgPath)
	if err != nil {
		t.Fatalf("load config: %v", err)
	}

	if !cfg.Images.OCICacheGC.Enabled {
		t.Fatalf("expected images.oci_cache_gc.enabled override to be true")
	}
	if cfg.Images.OCICacheGC.Interval != "1h" {
		t.Fatalf("expected default images.oci_cache_gc.interval to remain 1h, got %q", cfg.Images.OCICacheGC.Interval)
	}
	if cfg.Images.OCICacheGC.MinBlobAge != "1h" {
		t.Fatalf("expected default images.oci_cache_gc.min_blob_age to remain 1h, got %q", cfg.Images.OCICacheGC.MinBlobAge)
	}
}

// TestValidateRejectsInvalidOCICacheGCInterval verifies that Validate rejects
// images.oci_cache_gc.interval values that are unparseable, zero, or negative,
// and that the returned error names the offending setting rather than being
// some unrelated validation failure.
func TestValidateRejectsInvalidOCICacheGCInterval(t *testing.T) {
	cases := []struct {
		name     string
		interval string
		wantErr  string
	}{
		{
			name:     "unparseable",
			interval: "not-a-duration",
			wantErr:  "images.oci_cache_gc.interval must be a valid duration",
		},
		{
			name:     "zero",
			interval: "0s",
			wantErr:  "images.oci_cache_gc.interval must be positive",
		},
		{
			name:     "negative",
			interval: "-1s",
			wantErr:  "images.oci_cache_gc.interval must be positive",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// Start from a fresh default config so only the interval differs.
			cfg := defaultConfig()
			cfg.Images.OCICacheGC.Interval = tc.interval

			err := cfg.Validate()
			if err == nil || !strings.Contains(err.Error(), tc.wantErr) {
				t.Fatalf("expected error containing %q for images.oci_cache_gc.interval %q, got %v", tc.wantErr, tc.interval, err)
			}
		})
	}
}

// TestValidateRejectsNegativeOCICacheGCMinBlobAge ensures Validate refuses a
// negative images.oci_cache_gc.min_blob_age and says so in the error.
func TestValidateRejectsNegativeOCICacheGCMinBlobAge(t *testing.T) {
	subject := defaultConfig()
	subject.Images.OCICacheGC.MinBlobAge = "-1s"

	validationErr := subject.Validate()
	rejected := validationErr != nil && strings.Contains(validationErr.Error(), "cannot be negative")
	if !rejected {
		t.Fatalf("expected non-negative validation error for images.oci_cache_gc.min_blob_age, got %v", validationErr)
	}
}

func TestValidateTrimsImageAutoDeleteAllowedPatterns(t *testing.T) {
cfg := defaultConfig()
cfg.Images.AutoDelete.Allowed = []string{" docker.io/library/* ", " ", "ghcr.io/kernel/*"}
Expand Down
47 changes: 47 additions & 0 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
loglib "github.com/kernel/hypeman/lib/logger"
mw "github.com/kernel/hypeman/lib/middleware"
"github.com/kernel/hypeman/lib/oapi"
"github.com/kernel/hypeman/lib/ocicachegc"
"github.com/kernel/hypeman/lib/otel"
"github.com/kernel/hypeman/lib/paths"
"github.com/kernel/hypeman/lib/registry"
Expand Down Expand Up @@ -98,6 +99,37 @@ func startImageRetentionController(grp *errgroup.Group, ctx context.Context, con
return true
}

// ociCacheGCRunner is the single-method view of the OCI cache garbage
// collector that startOCICacheGC depends on. configureOCICacheGC returns the
// value built by ocicachegc.NewCollector through this interface.
type ociCacheGCRunner interface {
// Run executes the collector using ctx and reports any error.
// NOTE(review): presumably blocks until ctx is cancelled — confirm against
// the ocicachegc collector implementation.
Run(ctx context.Context) error
}

// configureOCICacheGC builds the OCI cache garbage collector described by
// cfg.Images.OCICacheGC.
//
// It returns (nil, nil) when cfg is nil or the collector is disabled, so the
// caller can hand the result straight to startOCICacheGC. When enabled, the
// interval and min_blob_age strings are parsed with time.ParseDuration and
// passed, together with the data-dir paths, logger and meter, to
// ocicachegc.NewCollector. Any parse or construction error is returned with a
// nil runner.
func configureOCICacheGC(cfg *config.Config, logger *slog.Logger, meter metric.Meter) (ociCacheGCRunner, error) {
	if cfg == nil || !cfg.Images.OCICacheGC.Enabled {
		return nil, nil
	}

	interval, err := time.ParseDuration(cfg.Images.OCICacheGC.Interval)
	if err != nil {
		return nil, fmt.Errorf("invalid images.oci_cache_gc.interval %q: %w", cfg.Images.OCICacheGC.Interval, err)
	}
	minBlobAge, err := time.ParseDuration(cfg.Images.OCICacheGC.MinBlobAge)
	if err != nil {
		return nil, fmt.Errorf("invalid images.oci_cache_gc.min_blob_age %q: %w", cfg.Images.OCICacheGC.MinBlobAge, err)
	}

	collector, err := ocicachegc.NewCollector(paths.New(cfg.DataDir), interval, minBlobAge, logger, meter)
	if err != nil {
		// Return a literal nil rather than forwarding the constructor's first
		// result: if that result is a nil pointer of a concrete type, wrapping
		// it in the interface would make `runner == nil` checks false.
		return nil, err
	}
	return collector, nil
}

// startOCICacheGC schedules runner.Run(ctx) on grp and reports whether it did
// so. Nothing is scheduled, and false is returned, when either grp or runner
// is nil.
func startOCICacheGC(grp *errgroup.Group, ctx context.Context, runner ociCacheGCRunner) bool {
	if grp != nil && runner != nil {
		runLoop := func() error {
			return runner.Run(ctx)
		}
		grp.Go(runLoop)
		return true
	}
	return false
}

func run() error {
// Load config early for OTel initialization
// Config path can be specified via CONFIG_PATH env var or defaults to platform-specific locations
Expand Down Expand Up @@ -491,6 +523,21 @@ func run() error {
logger.Info("image auto-delete enabled", "unused_for", app.Config.Images.AutoDelete.UnusedFor)
}

ociGC, err := configureOCICacheGC(
app.Config,
logger,
otelProvider.MeterFor(loglib.SubsystemImages),
)
if err != nil {
return err
}
if startOCICacheGC(grp, gctx, ociGC) {
logger.Info("oci cache gc enabled",
"interval", app.Config.Images.OCICacheGC.Interval,
"min_blob_age", app.Config.Images.OCICacheGC.MinBlobAge,
)
}

// Start build manager background services (vsock handler for builder VMs)
if err := app.BuildManager.Start(gctx); err != nil {
logger.Error("failed to start build manager", "error", err)
Expand Down
112 changes: 112 additions & 0 deletions cmd/api/oci_cache_gc_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
package main

import (
"context"
"io"
"log/slog"
"os"
"path/filepath"
"sync/atomic"
"testing"
"time"

"github.com/kernel/hypeman/cmd/api/config"
"golang.org/x/sync/errgroup"
)

// stubOCICacheGCRunner is a test double for ociCacheGCRunner that records how
// many times Run has been invoked.
type stubOCICacheGCRunner struct {
	// runCount is incremented once per Run call.
	runCount atomic.Int32
}

// Run counts the invocation, then waits for ctx to be done and returns nil.
func (s *stubOCICacheGCRunner) Run(ctx context.Context) error {
	s.runCount.Add(1)

	done := ctx.Done()
	<-done

	return nil
}

// loadTestConfig writes an empty YAML document ("{}") to a per-test temporary
// directory, loads it with config.Load, and returns the resulting config.
// Any failure aborts the calling test.
func loadTestConfig(t *testing.T) *config.Config {
	t.Helper()

	path := filepath.Join(t.TempDir(), "config.yaml")

	writeErr := os.WriteFile(path, []byte("{}\n"), 0o600)
	if writeErr != nil {
		t.Fatalf("write temp config: %v", writeErr)
	}

	loaded, loadErr := config.Load(path)
	if loadErr != nil {
		t.Fatalf("load temp config: %v", loadErr)
	}
	return loaded
}

func TestConfigureOCICacheGCSkipsDisabledConfig(t *testing.T) {
cfg := loadTestConfig(t)

runner, err := configureOCICacheGC(cfg, slog.New(slog.NewTextHandler(io.Discard, nil)), nil)
if err != nil {
t.Fatalf("configure disabled oci cache gc: %v", err)
}
if runner != nil {
t.Fatalf("expected disabled oci cache gc to return nil runner")
}
}

// TestConfigureOCICacheGCBuildsCollectorWhenEnabled checks that enabling the
// collector with parseable durations produces a non-nil runner and no error.
func TestConfigureOCICacheGCBuildsCollectorWhenEnabled(t *testing.T) {
	cfg := loadTestConfig(t)
	gc := &cfg.Images.OCICacheGC
	gc.Enabled = true
	gc.Interval = "2m"
	gc.MinBlobAge = "30s"

	quiet := slog.New(slog.NewTextHandler(io.Discard, nil))
	runner, err := configureOCICacheGC(cfg, quiet, nil)
	switch {
	case err != nil:
		t.Fatalf("configure enabled oci cache gc: %v", err)
	case runner == nil:
		t.Fatalf("expected enabled oci cache gc to return runner")
	}
}

// TestConfigureOCICacheGCRejectsInvalidInterval checks that an enabled
// collector configured with a "0s" interval makes configureOCICacheGC fail.
func TestConfigureOCICacheGCRejectsInvalidInterval(t *testing.T) {
	cfg := loadTestConfig(t)
	gc := &cfg.Images.OCICacheGC
	gc.Enabled = true
	gc.Interval = "0s"

	quiet := slog.New(slog.NewTextHandler(io.Discard, nil))
	_, err := configureOCICacheGC(cfg, quiet, nil)
	if err == nil {
		t.Fatalf("expected invalid oci cache gc interval to fail")
	}
}

// TestStartOCICacheGCSkipsNilRunner checks that startOCICacheGC reports false
// when it is handed a nil runner.
func TestStartOCICacheGCSkipsNilRunner(t *testing.T) {
	group, groupCtx := errgroup.WithContext(context.Background())

	if startOCICacheGC(group, groupCtx, nil) {
		t.Fatalf("expected nil oci cache gc runner not to start")
	}
}

// TestStartOCICacheGCStartsRunner checks that startOCICacheGC launches the
// runner exactly once on the errgroup and that the group drains cleanly once
// the context is cancelled.
func TestStartOCICacheGCStartsRunner(t *testing.T) {
	group, groupCtx := errgroup.WithContext(context.Background())
	runCtx, stop := context.WithCancel(groupCtx)
	defer stop()

	stub := &stubOCICacheGCRunner{}
	if !startOCICacheGC(group, runCtx, stub) {
		t.Fatalf("expected oci cache gc runner to start")
	}

	// Give the goroutine up to one second to enter Run, polling every 10ms.
	for giveUp := time.Now().Add(time.Second); stub.runCount.Load() == 0 && time.Now().Before(giveUp); {
		time.Sleep(10 * time.Millisecond)
	}
	if stub.runCount.Load() != 1 {
		t.Fatalf("expected runner to be started once, got %d", stub.runCount.Load())
	}

	// Cancelling unblocks the stub's Run so Wait can return.
	stop()
	if waitErr := group.Wait(); waitErr != nil {
		t.Fatalf("wait for oci cache gc runner: %v", waitErr)
	}
}
5 changes: 5 additions & 0 deletions config.example.darwin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ logging:
# - docker.io/library/* # match normalized repository names
# - ghcr.io/kernel/* # use ["*"] to allow deletion for every repository
# # only affects data_dir/images, not the shared OCI cache
# oci_cache_gc:
# enabled: false # mark-and-sweep GC for data_dir/system/oci-cache
# interval: 1h # how often to run a sweep
# min_blob_age: 1h # grace period; blobs written more recently are kept
# # to avoid racing with concurrent pulls

# =============================================================================
# Caddy / Ingress Configuration
Expand Down
5 changes: 5 additions & 0 deletions config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ data_dir: /var/lib/hypeman
# - docker.io/library/* # match normalized repository names
# - ghcr.io/kernel/* # use ["*"] to allow deletion for every repository
# # only affects data_dir/images, not the shared OCI cache
# oci_cache_gc:
# enabled: false # mark-and-sweep GC for data_dir/system/oci-cache
# interval: 1h # how often to run a sweep
# min_blob_age: 1h # grace period; blobs written more recently are kept
# # to avoid racing with concurrent pulls

# =============================================================================
# Caddy / Ingress Configuration
Expand Down
2 changes: 1 addition & 1 deletion lib/imageretention/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ When auto-delete is enabled:

- The server runs a retention sweep on startup and then every minute.
- Only converted cached images under `data_dir/images` are eligible for deletion.
- Shared OCI cache data under `data_dir/system/oci-cache` is not modified.
- Shared OCI cache data under `data_dir/system/oci-cache` is not modified by this feature; see `lib/ocicachegc` for a separate mark-and-sweep collector that reclaims orphaned blobs from that directory.
- An image repository must also match at least one `allowed` pattern before any retention state is recorded or deletion is attempted.

An image is considered in use if any persisted instance metadata or snapshot record still references it. As long as at least one such reference exists, the image is protected from deletion.
Expand Down
56 changes: 56 additions & 0 deletions lib/ocicachegc/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# OCI Cache GC

Mark-and-sweep garbage collector for the shared OCI cache at
`data_dir/system/oci-cache`.

The cache is populated every time an image is pulled or pushed and was
previously write-only: nothing ever removed layer, config, or manifest
blobs, so the cache grew unbounded. This collector reclaims the space
used by manifests and layers that are no longer referenced from
`index.json`.

## Configuration

```yaml
images:
oci_cache_gc:
enabled: false
interval: 1h
min_blob_age: 1h
```

When enabled, the server runs one pass immediately and then every
`interval` until shutdown.

## Algorithm

1. **Mark.** Read `index.json` and walk every referenced descriptor. For
each manifest or manifest-index blob we descend into its `config`,
`layers`, `manifests`, and `subject` references. The set of visited
digests is the live set.
2. **Sweep.** List `blobs/sha256/`. Delete every file that has a valid
64-char hex digest as its name, is absent from the live set, and has an
`mtime` older than `min_blob_age`.

Blobs that are referenced but unparseable are kept as opaque leaves; the
collector never deletes a blob it cannot prove is dead.

## Concurrency

Pulls (`layout.AppendImage`) and pushes (`BlobStore.Put`) write blobs
before updating `index.json`. During that window a blob exists on disk
but is not yet in the live set. `min_blob_age` is the grace period that
protects these in-flight writes — it should comfortably exceed the time
it takes to pull the largest image in your environment.

Temporary files (`<digest>.tmp` used by `BlobStore.Put`) are ignored
entirely because they do not match the blob filename pattern.

## Metrics

| Metric | Type | Description |
| ------ | ---- | ----------- |
| `hypeman_oci_cache_gc_sweeps_total` | counter | Sweeps, tagged by status |
| `hypeman_oci_cache_gc_sweep_duration_seconds` | histogram | Sweep duration |
| `hypeman_oci_cache_gc_deleted_blobs_total` | counter | Blobs deleted |
| `hypeman_oci_cache_gc_deleted_bytes_total` | counter | Bytes reclaimed |
Loading
Loading