Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/reference/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ In this folder, you should find technical references material of the hcloud-clou
- [Version Policy](version-policy.md)
- [Load Balancer Annotations](load_balancer_annotations.md)
- [Load Balancer Environment Variables](load_balancer_envs.md)
- [Instance Cache](instance_cache.md)
26 changes: 26 additions & 0 deletions docs/reference/instance_cache.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Instance Cache

> **Experimental:** Instance caching is experimental; breaking changes may occur within minor releases. We believe the implementation is safe in practice, which is why it ships enabled by default (`all-server`). Set `HCLOUD_INSTANCES_CACHE_MODE=off` to opt out.

The instance cache reduces calls to the Hetzner Cloud API made by the `InstancesV2` controller, which looks up Servers by ID or name to reconcile Node state. The cache sits between the controller and the Hetzner Cloud API; behavior is controlled by the environment variables below.

## Environment Variables

| Name | Type | Default | Description |
| ----------------------------- | --------------------------------- | ------------ | ------------------------------------------------------------------------------------- |
| `HCLOUD_INSTANCES_CACHE_MODE` | `all-server \| per-server \| off` | `all-server` | Selects the caching strategy. See [Modes](#modes) below. |
| `HCLOUD_INSTANCES_CACHE_TTL` | `duration` | `10s` | Lifetime of cached entries. Accepts any Go `time.Duration` string (e.g. `30s`, `2m`). |

## Modes

### `all-server`

Fetches every Server in the project with a single `GET /servers` call and serves all subsequent `ByID` / `ByName` lookups from the resulting snapshot until the TTL expires. The snapshot is refreshed on the next lookup after expiry. On a cache miss within the TTL (e.g. for a freshly created Server), one rate-limited refresh per TTL window is allowed to pick up the new Server; further misses in the same window return without another API call.

### `per-server`

Caches each Server individually with its own expiration. A `ByID` / `ByName` lookup either returns a non-expired entry or issues a `GET /servers/{id}` (or `GET /servers?name=`) call and stores the result. Expired entries are evicted lazily when other entries are inserted.

### `off`

Disables caching entirely. Every lookup goes directly to the API.
36 changes: 22 additions & 14 deletions hcloud/cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/hcops"
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics"
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/robot"
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache"
"github.com/hetznercloud/hcloud-go/v2/hcloud"
"github.com/hetznercloud/hcloud-go/v2/hcloud/metadata"
)
Expand All @@ -50,13 +51,14 @@ const (
var providerVersion = "unknown"

type cloud struct {
client *hcloud.Client
robotClient hrobot.RobotClient
cfg config.HCCMConfiguration
recorder record.EventRecorder
networkID int64
cidr string
nodeLister corelisters.NodeLister
client *hcloud.Client
robotClient hrobot.RobotClient
instanceCache servercache.ServerCache
cfg config.HCCMConfiguration
recorder record.EventRecorder
networkID int64
cidr string
nodeLister corelisters.NodeLister
}

func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Interface, error) {
Expand Down Expand Up @@ -144,13 +146,19 @@ func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Int

klog.Infof("Hetzner Cloud k8s cloud controller %s started\n", providerVersion)

instanceCache, err := servercache.New(client, "instances_v2", cfg.Instance.Cache.Mode, cfg.Instance.Cache.TTL)
if err != nil {
return nil, fmt.Errorf("%s: %w", op, err)
}

return &cloud{
client: client,
robotClient: robotClient,
cfg: cfg,
networkID: networkID,
cidr: cidr,
nodeLister: nodeLister,
client: client,
robotClient: robotClient,
instanceCache: instanceCache,
cfg: cfg,
networkID: networkID,
cidr: cidr,
nodeLister: nodeLister,
}, nil
}

Expand All @@ -175,7 +183,7 @@ func (c *cloud) Instances() (cloudprovider.Instances, bool) {
}

func (c *cloud) InstancesV2() (cloudprovider.InstancesV2, bool) {
return newInstances(c.client, c.robotClient, c.recorder, c.networkID, c.cfg), true
return newInstances(c.client, c.robotClient, c.instanceCache, c.recorder, c.networkID, c.cfg), true
}

func (c *cloud) Zones() (cloudprovider.Zones, bool) {
Expand Down
5 changes: 5 additions & 0 deletions hcloud/cloud_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"k8s.io/client-go/tools/record"

"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/config"
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache"
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/testsupport"
"github.com/hetznercloud/hcloud-go/v2/hcloud"
"github.com/hetznercloud/hcloud-go/v2/hcloud/schema"
Expand All @@ -41,6 +42,7 @@ type testEnv struct {
Mux *http.ServeMux
Client *hcloud.Client
RobotClient hrobot.RobotClient
ServerCache servercache.ServerCache
Recorder record.EventRecorder
Cfg config.HCCMConfiguration
}
Expand All @@ -51,6 +53,7 @@ func (env *testEnv) Teardown() {
env.Mux = nil
env.Client = nil
env.RobotClient = nil
env.ServerCache = nil
env.Recorder = nil
}

Expand All @@ -66,6 +69,7 @@ func newTestEnv() testEnv {
)
robotClient := hrobot.NewBasicAuthClient("", "")
robotClient.SetBaseURL(server.URL + "/robot")
serverCache := servercache.NewPerServerCache(client, "instances_v2", 10*time.Second)
recorder := record.NewBroadcaster().NewRecorder(scheme.Scheme, corev1.EventSource{Component: "hcloud-cloud-controller-manager"})

cfg := config.HCCMConfiguration{}
Expand All @@ -76,6 +80,7 @@ func newTestEnv() testEnv {
Mux: mux,
Client: client,
RobotClient: robotClient,
ServerCache: serverCache,
Recorder: recorder,
Cfg: cfg,
}
Expand Down
12 changes: 9 additions & 3 deletions hcloud/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/legacydatacenter"
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics"
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/providerid"
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache"
"github.com/hetznercloud/hcloud-go/v2/hcloud"
)

Expand All @@ -44,6 +45,7 @@ const (
type instances struct {
client *hcloud.Client
robotClient hrobot.RobotClient
serverCache servercache.ServerCache
recorder record.EventRecorder
networkID int64
cfg config.HCCMConfiguration
Expand All @@ -57,13 +59,15 @@ var (
func newInstances(
client *hcloud.Client,
robotClient hrobot.RobotClient,
serverCache servercache.ServerCache,
recorder record.EventRecorder,
networkID int64,
cfg config.HCCMConfiguration,
) *instances {
return &instances{
client,
robotClient,
serverCache,
recorder,
networkID,
cfg,
Expand All @@ -80,13 +84,12 @@ func (i *instances) lookupServer(
if node.Spec.ProviderID != "" {
var serverID int64
serverID, isCloudServer, err := providerid.ToServerID(node.Spec.ProviderID)

if err != nil {
return nil, fmt.Errorf("failed to convert provider id to server id: %w", err)
}

if isCloudServer {
server, err := getCloudServerByID(ctx, i.client, serverID)
server, err := i.serverCache.ByID(ctx, serverID)
if err != nil {
return nil, fmt.Errorf("failed to get hcloud server \"%d\": %w", serverID, err)
}
Expand Down Expand Up @@ -115,7 +118,7 @@ func (i *instances) lookupServer(

// If the node has no provider ID we try to find the server by name from
// both sources. In case we find two servers, we return an error.
cloudServer, err := getCloudServerByName(ctx, i.client, node.Name)
cloudServer, err := i.serverCache.ByName(ctx, node.Name)
if err != nil {
return nil, fmt.Errorf("failed to get hcloud server %q: %w", node.Name, err)
}
Expand Down Expand Up @@ -153,6 +156,7 @@ func (i *instances) lookupServer(
func (i *instances) InstanceExists(ctx context.Context, node *corev1.Node) (bool, error) {
const op = "hcloud/instancesv2.InstanceExists"
metrics.OperationCalled.WithLabelValues(op).Inc()
klog.V(4).InfoS("InstanceExists called", "node", node.Name, "providerID", node.Spec.ProviderID)

server, err := i.lookupServer(ctx, node)
if err != nil {
Expand All @@ -165,6 +169,7 @@ func (i *instances) InstanceExists(ctx context.Context, node *corev1.Node) (bool
func (i *instances) InstanceShutdown(ctx context.Context, node *corev1.Node) (bool, error) {
const op = "hcloud/instancesv2.InstanceShutdown"
metrics.OperationCalled.WithLabelValues(op).Inc()
klog.V(4).InfoS("InstanceShutdown called", "node", node.Name, "providerID", node.Spec.ProviderID)

server, err := i.lookupServer(ctx, node)
if err != nil {
Expand All @@ -188,6 +193,7 @@ func (i *instances) InstanceShutdown(ctx context.Context, node *corev1.Node) (bo
func (i *instances) InstanceMetadata(ctx context.Context, node *corev1.Node) (*cloudprovider.InstanceMetadata, error) {
const op = "hcloud/instancesv2.InstanceMetadata"
metrics.OperationCalled.WithLabelValues(op).Inc()
klog.V(4).InfoS("InstanceMetadata called", "node", node.Name, "providerID", node.Spec.ProviderID)

server, err := i.lookupServer(ctx, node)
if err != nil {
Expand Down
41 changes: 26 additions & 15 deletions hcloud/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ func TestInstances_InstanceExists(t *testing.T) {
})
})

instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg)
instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg)

tests := []struct {
name string
Expand All @@ -104,7 +104,8 @@ func TestInstances_InstanceExists(t *testing.T) {
Spec: corev1.NodeSpec{ProviderID: "hcloud://1"},
},
expected: true,
}, {
},
{
name: "existing robot server by id",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -123,49 +124,56 @@ func TestInstances_InstanceExists(t *testing.T) {
Spec: corev1.NodeSpec{ProviderID: "hcloud://bm-321"},
},
expected: true,
}, {
},
{
name: "missing server by id",
node: &corev1.Node{
Spec: corev1.NodeSpec{ProviderID: "hcloud://2"},
},
expected: false,
}, {
},
{
name: "missing robot server by id",
node: &corev1.Node{
Spec: corev1.NodeSpec{ProviderID: "hrobot://322"},
},
expected: false,
}, {
},
{
name: "missing robot server by (legacy) id",
node: &corev1.Node{
Spec: corev1.NodeSpec{ProviderID: "hcloud://bm-322"},
},
expected: false,
}, {
},
{
name: "existing server by name",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "foobar",
},
},
expected: true,
}, {
},
{
name: "existing robot server by name",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "robot-server1",
},
},
expected: true,
}, {
},
{
name: "missing server by name",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "barfoo",
},
},
expected: false,
}, {
},
{
name: "missing robot server by name",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -211,7 +219,7 @@ func TestInstances_InstanceShutdown(t *testing.T) {
})
})

instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg)
instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg)
env.Mux.HandleFunc("/robot/server/3", func(w http.ResponseWriter, _ *http.Request) {
json.NewEncoder(w).Encode(hrobotmodels.ServerResponse{
Server: hrobotmodels.Server{
Expand Down Expand Up @@ -274,13 +282,15 @@ func TestInstances_InstanceShutdown(t *testing.T) {
Spec: corev1.NodeSpec{ProviderID: "hcloud://1"},
},
expected: false,
}, {
},
{
name: "[cloud] shutdown",
node: &corev1.Node{
Spec: corev1.NodeSpec{ProviderID: "hcloud://2"},
},
expected: true,
}, {
},
{
name: "[robot] running",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -289,7 +299,8 @@ func TestInstances_InstanceShutdown(t *testing.T) {
Spec: corev1.NodeSpec{ProviderID: "hrobot://3"},
},
expected: false,
}, {
},
{
name: "[robot] shutdown",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Expand Down Expand Up @@ -346,7 +357,7 @@ func TestInstances_InstanceMetadata(t *testing.T) {
})
})

instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg)
instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg)

metadata, err := instances.InstanceMetadata(context.TODO(), &corev1.Node{
Spec: corev1.NodeSpec{ProviderID: "hcloud://1"},
Expand Down Expand Up @@ -390,7 +401,7 @@ func TestInstances_InstanceMetadataRobotServer(t *testing.T) {
})
})

instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg)
instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg)

metadata, err := instances.InstanceMetadata(context.TODO(), &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Expand Down
Loading