diff --git a/docs/reference/README.md b/docs/reference/README.md index 82e995f2..12942c0b 100644 --- a/docs/reference/README.md +++ b/docs/reference/README.md @@ -6,3 +6,4 @@ In this folder, you should find technical references material of the hcloud-clou - [Version Policy](version-policy.md) - [Load Balancer Annotations](load_balancer_annotations.md) - [Load Balancer Environment Variables](load_balancer_envs.md) +- [Instance Cache](instance_cache.md) diff --git a/docs/reference/instance_cache.md b/docs/reference/instance_cache.md new file mode 100644 index 00000000..6ba6491a --- /dev/null +++ b/docs/reference/instance_cache.md @@ -0,0 +1,26 @@ +# Instance Cache + +> **Experimental:** Instance caching is experimental, breaking changes may occur within minor releases. We believe the implementation is safe in practice — that is why it ships enabled by default (`all-server`). Set `HCLOUD_INSTANCES_CACHE_MODE=off` to opt out. + +The instance cache reduces calls to the Hetzner Cloud API made by the `InstancesV2` controller, which looks up Servers by ID or name to reconcile Node state. The cache sits between the controller and the Hetzner Cloud API; behavior is controlled by the environment variables below. + +## Environment Variables + +| Name | Type | Default | Description | +| ----------------------------- | --------------------------------- | ------------ | ------------------------------------------------------------------------------------- | +| `HCLOUD_INSTANCES_CACHE_MODE` | `all-server \| per-server \| off` | `all-server` | Selects the caching strategy. See [Modes](#modes) below. | +| `HCLOUD_INSTANCES_CACHE_TTL` | `duration` | `10s` | Lifetime of cached entries. Accepts any Go `time.Duration` string (e.g. `30s`, `2m`). | + +## Modes + +### `all-server` + +Fetches every Server in the project with a single `GET /servers` call and serves all subsequent `ByID` / `ByName` lookups from the resulting snapshot until the TTL expires. The snapshot is refreshed on the next lookup after expiry. On a cache miss within the TTL (e.g. a freshly created Server), one rate-limited refresh per TTL window is allowed to pick up the new Server; further misses in the same window return without an API call. + +### `per-server` + +Caches each Server individually with its own expiration. A `ByID` / `ByName` lookup either returns a non-expired entry or issues a `GET /servers/{id}` (or `GET /servers?name=`) call and stores the result. Expired entries are evicted lazily when other entries are inserted. + +### `off` + +Disables caching entirely. Every lookup goes directly to the API. diff --git a/hcloud/cloud.go b/hcloud/cloud.go index ab1bd4cd..63d2fbe7 100644 --- a/hcloud/cloud.go +++ b/hcloud/cloud.go @@ -37,6 +37,7 @@ import ( "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/hcops" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/robot" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/metadata" ) @@ -50,13 +51,14 @@ const ( var providerVersion = "unknown" type cloud struct { - client *hcloud.Client - robotClient hrobot.RobotClient - cfg config.HCCMConfiguration - recorder record.EventRecorder - networkID int64 - cidr string - nodeLister corelisters.NodeLister + client *hcloud.Client + robotClient hrobot.RobotClient + instanceCache servercache.ServerCache + cfg config.HCCMConfiguration + recorder record.EventRecorder + networkID int64 + cidr string + nodeLister corelisters.NodeLister } func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Interface, error) { @@ -144,13 +146,19 @@ func NewCloud(cidr string, nodeLister corelisters.NodeLister) (cloudprovider.Int klog.Infof("Hetzner Cloud k8s cloud controller %s started\n", providerVersion) + instanceCache, err := servercache.New(client, "instances_v2", cfg.Instance.Cache.Mode, cfg.Instance.Cache.TTL) + if err != nil { + return nil, fmt.Errorf("%s: %w", op, err) + } + return &cloud{ - client: client, - robotClient: robotClient, - cfg: cfg, - networkID: networkID, - cidr: cidr, - nodeLister: nodeLister, + client: client, + robotClient: robotClient, + instanceCache: instanceCache, + cfg: cfg, + networkID: networkID, + cidr: cidr, + nodeLister: nodeLister, }, nil } @@ -175,7 +183,7 @@ func (c *cloud) Instances() (cloudprovider.Instances, bool) { } func (c *cloud) InstancesV2() (cloudprovider.InstancesV2, bool) { - return newInstances(c.client, c.robotClient, c.recorder, c.networkID, c.cfg), true + return newInstances(c.client, c.robotClient, c.instanceCache, c.recorder, c.networkID, c.cfg), true } func (c *cloud) Zones() (cloudprovider.Zones, bool) { diff --git a/hcloud/cloud_test.go b/hcloud/cloud_test.go index 28e363c9..fa653642 100644 --- a/hcloud/cloud_test.go +++ b/hcloud/cloud_test.go @@ -31,6 +31,7 @@ import ( "k8s.io/client-go/tools/record" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/config" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/testsupport" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" @@ -41,6 +42,7 @@ type testEnv struct { Mux *http.ServeMux Client *hcloud.Client RobotClient hrobot.RobotClient + ServerCache servercache.ServerCache Recorder record.EventRecorder Cfg config.HCCMConfiguration } @@ -51,6 +53,7 @@ func (env *testEnv) Teardown() { env.Mux = nil env.Client = nil env.RobotClient = nil + env.ServerCache = nil env.Recorder = nil } @@ -66,6 +69,7 @@ func newTestEnv() testEnv { ) robotClient := hrobot.NewBasicAuthClient("", "") robotClient.SetBaseURL(server.URL + "/robot") + serverCache := servercache.NewPerServerCache(client, "instances_v2", 10*time.Second) recorder := record.NewBroadcaster().NewRecorder(scheme.Scheme, corev1.EventSource{Component: "hcloud-cloud-controller-manager"}) cfg := config.HCCMConfiguration{} @@ -76,6 +80,7 @@ func newTestEnv() testEnv { Mux: mux, Client: client, RobotClient: robotClient, + ServerCache: serverCache, Recorder: recorder, Cfg: cfg, } diff --git a/hcloud/instances.go b/hcloud/instances.go index a6f7242b..e72e8f7f 100644 --- a/hcloud/instances.go +++ b/hcloud/instances.go @@ -33,6 +33,7 @@ import ( "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/legacydatacenter" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/providerid" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-go/v2/hcloud" ) @@ -44,6 +45,7 @@ const ( type instances struct { client *hcloud.Client robotClient hrobot.RobotClient + serverCache servercache.ServerCache recorder record.EventRecorder networkID int64 cfg config.HCCMConfiguration @@ -57,6 +59,7 @@ var ( func newInstances( client *hcloud.Client, robotClient hrobot.RobotClient, + serverCache servercache.ServerCache, recorder record.EventRecorder, networkID int64, cfg config.HCCMConfiguration, @@ -64,6 +67,7 @@ func newInstances( return &instances{ client, robotClient, + serverCache, recorder, networkID, cfg, @@ -80,13 +84,12 @@ func (i *instances) lookupServer( if node.Spec.ProviderID != "" { var serverID int64 serverID, isCloudServer, err := providerid.ToServerID(node.Spec.ProviderID) - if err != nil { return nil, fmt.Errorf("failed to convert provider id to server id: %w", err) } if isCloudServer { - server, err := getCloudServerByID(ctx, i.client, serverID) + server, err := i.serverCache.ByID(ctx, serverID) if err != nil { return nil, fmt.Errorf("failed to get hcloud server \"%d\": %w", serverID, err) } @@ -115,7 +118,7 @@ func (i *instances) lookupServer( // If the node has no provider ID we try to find the server by name from // both sources. In case we find two servers, we return an error. - cloudServer, err := getCloudServerByName(ctx, i.client, node.Name) + cloudServer, err := i.serverCache.ByName(ctx, node.Name) if err != nil { return nil, fmt.Errorf("failed to get hcloud server %q: %w", node.Name, err) } @@ -153,6 +156,7 @@ func (i *instances) lookupServer( func (i *instances) InstanceExists(ctx context.Context, node *corev1.Node) (bool, error) { const op = "hcloud/instancesv2.InstanceExists" metrics.OperationCalled.WithLabelValues(op).Inc() + klog.V(4).InfoS("InstanceExists called", "node", node.Name, "providerID", node.Spec.ProviderID) server, err := i.lookupServer(ctx, node) if err != nil { @@ -165,6 +169,7 @@ func (i *instances) InstanceExists(ctx context.Context, node *corev1.Node) (bool func (i *instances) InstanceShutdown(ctx context.Context, node *corev1.Node) (bool, error) { const op = "hcloud/instancesv2.InstanceShutdown" metrics.OperationCalled.WithLabelValues(op).Inc() + klog.V(4).InfoS("InstanceShutdown called", "node", node.Name, "providerID", node.Spec.ProviderID) server, err := i.lookupServer(ctx, node) if err != nil { @@ -188,6 +193,7 @@ func (i *instances) InstanceShutdown(ctx context.Context, node *corev1.Node) (bo func (i *instances) InstanceMetadata(ctx context.Context, node *corev1.Node) (*cloudprovider.InstanceMetadata, error) { const op = "hcloud/instancesv2.InstanceMetadata" metrics.OperationCalled.WithLabelValues(op).Inc() + klog.V(4).InfoS("InstanceMetadata called", "node", node.Name, "providerID", node.Spec.ProviderID) server, err := i.lookupServer(ctx, node) if err != nil { diff --git a/hcloud/instances_test.go b/hcloud/instances_test.go index 5f4aef91..81871a3f 100644 --- a/hcloud/instances_test.go +++ b/hcloud/instances_test.go @@ -91,7 +91,7 @@ func TestInstances_InstanceExists(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg) tests := []struct { name string @@ -104,7 +104,8 @@ func TestInstances_InstanceExists(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, }, expected: true, - }, { + }, + { name: "existing robot server by id", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -123,25 +124,29 @@ func TestInstances_InstanceExists(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hcloud://bm-321"}, }, expected: true, - }, { + }, + { name: "missing server by id", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://2"}, }, expected: false, - }, { + }, + { name: "missing robot server by id", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hrobot://322"}, }, expected: false, - }, { + }, + { name: "missing robot server by (legacy) id", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://bm-322"}, }, expected: false, - }, { + }, + { name: "existing server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -149,7 +154,8 @@ func TestInstances_InstanceExists(t *testing.T) { }, }, expected: true, - }, { + }, + { name: "existing robot server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -157,7 +163,8 @@ func TestInstances_InstanceExists(t *testing.T) { }, }, expected: true, - }, { + }, + { name: "missing server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -165,7 +172,8 @@ func TestInstances_InstanceExists(t *testing.T) { }, }, expected: false, - }, { + }, + { name: "missing robot server by name", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -211,7 +219,7 @@ func TestInstances_InstanceShutdown(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg) env.Mux.HandleFunc("/robot/server/3", func(w http.ResponseWriter, _ *http.Request) { json.NewEncoder(w).Encode(hrobotmodels.ServerResponse{ Server: hrobotmodels.Server{ @@ -274,13 +282,15 @@ func TestInstances_InstanceShutdown(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, }, expected: false, - }, { + }, + { name: "[cloud] shutdown", node: &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://2"}, }, expected: true, - }, { + }, + { name: "[robot] running", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -289,7 +299,8 @@ func TestInstances_InstanceShutdown(t *testing.T) { Spec: corev1.NodeSpec{ProviderID: "hrobot://3"}, }, expected: false, - }, { + }, + { name: "[robot] shutdown", node: &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ @@ -346,7 +357,7 @@ func TestInstances_InstanceMetadata(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg) metadata, err := instances.InstanceMetadata(context.TODO(), &corev1.Node{ Spec: corev1.NodeSpec{ProviderID: "hcloud://1"}, @@ -390,7 +401,7 @@ func TestInstances_InstanceMetadataRobotServer(t *testing.T) { }) }) - instances := newInstances(env.Client, env.RobotClient, env.Recorder, 0, env.Cfg) + instances := newInstances(env.Client, env.RobotClient, env.ServerCache, env.Recorder, 0, env.Cfg) metadata, err := instances.InstanceMetadata(context.TODO(), &corev1.Node{ ObjectMeta: metav1.ObjectMeta{ diff --git a/hcloud/instances_util.go b/hcloud/instances_util.go index 370c68c9..a86f0b44 100644 --- a/hcloud/instances_util.go +++ b/hcloud/instances_util.go @@ -17,7 +17,6 @@ limitations under the License. package hcloud import ( - "context" "fmt" "regexp" "strings" @@ -26,9 +25,6 @@ import ( hrobotmodels "github.com/syself/hrobot-go/models" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" - - "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" - "github.com/hetznercloud/hcloud-go/v2/hcloud" ) type MockEventRecorder struct{} @@ -51,29 +47,6 @@ func (er *MockEventRecorder) AnnotatedEventf( ) { } -func getCloudServerByName(ctx context.Context, c *hcloud.Client, name string) (*hcloud.Server, error) { - const op = "hcloud/getCloudServerByName" - metrics.OperationCalled.WithLabelValues(op).Inc() - - server, _, err := c.Server.GetByName(ctx, name) - if err != nil { - return nil, fmt.Errorf("%s: %w", op, err) - } - - return server, nil -} - -func getCloudServerByID(ctx context.Context, c *hcloud.Client, id int64) (*hcloud.Server, error) { - const op = "hcloud/getCloudServerByID" - metrics.OperationCalled.WithLabelValues(op).Inc() - - server, _, err := c.Server.GetByID(ctx, id) - if err != nil { - return nil, fmt.Errorf("%s: %w", op, err) - } - return server, nil -} - func getRobotServerByName(c hrobot.RobotClient, node *corev1.Node) (server *hrobotmodels.Server, err error) { const op = "hcloud/getRobotServerByName" diff --git a/internal/config/config.go b/internal/config/config.go index 32c08f40..628f293c 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -11,6 +11,7 @@ import ( "k8s.io/klog/v2" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-go/v2/hcloud" "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/kit/envutil" ) @@ -29,6 +30,8 @@ const ( robotForwardInternalIPs = "ROBOT_FORWARD_INTERNAL_IPS" hcloudInstancesAddressFamily = "HCLOUD_INSTANCES_ADDRESS_FAMILY" + hcloudInstancesCacheMode = "HCLOUD_INSTANCES_CACHE_MODE" + hcloudInstancesCacheTTL = "HCLOUD_INSTANCES_CACHE_TTL" // Disable the "master/server is attached to the network" check against the metadata service. hcloudNetworkDisableAttachedCheck = "HCLOUD_NETWORK_DISABLE_ATTACHED_CHECK" @@ -67,8 +70,16 @@ const ( AddressFamilyIPv4 AddressFamily = "ipv4" ) +const InstanceCacheDefaultTTL time.Duration = 10 * time.Second + type InstanceConfiguration struct { AddressFamily AddressFamily + Cache InstanceConfigurationCache +} + +type InstanceConfigurationCache struct { + Mode servercache.Mode + TTL time.Duration } type LoadBalancerConfiguration struct { @@ -174,6 +185,26 @@ func Read() (HCCMConfiguration, error) { cfg.Instance.AddressFamily = AddressFamilyIPv4 } + // ---- Instance Cache ---- + + cfg.Instance.Cache = InstanceConfigurationCache{ + Mode: servercache.ModeAllServers, + TTL: InstanceCacheDefaultTTL, + } + + if mode, ok := os.LookupEnv(hcloudInstancesCacheMode); ok { + cfg.Instance.Cache.Mode = servercache.Mode(mode) + } + + if ttlStr, ok := os.LookupEnv(hcloudInstancesCacheTTL); ok { + ttl, err := time.ParseDuration(ttlStr) + if err != nil { + errs = append(errs, fmt.Errorf("invalid value for %q: %w", hcloudInstancesCacheTTL, err)) + } else { + cfg.Instance.Cache.TTL = ttl + } + } + cfg.LoadBalancer.Enabled, err = getEnvBool(hcloudLoadBalancersEnabled, true) if err != nil { errs = append(errs, err) diff --git a/internal/config/config_test.go b/internal/config/config_test.go index d6fb65ed..879cab2e 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -7,6 +7,7 @@ import ( "github.com/stretchr/testify/assert" + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/servercache" "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/testsupport" "github.com/hetznercloud/hcloud-go/v2/hcloud" ) @@ -25,7 +26,7 @@ func TestRead(t *testing.T) { want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -48,7 +49,7 @@ func TestRead(t *testing.T) { HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ NameOrID: "foobar", AttachedCheckEnabled: true, @@ -85,7 +86,7 @@ func TestRead(t *testing.T) { ForwardInternalIPs: false, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -141,7 +142,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or }, Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -170,7 +171,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: false, Address: "127.0.0.1:9999"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -201,7 +202,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or ForwardInternalIPs: true, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -233,7 +234,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or ForwardInternalIPs: false, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -253,7 +254,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv6}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv6, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -274,7 +275,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ Enabled: true, PrivateIngressEnabled: true, @@ -297,7 +298,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ Enabled: true, PrivateIngressEnabled: true, @@ -324,7 +325,7 @@ failed to read ROBOT_PASSWORD_FILE: open /tmp/hetzner-password: no such file or want: HCCMConfiguration{ Robot: RobotConfiguration{CacheTimeout: 5 * time.Minute}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ AttachedCheckEnabled: true, }, @@ -413,7 +414,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "minimal", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, }, wantErr: nil, }, @@ -423,7 +424,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Network: NetworkConfiguration{ NameOrID: "foobar", }, @@ -437,7 +438,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "token missing", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: ""}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, }, wantErr: errors.New("environment variable \"HCLOUD_TOKEN\" is required"), }, @@ -445,7 +446,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "address family invalid", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamily("foobar")}, + Instance: InstanceConfiguration{AddressFamily: AddressFamily("foobar"), Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, }, wantErr: errors.New("invalid value for \"HCLOUD_INSTANCES_ADDRESS_FAMILY\", expect one of: ipv4,ipv6,dualstack"), }, @@ -453,7 +454,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "LB location and network zone set", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ Location: "nbg1", NetworkZone: "eu-central", @@ -465,7 +466,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "LB private subnet invalid cidr", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ PrivateSubnetIPRange: "10.0.0.0/33", }, @@ -476,7 +477,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "algorithm type invalid", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, LoadBalancer: LoadBalancerConfiguration{ AlgorithmType: "invalid", }, @@ -487,7 +488,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled without credentials (valid)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Robot: RobotConfiguration{ Enabled: true, @@ -499,7 +500,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled with partial credentials (only user)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Robot: RobotConfiguration{ Enabled: true, @@ -513,7 +514,7 @@ func TestHCCMConfiguration_Validate(t *testing.T) { name: "robot enabled with partial credentials (only password)", fields: fields{ HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Robot: RobotConfiguration{ Enabled: true, @@ -526,9 +527,8 @@ func TestHCCMConfiguration_Validate(t *testing.T) { { name: "robot & routes activated", fields: fields{ - HCloudClient: HCloudClientConfiguration{Token: "jr5g7ZHpPptyhJzZyHw2Pqu4g9gTqDvEceYpngPf79jN_NOT_VALID_dzhepnahq"}, - Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, + Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4, Cache: InstanceConfigurationCache{Mode: servercache.ModeAllServers, TTL: 10 * time.Second}}, Route: RouteConfiguration{Enabled: true}, Robot: RobotConfiguration{ Enabled: true, diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 46663e62..d17d79a7 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -37,10 +37,15 @@ var ( Name: "cloud_controller_manager_operations_total", Help: "The total number of operation was called", }, []string{"op"}) + + CacheRequests = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "cloud_controller_manager_cache_requests_total", + Help: "Total cache requests partitioned by cache name and result.", + }, []string{"subsystem", "mode", "result"}) ) func init() { - GetRegistry().MustRegister(OperationCalled) + GetRegistry().MustRegister(OperationCalled, CacheRequests) } func GetRegistry() prometheus.Registerer { diff --git a/internal/servercache/allservercache.go b/internal/servercache/allservercache.go new file mode 100644 index 00000000..f84c64cd --- /dev/null +++ b/internal/servercache/allservercache.go @@ -0,0 +1,119 @@ +package servercache + +import ( + "context" + "sync" + "time" + + "golang.org/x/time/rate" + "k8s.io/klog/v2" + + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" + "github.com/hetznercloud/hcloud-go/v2/hcloud" +) + +var _ ServerCache = (*AllServerCache)(nil) + +type AllServerCache struct { + subsystem string + mode Mode + ttl time.Duration + expiresAt time.Time + + client *hcloud.Client + + byID map[int64]*hcloud.Server + byName map[string]*hcloud.Server + + limiter *rate.Limiter + mu sync.Mutex +} + +func NewAllServerCache(client *hcloud.Client, subsystem string, ttl time.Duration) *AllServerCache { + return &AllServerCache{ + subsystem: subsystem, + mode: ModeAllServers, + ttl: ttl, + client: client, + expiresAt: time.Now(), + limiter: rate.NewLimiter(rate.Every(ttl), 1), + } +} + +// ByID implements [ServerCache]. +func (c *AllServerCache) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { + return c.getFromCache(ctx, func() *hcloud.Server { return c.byID[id] }) +} + +// ByName implements [ServerCache]. +func (c *AllServerCache) ByName(ctx context.Context, name string) (*hcloud.Server, error) { + return c.getFromCache(ctx, func() *hcloud.Server { return c.byName[name] }) +} + +func (c *AllServerCache) refresh(ctx context.Context) error { + klog.V(4).InfoS("all-server cache: refreshing from api") + servers, err := c.client.Server.All(ctx) + if err != nil { + return err + } + + c.byID = make(map[int64]*hcloud.Server, len(servers)) + c.byName = make(map[string]*hcloud.Server, len(servers)) + + for _, server := range servers { + c.byID[server.ID] = server + c.byName[server.Name] = server + } + + c.expiresAt = time.Now().Add(c.ttl) + klog.V(4).InfoS("all-server cache: refresh complete", "count", len(servers), "expiresAt", c.expiresAt) + + return nil +} + +func (c *AllServerCache) getFromCache(ctx context.Context, lookup func() *hcloud.Server) (*hcloud.Server, error) { + c.mu.Lock() + defer c.mu.Unlock() + + cacheRefreshed := false + if time.Now().After(c.expiresAt) { + klog.V(4).InfoS("all-server cache: cache expired, refreshing", "expiresAt", c.expiresAt) + if err := c.refresh(ctx); err != nil { + return nil, err + } + cacheRefreshed = true + } + + if server := lookup(); server != nil { + metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "hit").Inc() + klog.V(4).InfoS("all-server cache hit", "id", server.ID, "name", server.Name) + return server, nil + } + + metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "miss").Inc() + + // Server not found on fresh cache so return early. + if cacheRefreshed { + klog.V(4).InfoS("all-server cache: server not found in fresh snapshot") + return nil, nil + } + + // Cache was not refreshed and rate limiter does not allow refreshing right now. + if !c.limiter.Allow() { + klog.V(4).InfoS("all-server cache: miss-driven refresh denied by rate limiter") + return nil, ErrRateLimited + } + + klog.V(4).InfoS("all-server cache miss: refreshing to catch newly-created server") + if err := c.refresh(ctx); err != nil { + return nil, err + } + + server := lookup() + if server == nil { + klog.V(4).InfoS("all-server cache: server not found after refresh") + } else { + klog.V(4).InfoS("all-server cache: server found after refresh", "id", server.ID, "name", server.Name) + } + return server, nil +} diff --git a/internal/servercache/allservercache_test.go b/internal/servercache/allservercache_test.go new file mode 100644 index 00000000..e1349721 --- /dev/null +++ b/internal/servercache/allservercache_test.go @@ -0,0 +1,195 @@ +package servercache + +import ( + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "golang.org/x/time/rate" + + "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/mockutil" + "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" +) + +// allServersPath is the canonical response to `Server.All()` used by the +// AllServerCache tests. [hcloud.ServerClient.All] paginates with per_page=50. +const allServersPath = "/servers?page=1&per_page=50" + +func TestAllServerCache_ByID_HitAfterMiss(t *testing.T) { + // One refresh is expected; the second lookup within TTL must not trigger another. + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{ + {ID: 1, Name: "server-1"}, + {ID: 2, Name: "server-2"}, + }}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-1", srv.Name) + + srv, err = cache.ByID(t.Context(), 2) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-2", srv.Name) +} + +func TestAllServerCache_ByName_HitAfterMiss(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, int64(1), srv.ID) + + srv, err = cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) +} + +func TestAllServerCache_TTLExpiry(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", 0) + + _, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + _, err = cache.ByID(t.Context(), 1) + require.NoError(t, err) +} + +func TestAllServerCache_MissTriggersRefresh(t *testing.T) { + // Initially only server 1 is returned. A lookup for id=2 triggers a refresh + // that now also returns server 2 (e.g. because it was just created). + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{ + {ID: 1, Name: "server-1"}, + {ID: 2, Name: "server-2"}, + }}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + cache.limiter = rate.NewLimiter(rate.Inf, 1) // isolate the test from the limiter + + _, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + + srv, err := cache.ByID(t.Context(), 2) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-2", srv.Name) +} + +func TestAllServerCache_ServerNotFoundAfterRefresh(t *testing.T) { + // A missing server triggers a refresh on every lookup, since we have no + // way to distinguish "just created" from "does not exist". + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + cache.limiter = rate.NewLimiter(rate.Inf, 1) // isolate the test from the limiter + + srv, err := cache.ByID(t.Context(), 999) + require.NoError(t, err) + assert.Nil(t, srv) + + srv, err = cache.ByID(t.Context(), 999) + require.NoError(t, err) + assert.Nil(t, srv) +} + +func TestAllServerCache_RateLimitedRefreshReturnsErr(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + cache.limiter = rate.NewLimiter(rate.Every(time.Hour), 1) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + + // Drain the limiter so the next miss-driven refresh is denied. + require.True(t, cache.limiter.Allow()) + + srv, err = cache.ByID(t.Context(), 999) + require.ErrorIs(t, err, ErrRateLimited) + assert.Nil(t, srv) +} + +func TestAllServerCache_ExpiredRefreshFailureDoesNotConsumeLimiter(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, + Status: http.StatusInternalServerError, + JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, + }, + { + Method: "GET", Path: allServersPath, Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + cache.limiter = rate.NewLimiter(rate.Every(time.Hour), 1) + + _, err := cache.ByID(t.Context(), 1) + require.Error(t, err) + assert.NotErrorIs(t, err, ErrRateLimited) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-1", srv.Name) +} + +func TestAllServerCache_APIError(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: allServersPath, + Status: http.StatusInternalServerError, + JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, + }, + }) + cache := NewAllServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 1) + require.Error(t, err) + assert.Nil(t, srv) +} diff --git a/internal/servercache/evalservercache.go b/internal/servercache/evalservercache.go new file mode 100644 index 00000000..c1df8664 --- /dev/null +++ b/internal/servercache/evalservercache.go @@ -0,0 +1,63 @@ +package servercache + +import ( + "context" + "time" + + "k8s.io/klog/v2" + + "github.com/hetznercloud/hcloud-go/v2/hcloud" +) + +// EvalCache wraps a [PerServerCache], an [AllServerCache], and a [Passthrough] and +// runs each cache sequentially for every lookup. + +var _ ServerCache = (*EvalCache)(nil) + +type EvalCache struct { + caches []ServerCache +} + +func NewEvalCache(client *hcloud.Client, subsystem string, ttl time.Duration) *EvalCache { + caches := []ServerCache{ + NewAllServerCache(client, subsystem, ttl), + NewPerServerCache(client, subsystem, ttl), + NewPassthrough(client), + } + + return &EvalCache{ + caches: caches, + } +} + +func (c *EvalCache) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { + return c.run(func(s ServerCache) (*hcloud.Server, error) { return s.ByID(ctx, id) }) +} + +func (c *EvalCache) ByName(ctx context.Context, name string) (*hcloud.Server, error) { + return c.run(func(s ServerCache) (*hcloud.Server, error) { return s.ByName(ctx, name) }) +} + +// run invokes every underlying cache so each one records its own metrics for +// the same request, then returns the first cache's result as the authoritative +// answer. Errors from secondary caches are logged but not propagated, so a +// transient failure in a comparison cache cannot affect the controller. +func (c *EvalCache) run(lookup func(ServerCache) (*hcloud.Server, error)) (*hcloud.Server, error) { + var ( + firstServer *hcloud.Server + firstErr error + ) + + for i, cache := range c.caches { + server, err := lookup(cache) + if i == 0 { + firstServer, firstErr = server, err + continue + } + if err != nil { + klog.V(4).InfoS("eval cache: secondary cache returned error", "index", i, "err", err) + } + } + + return firstServer, firstErr +} diff --git a/internal/servercache/perservercache.go b/internal/servercache/perservercache.go new file mode 100644 index 00000000..d54ad3e5 --- /dev/null +++ b/internal/servercache/perservercache.go @@ -0,0 +1,123 @@ +package servercache + +import ( + "context" + "sync" + "time" + + "k8s.io/klog/v2" + + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" + "github.com/hetznercloud/hcloud-go/v2/hcloud" +) + +var _ ServerCache = (*PerServerCache)(nil) + +// PerServerCache caches each Server with a separate expiration time. +// The caches removes expired entries. +type PerServerCache struct { + subsystem string + mode Mode + ttl time.Duration + + client *hcloud.Client + + byID map[int64]*perServerCacheEntry + byName map[string]*perServerCacheEntry + + mu sync.Mutex +} + +type perServerCacheEntry struct { + server *hcloud.Server + expiresAt time.Time +} + +func NewPerServerCache(client *hcloud.Client, subsystem string, ttl time.Duration) *PerServerCache { + return &PerServerCache{ + subsystem: subsystem, + mode: ModePerServer, + ttl: ttl, + client: client, + byID: make(map[int64]*perServerCacheEntry), + byName: make(map[string]*perServerCacheEntry), + } +} + +func (c *PerServerCache) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { + return c.getOrFetch( + func() *perServerCacheEntry { return c.byID[id] }, + func() (*hcloud.Server, *hcloud.Response, error) { return c.client.Server.GetByID(ctx, id) }, + ) +} + +func (c *PerServerCache) ByName(ctx context.Context, name string) (*hcloud.Server, error) { + return c.getOrFetch( + func() *perServerCacheEntry { return c.byName[name] }, + func() (*hcloud.Server, *hcloud.Response, error) { return c.client.Server.GetByName(ctx, name) }, + ) +} + +func (c *PerServerCache) getOrFetch( + lookup func() *perServerCacheEntry, + fetch func() (*hcloud.Server, *hcloud.Response, error), +) (*hcloud.Server, error) { + c.mu.Lock() + defer c.mu.Unlock() + + if entry := lookup(); entry != nil && time.Now().Before(entry.expiresAt) { + metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "hit").Inc() + klog.V(4).InfoS("per-server cache hit", "id", entry.server.ID, "name", entry.server.Name) + return entry.server, nil + } + + klog.V(4).InfoS("per-server cache miss, fetching from api") + server, _, err := fetch() + if err != nil { + return nil, err + } + metrics.CacheRequests.WithLabelValues(c.subsystem, string(c.mode), "miss").Inc() + if server != nil { + klog.V(4).InfoS("per-server cache: fetched server from api", "id", server.ID, "name", server.Name) + c.addToCache(server) + } else { + klog.V(4).InfoS("per-server cache: server not found via api") + } + + return server, nil +} + +// addToCache inserts (or refreshes) a server in the cache, evicting the +// expired entries. +// The caller must hold the mutex. +func (c *PerServerCache) addToCache(server *hcloud.Server) { + if existing, ok := c.byID[server.ID]; ok { + // Server name changed and needs updating. + if existing.server.Name != server.Name { + delete(c.byName, existing.server.Name) + } + existing.server = server + existing.expiresAt = time.Now().Add(c.ttl) + c.byName[server.Name] = existing + return + } + + // Create new server entry. + entry := &perServerCacheEntry{ + server: server, + expiresAt: time.Now().Add(c.ttl), + } + c.byID[server.ID] = entry + c.byName[server.Name] = entry + + // Evict expired entries to avoid deleted Servers being around + // until hccm restarts. + for key := range c.byID { + oldEntry := c.byID[key] + if time.Now().After(oldEntry.expiresAt) { + delete(c.byID, key) + delete(c.byName, oldEntry.server.Name) + klog.V(4).InfoS("per-server cache: evicted LRU entry", "id", key) + } + } +} diff --git a/internal/servercache/perservercache_test.go b/internal/servercache/perservercache_test.go new file mode 100644 index 00000000..c2170dc8 --- /dev/null +++ b/internal/servercache/perservercache_test.go @@ -0,0 +1,165 @@ +package servercache + +import ( + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hetznercloud/hcloud-go/v2/hcloud" + "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/mockutil" + "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" +) + +func TestPerServerCache_ByID_HitAfterMiss(t *testing.T) { + // Exactly one GET /servers/1 is expected; the second ByID must be served from cache. + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", + Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, + }, + }) + cache := NewPerServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, int64(1), srv.ID) + + srv, err = cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) +} + +func TestPerServerCache_ByName_HitAfterMiss(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers?name=server-1", + Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewPerServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-1", srv.Name) + + srv, err = cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) +} + +func TestPerServerCache_ByID_CrossPopulatesByName(t *testing.T) { + // Only the initial GetByID call is expected; the subsequent ByName must hit + // the cache populated by ByID. + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", + Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, + }, + }) + cache := NewPerServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + + srv, err = cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, int64(1), srv.ID) +} + +func TestPerServerCache_TTLExpiry(t *testing.T) { + // Zero TTL → every lookup misses and triggers a fresh GET. + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1", Status: string(hcloud.ServerStatusStarting)}}, + }, + { + Method: "GET", Path: "/servers/1", Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1", Status: string(hcloud.ServerStatusRunning)}}, + }, + }) + cache := NewPerServerCache(client, "instances_v2", 0) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, hcloud.ServerStatusStarting, srv.Status) + + srv, err = cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, hcloud.ServerStatusRunning, srv.Status) +} + +func TestPerServerCache_ServerNotFound(t *testing.T) { + // A nil result is not cached → both lookups must hit the api. + client := newCacheTestClient(t, []mockutil.Request{ + {Method: "GET", Path: "/servers/42", Status: http.StatusNotFound, JSON: notFoundResponse}, + {Method: "GET", Path: "/servers/42", Status: http.StatusNotFound, JSON: notFoundResponse}, + }) + cache := NewPerServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 42) + require.NoError(t, err) + assert.Nil(t, srv) + + srv, err = cache.ByID(t.Context(), 42) + require.NoError(t, err) + assert.Nil(t, srv) +} + +func TestPerServerCache_APIError(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", + Status: http.StatusInternalServerError, + JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, + }, + }) + cache := NewPerServerCache(client, "instances_v2", time.Minute) + + srv, err := cache.ByID(t.Context(), 1) + require.Error(t, err) + assert.Nil(t, srv) +} + +func TestPerServerCache_ExpiredEviction(t *testing.T) { + // Cache size 2; adding a third entry evicts the least-recently-used. + // Touching server 1 before inserting 3 keeps 1 in cache and evicts 2. + client := newCacheTestClient(t, []mockutil.Request{ + {Method: "GET", Path: "/servers/1", Status: http.StatusOK, JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}}, + {Method: "GET", Path: "/servers/2", Status: http.StatusOK, JSON: schema.ServerGetResponse{Server: schema.Server{ID: 2, Name: "server-2"}}}, + {Method: "GET", Path: "/servers/3", Status: http.StatusOK, JSON: schema.ServerGetResponse{Server: schema.Server{ID: 3, Name: "server-3"}}}, + }) + cache := NewPerServerCache(client, "instances_v2", time.Hour) + + ctx := t.Context() + _, err := cache.ByID(ctx, 1) + require.NoError(t, err) + _, err = cache.ByID(ctx, 2) + require.NoError(t, err) + + // Expire 2 + cache.byID[2].expiresAt = time.Unix(0, 0) + + // Adding 3 evicts the expired entry (2). + _, err = cache.ByID(ctx, 3) + require.NoError(t, err) + + assert.Len(t, cache.byID, 2) + assert.Contains(t, cache.byID, int64(1)) + assert.Contains(t, cache.byID, int64(3)) + assert.NotContains(t, cache.byID, int64(2)) + assert.Len(t, cache.byName, 2) + assert.NotContains(t, cache.byName, "server-2") +} diff --git a/internal/servercache/servercache.go b/internal/servercache/servercache.go new file mode 100644 index 00000000..adc297b2 --- /dev/null +++ b/internal/servercache/servercache.go @@ -0,0 +1,80 @@ +package servercache + +import ( + "context" + "errors" + "fmt" + "time" + + "k8s.io/klog/v2" + + "github.com/hetznercloud/hcloud-go/v2/hcloud" +) + +// ServerCache defines a caching layer for retrieving Hetzner Cloud servers. +type ServerCache interface { + // ByID retrieves a server by its unique numeric ID. + // Returns the server if found, nil and no error if not found, + // or nil and an error if the lookup fails. + ByID(context.Context, int64) (*hcloud.Server, error) + + // ByName retrieves a server by its name. + // Returns the server if found, nil and no error if not found, + // or nil and an error if the lookup fails. + ByName(context.Context, string) (*hcloud.Server, error) +} + +// ErrRateLimited is returned by a [ServerCache] when a lookup would have +// required a refresh but the cache's internal rate limiter denied it. +var ErrRateLimited = errors.New("refresh_rate_limited") + +type Mode string + +const ( + ModeAllServers Mode = "all-server" + ModePerServer Mode = "per-server" + ModeEval Mode = "eval" + ModeOff Mode = "off" +) + +func New(client *hcloud.Client, subsystem string, mode Mode, ttl time.Duration) (ServerCache, error) { + if mode != ModeOff { + klog.Warningf("instance caching is experimental, breaking changes may occur within minor releases; set HCLOUD_INSTANCES_CACHE_MODE=off to opt out (mode=%q)", mode) + } + switch mode { + case ModeAllServers: + return NewAllServerCache(client, subsystem, ttl), nil + case ModePerServer: + return NewPerServerCache(client, subsystem, ttl), nil + case ModeEval: + klog.Warningf("instance cache mode %q is for internal evaluation only and is not intended for production use", mode) + return NewEvalCache(client, subsystem, ttl), nil + case ModeOff: + return NewPassthrough(client), nil + } + return nil, fmt.Errorf("invalid cache mode %q", mode) +} + +// ----- Passthrough ----- + +// Passthrough is a [ServerCache] that always queries the API. + +var _ ServerCache = (*Passthrough)(nil) + +type Passthrough struct { + client *hcloud.Client +} + +func NewPassthrough(client *hcloud.Client) *Passthrough { + return &Passthrough{client: client} +} + +func (c *Passthrough) ByID(ctx context.Context, id int64) (*hcloud.Server, error) { + server, _, err := c.client.Server.GetByID(ctx, id) + return server, err +} + +func (c *Passthrough) ByName(ctx context.Context, name string) (*hcloud.Server, error) { + server, _, err := c.client.Server.GetByName(ctx, name) + return server, err +} diff --git a/internal/servercache/servercache_test.go b/internal/servercache/servercache_test.go new file mode 100644 index 00000000..d1711754 --- /dev/null +++ b/internal/servercache/servercache_test.go @@ -0,0 +1,161 @@ +package servercache + +import ( + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/hetznercloud/hcloud-go/v2/hcloud" + "github.com/hetznercloud/hcloud-go/v2/hcloud/exp/mockutil" + "github.com/hetznercloud/hcloud-go/v2/hcloud/schema" +) + +// newCacheTestClient spins up a [mockutil.Server] and returns a client pointed +// at it. +func newCacheTestClient(t *testing.T, requests []mockutil.Request) *hcloud.Client { + t.Helper() + server := mockutil.NewServer(t, requests) + return hcloud.NewClient( + hcloud.WithEndpoint(server.URL), + hcloud.WithPollOpts(hcloud.PollOpts{BackoffFunc: hcloud.ConstantBackoff(0)}), + hcloud.WithRetryOpts(hcloud.RetryOpts{BackoffFunc: hcloud.ConstantBackoff(0)}), + ) +} + +var notFoundResponse = schema.ErrorResponse{Error: schema.Error{Code: string(hcloud.ErrorCodeNotFound), Message: "not found"}} + +func TestNew(t *testing.T) { + client := hcloud.NewClient() + + tests := []struct { + name string + mode Mode + wantType any + wantErr bool + }{ + {name: "all-server", mode: ModeAllServers, wantType: (*AllServerCache)(nil)}, + {name: "per-server", mode: ModePerServer, wantType: (*PerServerCache)(nil)}, + {name: "eval", mode: ModeEval, wantType: (*EvalCache)(nil)}, + {name: "off", mode: ModeOff, wantType: (*Passthrough)(nil)}, + {name: "invalid", mode: Mode("bogus"), wantErr: true}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cache, err := New(client, "instances_v2", tt.mode, time.Minute) + if tt.wantErr { + require.Error(t, err) + assert.Nil(t, cache) + return + } + require.NoError(t, err) + require.NotNil(t, cache) + assert.IsType(t, tt.wantType, cache) + }) + } +} + +func TestPassthrough_ByID(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, + }, + // Passthrough never caches, so a second lookup must hit the API again. + { + Method: "GET", Path: "/servers/1", Status: http.StatusOK, + JSON: schema.ServerGetResponse{Server: schema.Server{ID: 1, Name: "server-1"}}, + }, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, int64(1), srv.ID) + + srv, err = cache.ByID(t.Context(), 1) + require.NoError(t, err) + require.NotNil(t, srv) +} + +func TestPassthrough_ByID_NotFound(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + {Method: "GET", Path: "/servers/42", Status: http.StatusNotFound, JSON: notFoundResponse}, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByID(t.Context(), 42) + require.NoError(t, err) + assert.Nil(t, srv) +} + +func TestPassthrough_ByID_APIError(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers/1", + Status: http.StatusInternalServerError, + JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, + }, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByID(t.Context(), 1) + require.Error(t, err) + assert.Nil(t, srv) +} + +func TestPassthrough_ByName(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers?name=server-1", Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + { + Method: "GET", Path: "/servers?name=server-1", Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{{ID: 1, Name: "server-1"}}}, + }, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) + assert.Equal(t, "server-1", srv.Name) + + srv, err = cache.ByName(t.Context(), "server-1") + require.NoError(t, err) + require.NotNil(t, srv) +} + +func TestPassthrough_ByName_NotFound(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers?name=missing", Status: http.StatusOK, + JSON: schema.ServerListResponse{Servers: []schema.Server{}}, + }, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByName(t.Context(), "missing") + require.NoError(t, err) + assert.Nil(t, srv) +} + +func TestPassthrough_ByName_APIError(t *testing.T) { + client := newCacheTestClient(t, []mockutil.Request{ + { + Method: "GET", Path: "/servers?name=server-1", + Status: http.StatusInternalServerError, + JSON: schema.ErrorResponse{Error: schema.Error{Code: "boom", Message: "upstream exploded"}}, + }, + }) + cache := NewPassthrough(client) + + srv, err := cache.ByName(t.Context(), "server-1") + require.Error(t, err) + assert.Nil(t, srv) +}