diff --git a/api/v1alpha1/common_types.go b/api/v1alpha1/common_types.go index cc4c9ddb..0681a6f0 100644 --- a/api/v1alpha1/common_types.go +++ b/api/v1alpha1/common_types.go @@ -193,14 +193,23 @@ type SidecarConfig struct { Resources *corev1.ResourceRequirements `json:"resources,omitempty"` // TLS, if set, fronts the sidecar API with kube-rbac-proxy on - // :8443 backed by a cert-manager-issued cert. Init-only: - // toggling on a Running SeiNode requires recreation. See seictl#165. + // :8443 backed by a cert-manager-issued cert. Toggling on a + // Running SeiNode triggers a NodeUpdate plan that provisions + // the Certificate + ConfigMap and cycles the pod. // +optional TLS *SidecarTLSSpec `json:"tls,omitempty"` } // SidecarTLSSpec configures the cert-manager-issued serving cert for // the kube-rbac-proxy fronting. +// +// SECURITY: IssuerName and IssuerKind select the trust anchor that +// signs the proxy's serving cert. They are operator-trusted input — +// the controller does not validate that the referenced Issuer is +// platform-blessed. Clusters running multiple chains should gate +// writes via a ValidatingAdmissionPolicy that pins IssuerName to the +// per-chain CA, since a spec edit otherwise can reroute the trust +// anchor for an already-Running node. type SidecarTLSSpec struct { // IssuerName references a cert-manager Issuer or ClusterIssuer // that signs the proxy's serving certificate. @@ -213,3 +222,19 @@ type SidecarTLSSpec struct { // +optional IssuerKind string `json:"issuerKind,omitempty"` } + +// SidecarTLSStatus is the observed sidecar TLS configuration. Nil means +// TLS is not enabled on the running pod. Stamped by the ObserveSidecarTLS +// task when a TLS-toggle NodeUpdate plan completes; compared against +// spec.sidecar.tls to detect drift. +// +// Reflects controller intent (StatefulSet rollout convergence) rather +// than pod attestation — i.e., what the controller asked for, not a +// proof of what each container reports. Downstream consumers should +// not treat this field as authoritative for trust decisions. +type SidecarTLSStatus struct { + IssuerName string `json:"issuerName"` + + // +kubebuilder:validation:Enum=Issuer;ClusterIssuer + IssuerKind string `json:"issuerKind"` +} diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index e8ad5035..57f2f823 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -316,6 +316,14 @@ type SeiNodeStatus struct { // +optional CurrentImage string `json:"currentImage,omitempty"` + // CurrentSidecarTLS captures the sidecar TLS configuration observed + // running on the owned StatefulSet. Stamped by the ObserveSidecarTLS + // task when a TLS-toggle NodeUpdate plan completes. Compared against + // spec.sidecar.tls to detect drift. Nil means TLS is not enabled on + // the running pod. + // +optional + CurrentSidecarTLS *SidecarTLSStatus `json:"currentSidecarTLS,omitempty"` + // +listType=map // +listMapKey=type // +optional diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index efa79705..a82824dc 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -957,6 +957,11 @@ func (in *SeiNodeSpec) DeepCopy() *SeiNodeSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SeiNodeStatus) DeepCopyInto(out *SeiNodeStatus) { *out = *in + if in.CurrentSidecarTLS != nil { + in, out := &in.CurrentSidecarTLS, &out.CurrentSidecarTLS + *out = new(SidecarTLSStatus) + **out = **in + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) @@ -1117,6 +1122,21 @@ func (in *SidecarTLSSpec) DeepCopy() *SidecarTLSSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SidecarTLSStatus) DeepCopyInto(out *SidecarTLSStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SidecarTLSStatus. +func (in *SidecarTLSStatus) DeepCopy() *SidecarTLSStatus { + if in == nil { + return nil + } + out := new(SidecarTLSStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SigningKeySource) DeepCopyInto(out *SigningKeySource) { *out = *in diff --git a/config/crd/sei.io_seinodedeployments.yaml b/config/crd/sei.io_seinodedeployments.yaml index 329ad8db..e2f7be4d 100644 --- a/config/crd/sei.io_seinodedeployments.yaml +++ b/config/crd/sei.io_seinodedeployments.yaml @@ -612,8 +612,9 @@ spec: tls: description: |- TLS, if set, fronts the sidecar API with kube-rbac-proxy on - :8443 backed by a cert-manager-issued cert. Init-only: - toggling on a Running SeiNode requires recreation. See seictl#165. + :8443 backed by a cert-manager-issued cert. Toggling on a + Running SeiNode triggers a NodeUpdate plan that provisions + the Certificate + ConfigMap and cycles the pod. properties: issuerKind: default: ClusterIssuer diff --git a/config/crd/sei.io_seinodes.yaml b/config/crd/sei.io_seinodes.yaml index dad2a1cb..1ee879b7 100644 --- a/config/crd/sei.io_seinodes.yaml +++ b/config/crd/sei.io_seinodes.yaml @@ -467,8 +467,9 @@ spec: tls: description: |- TLS, if set, fronts the sidecar API with kube-rbac-proxy on - :8443 backed by a cert-manager-issued cert. Init-only: - toggling on a Running SeiNode requires recreation. See seictl#165. + :8443 backed by a cert-manager-issued cert. Toggling on a + Running SeiNode triggers a NodeUpdate plan that provisions + the Certificate + ConfigMap and cycles the pod. properties: issuerKind: default: ClusterIssuer @@ -839,6 +840,25 @@ spec: Parent controllers compare this against spec.image to determine whether a spec change has been fully actuated. type: string + currentSidecarTLS: + description: |- + CurrentSidecarTLS captures the sidecar TLS configuration observed + running on the owned StatefulSet. Stamped by the ObserveSidecarTLS + task when a TLS-toggle NodeUpdate plan completes. Compared against + spec.sidecar.tls to detect drift. Nil means TLS is not enabled on + the running pod. + properties: + issuerKind: + enum: + - Issuer + - ClusterIssuer + type: string + issuerName: + type: string + required: + - issuerKind + - issuerName + type: object externalAddress: description: |- ExternalAddress is the routable P2P address (host:port) for this node, diff --git a/docs/design-seinode-sidecar-tls-toggle-lld.md b/docs/design-seinode-sidecar-tls-toggle-lld.md new file mode 100644 index 00000000..0cf0f2a8 --- /dev/null +++ b/docs/design-seinode-sidecar-tls-toggle-lld.md @@ -0,0 +1,231 @@ +# Design: SeiNode — Sidecar TLS Toggle on Running Nodes (LLD) + +**Status:** Draft / LLD +**Date:** 2026-05-15 +**Tracks:** prod TLS rollout (arctic-1, atlantic-2, pacific-1) +**Related:** sei-protocol/platform#545 (per-chain internal CA issuers), seictl#165 (init-only TLS gap) + +## 0. Background + +`spec.sidecar.tls` provisions a kube-rbac-proxy fronting the sei-sidecar API on `:8443`. Today it is only emitted by the **init plan** (`internal/planner/planner.go:531-535`): + +```go +if noderesource.SidecarTLSEnabled(node) { + prog = append(prog, task.TaskTypeApplySidecarCert, task.TaskTypeApplyRBACProxyConfig) +} +``` + +The Running-state drift detector (`internal/planner/planner.go:708-716`) checks only image drift + sidecar re-approval: + +```go +func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { + if node.Spec.Image != node.Status.CurrentImage { + return buildNodeUpdatePlan(node) + } + if sidecarNeedsReapproval(node) { + return buildMarkReadyPlan(node) + } + return nil, nil +} +``` + +Adding `spec.sidecar.tls` to a Running SeiNode is silently ignored. The current workaround is `kubectl delete seinode` per child to force the init plan via the parent SND — operationally hostile for fleet rollouts. + +## 0.1 Design choice: extend NodeUpdate, do not introduce a parallel plan + +The kube-rbac-proxy container shares a pod with `seid`. A TLS toggle requires the same pod cycle as an image rollout — `ApplyStatefulSet → ApplyService → ReplacePod` — differing only in pre-tasks (cert + configmap provisioning) and post-tasks (which `Observe*` stamps which `status.current*` field). There is no "cycle just the sidecar" path available today. + +A parallel `SidecarReprovision` plan + a separate `SidecarTLSToggleInProgress` condition were considered and rejected: +- The two plans share the entire pod-cycle middle. Duplication, not specialization. +- When both image and TLS drift simultaneously, an "image takes precedence" rule resolves TLS implicitly via `ApplyStatefulSet` regeneration but never stamps `status.currentSidecarTLS` — next reconcile fires a redundant second pod cycle. +- The `SidecarReprovision` name was speculative scaffolding for future `spec.sidecar` subfields with no concrete consumer today (YAGNI). +- Distinguishing image rollout vs. TLS toggle for observability is what `condition.reason` is for. + +Both drifts flow through one extended `buildNodeUpdatePlan`. One pod cycle. One condition. Reason discriminates cause. + +## 1. CRD schema changes + +One new optional status field. No spec changes — the trigger is already `spec.sidecar.tls`. + +```go +// api/v1alpha1/seinode_types.go — SeiNodeStatus + +// CurrentSidecarTLS captures the sidecar TLS configuration observed +// running on the owned StatefulSet. Updated when the NodeUpdate plan's +// ObserveSidecarTLS task completes. Compared against spec.sidecar.tls +// to detect drift. +// +optional +CurrentSidecarTLS *SidecarTLSStatus `json:"currentSidecarTLS,omitempty"` +``` + +```go +// api/v1alpha1/common_types.go + +// SidecarTLSStatus is the observed sidecar TLS configuration. Nil means +// TLS is not enabled on the running pod. +type SidecarTLSStatus struct { + IssuerName string `json:"issuerName"` + IssuerKind string `json:"issuerKind"` +} +``` + +No new condition constants. `ConditionNodeUpdateInProgress` covers both image rollout and TLS toggle. Reasons: + +| Reason | When | +|---------------------------------|--------------------------------------------| +| `UpdateStarted` | image drift only | +| `TLSToggleStarted` | TLS drift only | +| `UpdateAndTLSToggleStarted` | both drifts present | +| `UpdateComplete` / `UpdateFailed` | existing terminal reasons (unchanged) | + +Doc-comment correction on `SidecarConfig.TLS` (`api/v1alpha1/common_types.go:195-198`) — drop the "Init-only" claim. + +## 2. Drift detection + +Extend `buildRunningPlan` to compute both drift flags and dispatch once: + +```go +// internal/planner/planner.go + +func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { + imageDrift := node.Spec.Image != node.Status.CurrentImage + tlsDrift := sidecarTLSDrift(node) + if imageDrift || tlsDrift { + return buildNodeUpdatePlan(node, imageDrift, tlsDrift) + } + if sidecarNeedsReapproval(node) { + return buildMarkReadyPlan(node) + } + return nil, nil +} + +// sidecarTLSDrift reports whether spec.sidecar.tls and +// status.currentSidecarTLS disagree on enablement or Issuer. +// +// Enable-only for this PR: returns true when spec is set and current is +// nil or has mismatched issuer. spec=nil + current=set (disable) returns +// false here and is tracked separately (see §6). +func sidecarTLSDrift(node *seiv1alpha1.SeiNode) bool { + spec := node.Spec.Sidecar.GetTLS() // nil-safe accessor + if spec == nil { + return false + } + cur := node.Status.CurrentSidecarTLS + if cur == nil { + return true + } + return spec.IssuerName != cur.IssuerName || spec.IssuerKind != cur.IssuerKind +} +``` + +## 3. Plan shape — extended `buildNodeUpdatePlan` + +```go +// internal/planner/planner.go + +func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode, imageDrift, tlsDrift bool) (*seiv1alpha1.TaskPlan, error) { + setNodeUpdateCondition(node, metav1.ConditionTrue, + nodeUpdateReason(imageDrift, tlsDrift), + nodeUpdateMessage(node, imageDrift, tlsDrift)) + + var prog []string + if tlsDrift && noderesource.SidecarTLSEnabled(node) { + prog = append(prog, + task.TaskTypeApplySidecarCert, // SSA Certificate + task.TaskTypeWaitForSidecarTLSSecret, // poll until tls.crt populated + task.TaskTypeApplyRBACProxyConfig, // SSA ConfigMap + ) + } + prog = append(prog, + task.TaskTypeApplyStatefulSet, // full-spec regeneration + task.TaskTypeApplyService, // adds :8443 when TLS enabled + task.TaskTypeReplacePod, // single pod cycle covers both + ) + if imageDrift { + prog = append(prog, task.TaskTypeObserveImage) + } + if tlsDrift { + prog = append(prog, task.TaskTypeObserveSidecarTLS) + } + prog = append(prog, sidecar.TaskTypeMarkReady) + + planID := uuid.New().String() + tasks := make([]seiv1alpha1.PlannedTask, len(prog)) + for i, taskType := range prog { + t, err := buildPlannedTask(planID, taskType, i, paramsForTaskType(node, taskType, nil, nil)) + if err != nil { + return nil, err + } + tasks[i] = t + } + return &seiv1alpha1.TaskPlan{ + ID: planID, + Phase: seiv1alpha1.TaskPlanActive, + Tasks: tasks, + TargetPhase: seiv1alpha1.PhaseRunning, + // FailedPhase empty: failure retries, does not transition out of Running. + }, nil +} +``` + +`nodeUpdateReason` returns `UpdateStarted` / `TLSToggleStarted` / `UpdateAndTLSToggleStarted` based on the flags. `nodeUpdateMessage` includes spec/current diffs for whichever drift(s) fired. + +`WaitForSidecarTLSSecret` is **load-bearing**. Cert-manager issuance is async (seconds to minutes). Without the wait, `ApplyStatefulSet` schedules a pod whose proxy container references a not-yet-existing Secret; kubelet fails the mount, backoff escalates to 5min, the pod looks broken to operators. With the wait, the StatefulSet apply happens only when the Secret is present. + +Co-drift case (image + TLS in the same edit): both blocks of pre-tasks (none for image) and post-tasks (`ObserveImage` + `ObserveSidecarTLS`) run in one plan. One pod cycle. Both `status.currentImage` and `status.currentSidecarTLS` get stamped before the plan terminates. No redundant second reconcile. + +`classifyPlan` (`internal/planner/planner.go:209`) detects on `TaskTypeObserveImage` today. Extend it to also detect `TaskTypeObserveSidecarTLS` so plan-duration metrics get a `node-update-tls` or `node-update-tls+image` label rather than mislabeling TLS-only plans as `init`. + +## 4. New task: `WaitForSidecarTLSSecret` + +```go +// internal/task/wait_for_sidecar_tls_secret.go + +const TaskTypeWaitForSidecarTLSSecret = "wait-for-sidecar-tls-secret" + +type WaitForSidecarTLSSecretParams struct { + NodeName string `json:"nodeName"` + Namespace string `json:"namespace"` +} + +// Execute polls until the cert-manager-managed Secret exists and carries +// non-empty tls.crt. Returns ErrTransient until ready; the executor +// retries with the standard backoff. Same task is reusable for future +// cert-rotation flows. +``` + +Idempotent. Cheap. Reads only — no mutations. Registered in `internal/task/task.go` deserializer map. + +## 5. New task: `ObserveSidecarTLS` + +Symmetric with `ObserveImage` (which stamps `status.currentImage`). Verifies the live pod's containers include `kube-rbac-proxy` when TLS is enabled, then stamps `status.currentSidecarTLS` to match `spec.sidecar.tls`. `handleTerminalPlan` clears `ConditionNodeUpdateInProgress` on plan completion (existing path — no change). + +```go +// internal/task/observe_sidecar_tls.go + +const TaskTypeObserveSidecarTLS = "observe-sidecar-tls" +``` + +Registered in `internal/task/task.go` deserializer map. + +## 6. Out of scope (deferred, tracked) + +- **Disable path** (`tls: set → nil`): adds `DeleteSidecarCert` + `DeleteRBACProxyConfig` cleanup tasks and an additional `sidecarTLSDrift` branch for the `spec=nil, current!=nil` case. ~30 LOC. Deferred: immediate prod use case is enable-only (arctic-1/atlantic-2/pacific-1 onto the new per-chain CAs). Risk: `spec.sidecar.tls = nil` on a TLS-enabled Running node is currently silently ignored (drift detector skips it); the Cert + ConfigMap persist until SeiNode delete cascades them via owner-ref. Tracked as follow-up issue. +- **Generalize drift detector to other `spec.sidecar` subfields** (Image, Port, Resources): today none triggers any drift handler. Adding them is a `tlsDrift`-style flag pattern — no plan-shape changes needed. +- **Issuer swap**: covered organically by §2 (`IssuerName != cur.IssuerName` triggers a NodeUpdate plan). Tested but not foregrounded. + +## 7. Cross-cutting (call out, not in this PR) + +- **SND fleet blast radius**: enabling TLS on an SND template triggers N concurrent SeiNode reconciles. The SND's existing `maxUnavailable` orchestration applies — this design doesn't change that surface. Confirm prod archive SND has appropriate maxUnavailable before fleet rollout. +- **cert-manager throughput**: N pods issuing certs from one CA Issuer is fine (per-chain CA Issuer is local, instant). Would matter for ACME, not for the design landed in platform#545. + +## 8. Verification + +- Unit: `sidecarTLSDrift` matrix over `(spec=nil/set, current=nil/set, issuer match/mismatch)`; spec-nil rows return false (disable deferred). +- Unit: plan ordering — `WaitForSidecarTLSSecret` between `ApplySidecarCert` and `ApplyStatefulSet`; `ObserveImage` / `ObserveSidecarTLS` correctly gated on flags. +- Unit: co-drift case — `buildNodeUpdatePlan(node, true, true)` emits cert pre-tasks, both observers, single pod cycle. +- Unit: condition `reason` discrimination across the three drift combinations. +- Unit: idempotency — re-entering the plan mid-flight after `ApplyStatefulSet` is safe (tasks are SSA). +- Unit: `classifyPlan` correctly labels TLS-only and TLS+image plans for metrics. +- envtest: full reconcile from spec edit → plan persisted → tasks executed → pod recreated with proxy container → `currentSidecarTLS` stamped → condition cleared. +- End-to-end: enable TLS on a harbor test chain's running SeiNode via spec edit; assert pod cycles cleanly and serves on `:8443`; assert subsequent reconciles are no-ops. \ No newline at end of file diff --git a/internal/planner/node_update_tls_test.go b/internal/planner/node_update_tls_test.go new file mode 100644 index 00000000..33ebd33d --- /dev/null +++ b/internal/planner/node_update_tls_test.go @@ -0,0 +1,260 @@ +package planner + +import ( + "testing" + + . "github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/task" +) + +const ( + testIssuer = "sei-internal-ca" + testIssuerAlt = "sei-internal-ca-rotated" + testKind = "ClusterIssuer" +) + +func tlsSpecOf(issuer, kind string) *seiv1alpha1.SidecarTLSSpec { + return &seiv1alpha1.SidecarTLSSpec{IssuerName: issuer, IssuerKind: kind} +} + +func tlsStatus() *seiv1alpha1.SidecarTLSStatus { + return &seiv1alpha1.SidecarTLSStatus{IssuerName: testIssuer, IssuerKind: testKind} +} + +func withTLSSpec(node *seiv1alpha1.SeiNode, spec *seiv1alpha1.SidecarTLSSpec) *seiv1alpha1.SeiNode { + if node.Spec.Sidecar == nil { + node.Spec.Sidecar = &seiv1alpha1.SidecarConfig{} + } + node.Spec.Sidecar.TLS = spec + return node +} + +// --- sidecarTLSDrift matrix --- + +func TestSidecarTLSDrift_SpecNil_ReturnsFalse(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + // spec.sidecar.tls == nil, status.currentSidecarTLS == nil + g.Expect(sidecarTLSDrift(node)).To(BeFalse()) +} + +func TestSidecarTLSDrift_DisableCase_ReturnsFalse(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + // Deferred per LLD §6: spec=nil, current=set should NOT trigger drift. + node.Status.CurrentSidecarTLS = tlsStatus() + g.Expect(sidecarTLSDrift(node)).To(BeFalse(), + "disable path is deferred — spec.sidecar.tls=nil must not trigger drift") +} + +func TestSidecarTLSDrift_EnableCase_ReturnsTrue(t *testing.T) { + g := NewWithT(t) + node := withTLSSpec(runningFullNode(), tlsSpecOf(testIssuer, testKind)) + // status.currentSidecarTLS == nil — initial enable + g.Expect(sidecarTLSDrift(node)).To(BeTrue()) +} + +func TestSidecarTLSDrift_IssuerSwap_ReturnsTrue(t *testing.T) { + g := NewWithT(t) + node := withTLSSpec(runningFullNode(), tlsSpecOf(testIssuerAlt, testKind)) + node.Status.CurrentSidecarTLS = tlsStatus() + g.Expect(sidecarTLSDrift(node)).To(BeTrue()) +} + +func TestSidecarTLSDrift_KindSwap_ReturnsTrue(t *testing.T) { + g := NewWithT(t) + node := withTLSSpec(runningFullNode(), tlsSpecOf(testIssuer, "Issuer")) + node.Status.CurrentSidecarTLS = tlsStatus() + g.Expect(sidecarTLSDrift(node)).To(BeTrue()) +} + +func TestSidecarTLSDrift_Converged_ReturnsFalse(t *testing.T) { + g := NewWithT(t) + node := withTLSSpec(runningFullNode(), tlsSpecOf(testIssuer, testKind)) + node.Status.CurrentSidecarTLS = tlsStatus() + g.Expect(sidecarTLSDrift(node)).To(BeFalse()) +} + +// --- buildRunningPlan with TLS drift --- + +func TestBuildRunningPlan_TLSDriftOnly_ReturnsTLSTogglePlan(t *testing.T) { + g := NewWithT(t) + node := withTLSSpec(runningFullNode(), tlsSpecOf(testIssuer, testKind)) + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + g.Expect(plan.Phase).To(Equal(seiv1alpha1.TaskPlanActive)) + g.Expect(plan.TargetPhase).To(Equal(seiv1alpha1.PhaseRunning)) + g.Expect(string(plan.FailedPhase)).To(BeEmpty()) + + g.Expect(planTaskTypes(plan)).To(Equal([]string{ + task.TaskTypeApplySidecarCert, + task.TaskTypeWaitForSidecarTLSSecret, + task.TaskTypeApplyRBACProxyConfig, + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + task.TaskTypeReplacePod, + task.TaskTypeObserveSidecarTLS, + TaskMarkReady, + })) + + cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) + g.Expect(cond.Reason).To(Equal("TLSToggleStarted")) + g.Expect(cond.Message).To(ContainSubstring("sidecar.tls drift detected")) + g.Expect(cond.Message).To(ContainSubstring("issuerName=" + testIssuer)) + g.Expect(cond.Message).To(ContainSubstring("issuerKind=" + testKind)) + g.Expect(cond.Message).To(ContainSubstring("current=none")) +} + +func TestBuildRunningPlan_ImageAndTLSDrift_SingleCombinedPlan(t *testing.T) { + g := NewWithT(t) + node := withTLSSpec(runningFullNode(), tlsSpecOf(testIssuer, testKind)) + node.Spec.Image = testImageV2 // also bump image + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + g.Expect(planTaskTypes(plan)).To(Equal([]string{ + task.TaskTypeApplySidecarCert, + task.TaskTypeWaitForSidecarTLSSecret, + task.TaskTypeApplyRBACProxyConfig, + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + task.TaskTypeObserveSidecarTLS, + TaskMarkReady, + }), "both observers run in one plan; one pod cycle covers both drifts") + + cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Reason).To(Equal("UpdateAndTLSToggleStarted")) + g.Expect(cond.Message).To(ContainSubstring("image drift detected")) + g.Expect(cond.Message).To(ContainSubstring("sidecar.tls drift detected")) +} + +func TestBuildRunningPlan_ImageDrift_NoTLSPreTasks(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + node.Spec.Image = testImageV2 // image drift only, no TLS spec + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + // Existing image-only behavior preserved. + g.Expect(planTaskTypes(plan)).To(Equal([]string{ + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + })) + + cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) + g.Expect(cond.Reason).To(Equal("UpdateStarted")) +} + +func TestBuildRunningPlan_TLSConverged_NoPlan(t *testing.T) { + g := NewWithT(t) + node := withTLSSpec(runningFullNode(), tlsSpecOf(testIssuer, testKind)) + node.Status.CurrentSidecarTLS = tlsStatus() + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).To(BeNil(), "no drift → no plan") +} + +// --- nodeUpdateReason discrimination --- + +func TestNodeUpdateReason_AllCombinations(t *testing.T) { + g := NewWithT(t) + g.Expect(nodeUpdateReason(true, false)).To(Equal("UpdateStarted")) + g.Expect(nodeUpdateReason(false, true)).To(Equal("TLSToggleStarted")) + g.Expect(nodeUpdateReason(true, true)).To(Equal("UpdateAndTLSToggleStarted")) +} + +// --- classifyPlan labels --- + +func TestClassifyPlan_TLSToggle(t *testing.T) { + g := NewWithT(t) + node := withTLSSpec(runningFullNode(), tlsSpecOf(testIssuer, testKind)) + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(classifyPlan(plan)).To(Equal("tls-toggle")) +} + +func TestClassifyPlan_NodeUpdateWithTLS(t *testing.T) { + g := NewWithT(t) + node := withTLSSpec(runningFullNode(), tlsSpecOf(testIssuer, testKind)) + node.Spec.Image = testImageV2 + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(classifyPlan(plan)).To(Equal("node-update-tls")) +} + +func TestClassifyPlan_NodeUpdateImageOnly(t *testing.T) { + g := NewWithT(t) + node := runningFullNode() + node.Spec.Image = testImageV2 + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(classifyPlan(plan)).To(Equal("node-update"), + "image-only plan keeps the existing label so dashboards do not break") +} + +// --- idempotency on plan rebuild --- + +func TestBuildRunningPlan_TLSDrift_UniqueIDs(t *testing.T) { + g := NewWithT(t) + node := withTLSSpec(runningFullNode(), tlsSpecOf(testIssuer, testKind)) + + plan1, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + plan2, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + + g.Expect(plan1.ID).NotTo(Equal(plan2.ID)) + seen := map[string]bool{} + for _, tsk := range plan1.Tasks { + g.Expect(seen[tsk.ID]).To(BeFalse(), "duplicate task ID: %s", tsk.ID) + seen[tsk.ID] = true + } +} + +// --- ResolvePlan condition lifecycle on TLS toggle --- + +func TestResolvePlan_TLSToggle_SetsAndClearsCondition(t *testing.T) { + g := NewWithT(t) + node := withTLSSpec(runningFullNode(), tlsSpecOf(testIssuer, testKind)) + + err := (&NodeResolver{}).ResolvePlan(t.Context(), node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(node.Status.Plan).NotTo(BeNil()) + + cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) + g.Expect(cond.Reason).To(Equal("TLSToggleStarted")) + + // Simulate plan completion and observer stamping CurrentSidecarTLS. + node.Status.Plan.Phase = seiv1alpha1.TaskPlanComplete + node.Status.CurrentSidecarTLS = tlsStatus() + + err = (&NodeResolver{}).ResolvePlan(t.Context(), node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(node.Status.Plan).To(BeNil(), "completed plan should be cleared") + + cond = meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal("UpdateComplete")) +} diff --git a/internal/planner/planner.go b/internal/planner/planner.go index d6fe7456..0a1c00f4 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -206,16 +206,32 @@ func setNodeUpdateCondition(node *seiv1alpha1.SeiNode, status metav1.ConditionSt }) } -// classifyPlan returns the plan type for metrics. +// classifyPlan returns the plan type for metrics. NodeUpdate plans are +// labeled by which drift fired (image, tls, or both) so dashboards can +// split on the trigger without parsing condition reasons. func classifyPlan(plan *seiv1alpha1.TaskPlan) string { + var hasImage, hasTLS, hasInit bool for _, t := range plan.Tasks { switch t.Type { case task.TaskTypeObserveImage: - return "node-update" + hasImage = true + case task.TaskTypeObserveSidecarTLS: + hasTLS = true case task.TaskTypeEnsureDataPVC: - return "init" + hasInit = true } } + if hasInit { + return "init" + } + switch { + case hasImage && hasTLS: + return "node-update-tls" + case hasTLS: + return "tls-toggle" + case hasImage: + return "node-update" + } if len(plan.Tasks) == 1 && plan.Tasks[0].Type == sidecar.TaskTypeMarkReady { return "mark-ready-reapply" } @@ -576,10 +592,14 @@ func paramsForTaskType( return &task.ApplyRBACProxyConfigParams{NodeName: node.Name, Namespace: node.Namespace} case task.TaskTypeApplySidecarCert: return &task.ApplySidecarCertParams{NodeName: node.Name, Namespace: node.Namespace} + case task.TaskTypeWaitForSidecarTLSSecret: + return &task.WaitForSidecarTLSSecretParams{NodeName: node.Name, Namespace: node.Namespace} case task.TaskTypeReplacePod: return &task.ReplacePodParams{NodeName: node.Name, Namespace: node.Namespace} case task.TaskTypeObserveImage: return &task.ObserveImageParams{NodeName: node.Name, Namespace: node.Namespace} + case task.TaskTypeObserveSidecarTLS: + return &task.ObserveSidecarTLSParams{NodeName: node.Name, Namespace: node.Namespace} case task.TaskTypeValidateSigningKey: return validateSigningKeyParams(node) case task.TaskTypeValidateNodeKey: @@ -703,11 +723,13 @@ func commonOverrides(node *seiv1alpha1.SeiNode) map[string]string { } // buildRunningPlan returns a steady-state drift plan, or nil if no drift. -// Image drift is checked first — its plan ends with MarkReady, so it also -// resolves any stale sidecar. +// Image and TLS drift co-exist in one NodeUpdate plan so a single pod +// cycle resolves both. func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { - if node.Spec.Image != node.Status.CurrentImage { - return buildNodeUpdatePlan(node) + imageDrift := node.Spec.Image != node.Status.CurrentImage + tlsDrift := sidecarTLSDrift(node) + if imageDrift || tlsDrift { + return buildNodeUpdatePlan(node, imageDrift, tlsDrift) } if sidecarNeedsReapproval(node) { return buildMarkReadyPlan(node) @@ -715,6 +737,25 @@ func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) return nil, nil } +// sidecarTLSDrift reports whether spec.sidecar.tls and +// status.currentSidecarTLS disagree on enablement or Issuer. +// +// Enable-only: returns true when spec is set and current is nil or has +// mismatched issuer. spec=nil + current!=nil (disable) returns false +// here — disabling TLS on a Running node is silently ignored until the +// disable-path tasks land. See docs/design-seinode-sidecar-tls-toggle-lld.md §6. +func sidecarTLSDrift(node *seiv1alpha1.SeiNode) bool { + if node.Spec.Sidecar == nil || node.Spec.Sidecar.TLS == nil { + return false + } + spec := node.Spec.Sidecar.TLS + cur := node.Status.CurrentSidecarTLS + if cur == nil { + return true + } + return spec.IssuerName != cur.IssuerName || spec.IssuerKind != cur.IssuerKind +} + func sidecarNeedsReapproval(node *seiv1alpha1.SeiNode) bool { cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionSidecarReady) return cond != nil && cond.Status == metav1.ConditionFalse && cond.Reason == "NotReady" @@ -734,23 +775,48 @@ func buildMarkReadyPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error }, nil } -// buildNodeUpdatePlan constructs a plan to roll out an image update on a -// Running node. The plan applies the new StatefulSet spec, waits for the -// rollout to complete, then re-initializes the sidecar. +// buildNodeUpdatePlan constructs a plan to roll out an image and/or +// sidecar TLS change on a Running node. The plan optionally provisions +// the kube-rbac-proxy Certificate + ConfigMap (TLS drift), applies the +// regenerated StatefulSet spec, cycles the pod, observes whichever +// fields drifted, then re-initializes the sidecar. At least one of +// imageDrift or tlsDrift must be true. +// +// Task order is load-bearing: ApplyService runs before ReplacePod so +// the headless Service advertises :8443 by the time the new pod with +// kube-rbac-proxy is endpoint-eligible. The reverse order would create +// a window where the Service exposes a port the live pod doesn't bind. // // FailedPhase is deliberately empty: a failure retries on the next reconcile // rather than transitioning the node out of Running. -func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { - setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", - fmt.Sprintf("image drift detected: spec=%s current=%s", node.Spec.Image, node.Status.CurrentImage)) - - prog := []string{ +func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode, imageDrift, tlsDrift bool) (*seiv1alpha1.TaskPlan, error) { + setNodeUpdateCondition(node, metav1.ConditionTrue, + nodeUpdateReason(imageDrift, tlsDrift), + nodeUpdateMessage(node, imageDrift, tlsDrift)) + + var prog []string + if tlsDrift && noderesource.SidecarTLSEnabled(node) { + // Emit Cert + ConfigMap before the StatefulSet apply so the + // pod schedules with a populated Secret. cert-manager issuance + // is async; the wait task gates StatefulSet apply on it. + prog = append(prog, + task.TaskTypeApplySidecarCert, + task.TaskTypeWaitForSidecarTLSSecret, + task.TaskTypeApplyRBACProxyConfig, + ) + } + prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, task.TaskTypeReplacePod, - task.TaskTypeObserveImage, - sidecar.TaskTypeMarkReady, + ) + if imageDrift { + prog = append(prog, task.TaskTypeObserveImage) + } + if tlsDrift { + prog = append(prog, task.TaskTypeObserveSidecarTLS) } + prog = append(prog, sidecar.TaskTypeMarkReady) planID := uuid.New().String() tasks := make([]seiv1alpha1.PlannedTask, len(prog)) @@ -769,6 +835,63 @@ func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, erro }, nil } +// Condition reasons emitted for NodeUpdateInProgress. Discriminates the +// trigger so dashboards keyed on the condition can split image rollout, +// TLS toggle, and co-drift without parsing the message. +const ( + reasonUpdateStarted = "UpdateStarted" + reasonTLSToggleStarted = "TLSToggleStarted" + reasonUpdateAndTLSToggleStarted = "UpdateAndTLSToggleStarted" +) + +// nodeUpdateReason returns the condition reason for a NodeUpdate plan, +// discriminating between image drift, TLS drift, and the co-drift case. +func nodeUpdateReason(imageDrift, tlsDrift bool) string { + switch { + case imageDrift && tlsDrift: + return reasonUpdateAndTLSToggleStarted + case tlsDrift: + return reasonTLSToggleStarted + default: + return reasonUpdateStarted + } +} + +func nodeUpdateMessage(node *seiv1alpha1.SeiNode, imageDrift, tlsDrift bool) string { + switch { + case imageDrift && tlsDrift: + return fmt.Sprintf("image drift detected: spec=%s current=%s; sidecar.tls drift detected: spec=%s current=%s", + node.Spec.Image, node.Status.CurrentImage, + formatTLSSpec(node.Spec.Sidecar.TLS), + formatTLSStatus(node.Status.CurrentSidecarTLS)) + case tlsDrift: + return fmt.Sprintf("sidecar.tls drift detected: spec=%s current=%s", + formatTLSSpec(node.Spec.Sidecar.TLS), + formatTLSStatus(node.Status.CurrentSidecarTLS)) + default: + return fmt.Sprintf("image drift detected: spec=%s current=%s", + node.Spec.Image, node.Status.CurrentImage) + } +} + +// formatTLSSpec renders a *SidecarTLSSpec for operator-readable +// condition messages. Avoids %v's Go-struct repr ("&{ca ClusterIssuer}"). +func formatTLSSpec(s *seiv1alpha1.SidecarTLSSpec) string { + if s == nil { + return "none" + } + return fmt.Sprintf("{issuerName=%s issuerKind=%s}", s.IssuerName, s.IssuerKind) +} + +// formatTLSStatus renders a *SidecarTLSStatus for operator-readable +// condition messages. +func formatTLSStatus(s *seiv1alpha1.SidecarTLSStatus) string { + if s == nil { + return "none" + } + return fmt.Sprintf("{issuerName=%s issuerKind=%s}", s.IssuerName, s.IssuerKind) +} + // mergeOverrides combines controller-generated overrides with user-specified // overrides. User overrides take precedence. func mergeOverrides(controllerOverrides, userOverrides map[string]string) map[string]string { diff --git a/internal/task/observe_sidecar_tls.go b/internal/task/observe_sidecar_tls.go new file mode 100644 index 00000000..6b978abe --- /dev/null +++ b/internal/task/observe_sidecar_tls.go @@ -0,0 +1,98 @@ +package task + +import ( + "context" + "encoding/json" + "fmt" + + appsv1 "k8s.io/api/apps/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" +) + +const TaskTypeObserveSidecarTLS = "observe-sidecar-tls" + +type ObserveSidecarTLSParams struct { + NodeName string `json:"nodeName"` + Namespace string `json:"namespace"` +} + +type observeSidecarTLSExecution struct { + taskBase + params ObserveSidecarTLSParams + cfg ExecutionConfig +} + +func deserializeObserveSidecarTLS(id string, params json.RawMessage, cfg ExecutionConfig) (TaskExecution, error) { + var p ObserveSidecarTLSParams + if len(params) > 0 { + if err := json.Unmarshal(params, &p); err != nil { + return nil, fmt.Errorf("deserializing observe-sidecar-tls params: %w", err) + } + } + return &observeSidecarTLSExecution{ + taskBase: taskBase{id: id, status: ExecutionRunning}, + params: p, + cfg: cfg, + }, nil +} + +// Execute polls the StatefulSet rollout. Once converged, stamps +// status.currentSidecarTLS to mirror spec.sidecar.tls so the planner's +// drift detector observes the toggle as applied. Mirrors ObserveImage: +// the StatefulSet template is the source of truth for pod composition, +// so rollout convergence implies the live pod matches. +// +// Why rollout convergence is sufficient on a TLS-only toggle: enabling +// (or rotating the Issuer for) TLS changes the pod template — new +// Secret volume, new kube-rbac-proxy container, new :8443 port. The +// template change bumps the StatefulSet's controller-revision hash, +// so Status.UpdatedReplicas dips below *Spec.Replicas until the new +// revision rolls. The check thus reliably gates on the new template +// being live, same as ObserveImage gates on an image-change rollout. +func (e *observeSidecarTLSExecution) Execute(ctx context.Context) error { + node, err := ResourceAs[*seiv1alpha1.SeiNode](e.cfg) + if err != nil { + return Terminal(err) + } + + sts := &appsv1.StatefulSet{} + key := types.NamespacedName{Name: node.Name, Namespace: node.Namespace} + if err := e.cfg.APIReader.Get(ctx, key, sts); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("getting statefulset: %w", err) + } + + if sts.Status.ObservedGeneration < sts.Generation { + return nil + } + if sts.Spec.Replicas == nil || sts.Status.UpdatedReplicas < *sts.Spec.Replicas { + return nil + } + + node.Status.CurrentSidecarTLS = sidecarTLSStatusFromSpec(node) + e.complete() + return nil +} + +func (e *observeSidecarTLSExecution) Status(_ context.Context) ExecutionStatus { + return e.DefaultStatus() +} + +// sidecarTLSStatusFromSpec returns the SidecarTLSStatus mirroring the +// node's spec.sidecar.tls, or nil if TLS is not enabled. Centralizes +// the spec → status projection so the observer and any future caller +// agree on the shape. +func sidecarTLSStatusFromSpec(node *seiv1alpha1.SeiNode) *seiv1alpha1.SidecarTLSStatus { + if node.Spec.Sidecar == nil || node.Spec.Sidecar.TLS == nil { + return nil + } + return &seiv1alpha1.SidecarTLSStatus{ + IssuerName: node.Spec.Sidecar.TLS.IssuerName, + IssuerKind: node.Spec.Sidecar.TLS.IssuerKind, + } +} diff --git a/internal/task/observe_sidecar_tls_test.go b/internal/task/observe_sidecar_tls_test.go new file mode 100644 index 00000000..3d30ddc4 --- /dev/null +++ b/internal/task/observe_sidecar_tls_test.go @@ -0,0 +1,163 @@ +package task + +import ( + "context" + "encoding/json" + "testing" + + . "github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" +) + +// testClusterIssuerKind is the kube-rbac-proxy default IssuerKind used +// across the task-level TLS tests in this package. +const testClusterIssuerKind = "ClusterIssuer" + +func observeTLSNode(withTLS bool) *seiv1alpha1.SeiNode { + n := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testReplaceSTS, Namespace: testReplaceNs}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: opkChainID, + Image: opkImage, + FullNode: &seiv1alpha1.FullNodeSpec{}, + }, + Status: seiv1alpha1.SeiNodeStatus{Phase: seiv1alpha1.PhaseRunning}, + } + if withTLS { + n.Spec.Sidecar = &seiv1alpha1.SidecarConfig{ + TLS: &seiv1alpha1.SidecarTLSSpec{IssuerName: "ca", IssuerKind: testClusterIssuerKind}, + } + } + return n +} + +func observeTLSCfg(t *testing.T, node *seiv1alpha1.SeiNode, sts *appsv1.StatefulSet) ExecutionConfig { + t.Helper() + s := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(s); err != nil { + t.Fatal(err) + } + if err := seiv1alpha1.AddToScheme(s); err != nil { + t.Fatal(err) + } + builder := fake.NewClientBuilder().WithScheme(s) + if sts != nil { + builder = builder.WithObjects(sts) + } + c := builder.Build() + return ExecutionConfig{ + KubeClient: c, + APIReader: c, + Scheme: s, + Resource: node, + } +} + +func newObserveTLSExec(t *testing.T, cfg ExecutionConfig) TaskExecution { + t.Helper() + params := ObserveSidecarTLSParams{NodeName: testReplaceSTS, Namespace: testReplaceNs} + raw, _ := json.Marshal(params) + exec, err := deserializeObserveSidecarTLS("obs-tls-1", raw, cfg) + if err != nil { + t.Fatal(err) + } + return exec +} + +func TestObserveSidecarTLS_RolloutComplete_StampsStatus(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode(true) + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ObservedGeneration: 2, UpdatedReplicas: 1, Replicas: 1}, + } + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, sts)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionComplete)) + g.Expect(node.Status.CurrentSidecarTLS).NotTo(BeNil()) + g.Expect(node.Status.CurrentSidecarTLS.IssuerName).To(Equal("ca")) + g.Expect(node.Status.CurrentSidecarTLS.IssuerKind).To(Equal("ClusterIssuer")) +} + +func TestObserveSidecarTLS_RollingUpdate_StaysRunning(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode(true) + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ObservedGeneration: 2, UpdatedReplicas: 0, Replicas: 1}, + } + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, sts)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) + g.Expect(node.Status.CurrentSidecarTLS).To(BeNil(), "status must not be stamped before rollout converges") +} + +func TestObserveSidecarTLS_StaleGeneration_StaysRunning(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode(true) + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 3}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ObservedGeneration: 2, UpdatedReplicas: 1, Replicas: 1}, + } + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, sts)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) + g.Expect(node.Status.CurrentSidecarTLS).To(BeNil()) +} + +func TestObserveSidecarTLS_StatefulSetNotFound_StaysRunning(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode(true) + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, nil)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) +} + +func TestObserveSidecarTLS_SpecNilOnConverge_StampsNil(t *testing.T) { + // Forward-compat: if a future disable-path plan reaches this observer + // with spec.sidecar.tls == nil, the observer must stamp nil, not a + // stale issuer. Exercises the helper. + g := NewWithT(t) + node := observeTLSNode(false) // no TLS spec + // Seed a stale status to verify it gets cleared. + node.Status.CurrentSidecarTLS = &seiv1alpha1.SidecarTLSStatus{IssuerName: "stale", IssuerKind: "ClusterIssuer"} + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ObservedGeneration: 2, UpdatedReplicas: 1, Replicas: 1}, + } + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, sts)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionComplete)) + g.Expect(node.Status.CurrentSidecarTLS).To(BeNil(), + "observer must mirror spec — nil spec stamps nil status") +} + +func TestObserveSidecarTLS_DeserializeEmptyParams(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode(true) + cfg := observeTLSCfg(t, node, nil) + + exec, err := deserializeObserveSidecarTLS("obs-empty", nil, cfg) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(exec).NotTo(BeNil()) +} diff --git a/internal/task/task.go b/internal/task/task.go index cc03c27a..69020762 100644 --- a/internal/task/task.go +++ b/internal/task/task.go @@ -220,8 +220,10 @@ var registry = map[string]taskDeserializer{ TaskTypeApplyService: deserializeApplyService, TaskTypeApplyRBACProxyConfig: deserializeApplyRBACProxyConfig, TaskTypeApplySidecarCert: deserializeApplySidecarCert, + TaskTypeWaitForSidecarTLSSecret: deserializeWaitForSidecarTLSSecret, TaskTypeReplacePod: deserializeReplacePod, TaskTypeObserveImage: deserializeObserveImage, + TaskTypeObserveSidecarTLS: deserializeObserveSidecarTLS, TaskTypeValidateSigningKey: deserializeValidateSigningKey, TaskTypeValidateNodeKey: deserializeValidateNodeKey, TaskTypeValidateOperatorKeyring: deserializeValidateOperatorKeyring, diff --git a/internal/task/wait_for_sidecar_tls_secret.go b/internal/task/wait_for_sidecar_tls_secret.go new file mode 100644 index 00000000..65fae8bf --- /dev/null +++ b/internal/task/wait_for_sidecar_tls_secret.go @@ -0,0 +1,121 @@ +package task + +import ( + "context" + "encoding/json" + "fmt" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/noderesource" +) + +const TaskTypeWaitForSidecarTLSSecret = "wait-for-sidecar-tls-secret" + +// certManagerFailedReason is the terminal failure reason on a +// Certificate's Ready condition. Other False reasons (Issuing, etc.) +// are transient and the task continues to poll. +const certManagerFailedReason = "Failed" + +type WaitForSidecarTLSSecretParams struct { + NodeName string `json:"nodeName"` + Namespace string `json:"namespace"` +} + +type waitForSidecarTLSSecretExecution struct { + taskBase + params WaitForSidecarTLSSecretParams + cfg ExecutionConfig +} + +func deserializeWaitForSidecarTLSSecret(id string, params json.RawMessage, cfg ExecutionConfig) (TaskExecution, error) { + var p WaitForSidecarTLSSecretParams + if len(params) > 0 { + if err := json.Unmarshal(params, &p); err != nil { + return nil, fmt.Errorf("deserializing wait-for-sidecar-tls-secret params: %w", err) + } + } + return &waitForSidecarTLSSecretExecution{ + taskBase: taskBase{id: id, status: ExecutionRunning}, + params: p, + cfg: cfg, + }, nil +} + +// Execute polls the cert-manager-managed Secret for populated tls.crt +// AND tls.key. Returns nil while either is absent so the task stays +// Running and the executor re-invokes on the next reconcile. Bypasses +// the cache because the Secret was just requested by the preceding +// apply-sidecar-cert task. +// +// Operator-visible failure surface: if the underlying Certificate's +// Ready condition reports terminal failure (e.g., the referenced +// Issuer/ClusterIssuer doesn't exist), the task returns Terminal so +// the plan fails fast with UpdateFailed rather than polling forever. +func (e *waitForSidecarTLSSecretExecution) Execute(ctx context.Context) error { + node, err := ResourceAs[*seiv1alpha1.SeiNode](e.cfg) + if err != nil { + return Terminal(err) + } + + sec := &corev1.Secret{} + key := types.NamespacedName{Name: noderesource.SidecarTLSSecretName(node), Namespace: node.Namespace} + if err := e.cfg.APIReader.Get(ctx, key, sec); err != nil { + if !apierrors.IsNotFound(err) { + return fmt.Errorf("getting sidecar TLS Secret: %w", err) + } + // Secret absent — check Certificate for terminal failure. + return e.checkCertificateFailure(ctx, node) + } + if len(sec.Data[corev1.TLSCertKey]) == 0 || len(sec.Data[corev1.TLSPrivateKeyKey]) == 0 { + return e.checkCertificateFailure(ctx, node) + } + + e.complete() + return nil +} + +// checkCertificateFailure reads the cert-manager Certificate and +// returns Terminal if its Ready condition is False with Reason=Failed. +// All other states return nil — the task continues polling. +func (e *waitForSidecarTLSSecretExecution) checkCertificateFailure(ctx context.Context, node *seiv1alpha1.SeiNode) error { + cert := &unstructured.Unstructured{} + cert.SetAPIVersion("cert-manager.io/v1") + cert.SetKind("Certificate") + key := types.NamespacedName{Name: noderesource.SidecarTLSSecretName(node), Namespace: node.Namespace} + if err := e.cfg.APIReader.Get(ctx, key, cert); err != nil { + // Certificate missing (transient: not yet applied or being + // deleted) or any other read error: do not fail the plan. + return nil + } + conditions, found, err := unstructured.NestedSlice(cert.Object, "status", "conditions") + if err != nil || !found { + return nil + } + for _, raw := range conditions { + cond, ok := raw.(map[string]any) + if !ok { + continue + } + if t, _ := cond["type"].(string); t != "Ready" { + continue + } + status, _ := cond["status"].(string) + reason, _ := cond["reason"].(string) + message, _ := cond["message"].(string) + if status == "False" && reason == certManagerFailedReason { + return Terminal(fmt.Errorf("cert-manager Certificate %s/%s Failed: %s", + key.Namespace, key.Name, message)) + } + return nil + } + return nil +} + +func (e *waitForSidecarTLSSecretExecution) Status(_ context.Context) ExecutionStatus { + return e.DefaultStatus() +} diff --git a/internal/task/wait_for_sidecar_tls_secret_test.go b/internal/task/wait_for_sidecar_tls_secret_test.go new file mode 100644 index 00000000..feb5e68a --- /dev/null +++ b/internal/task/wait_for_sidecar_tls_secret_test.go @@ -0,0 +1,218 @@ +package task + +import ( + "context" + "encoding/json" + "errors" + "testing" + + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/noderesource" +) + +func waitTLSNode() *seiv1alpha1.SeiNode { + return &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testReplaceSTS, Namespace: testReplaceNs}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: opkChainID, + Image: opkImage, + FullNode: &seiv1alpha1.FullNodeSpec{}, + Sidecar: &seiv1alpha1.SidecarConfig{ + TLS: &seiv1alpha1.SidecarTLSSpec{IssuerName: "ca", IssuerKind: testClusterIssuerKind}, + }, + }, + } +} + +// waitTLSCfg builds an ExecutionConfig with an optional Secret and +// optional cert-manager Certificate (unstructured) seeded into the +// fake client. +func waitTLSCfg(t *testing.T, node *seiv1alpha1.SeiNode, sec *corev1.Secret, cert *unstructured.Unstructured) ExecutionConfig { + t.Helper() + s := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(s); err != nil { + t.Fatal(err) + } + if err := seiv1alpha1.AddToScheme(s); err != nil { + t.Fatal(err) + } + builder := fake.NewClientBuilder().WithScheme(s) + objs := []client.Object{} + if sec != nil { + objs = append(objs, sec) + } + if cert != nil { + objs = append(objs, cert) + } + if len(objs) > 0 { + builder = builder.WithObjects(objs...) + } + c := builder.Build() + return ExecutionConfig{ + KubeClient: c, + APIReader: c, + Scheme: s, + Resource: node, + } +} + +func newWaitTLSExec(t *testing.T, cfg ExecutionConfig) TaskExecution { + t.Helper() + params := WaitForSidecarTLSSecretParams{NodeName: testReplaceSTS, Namespace: testReplaceNs} + raw, _ := json.Marshal(params) + exec, err := deserializeWaitForSidecarTLSSecret("wait-tls-1", raw, cfg) + if err != nil { + t.Fatal(err) + } + return exec +} + +// certManagerCertificate constructs an unstructured cert-manager +// Certificate with the given Ready condition. +func certManagerCertificate(name, namespace, status, reason, message string) *unstructured.Unstructured { + cert := &unstructured.Unstructured{} + cert.SetAPIVersion("cert-manager.io/v1") + cert.SetKind("Certificate") + cert.SetName(name) + cert.SetNamespace(namespace) + _ = unstructured.SetNestedSlice(cert.Object, []any{ + map[string]any{ + "type": "Ready", + "status": status, + "reason": reason, + "message": message, + }, + }, "status", "conditions") + return cert +} + +func TestWaitForSidecarTLSSecret_SecretReady_Completes(t *testing.T) { + g := NewWithT(t) + node := waitTLSNode() + sec := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: noderesource.SidecarTLSSecretName(node), + Namespace: node.Namespace, + }, + Type: corev1.SecretTypeTLS, + Data: map[string][]byte{ + corev1.TLSCertKey: []byte("-----BEGIN CERTIFICATE-----\n..."), + corev1.TLSPrivateKeyKey: []byte("-----BEGIN PRIVATE KEY-----\n..."), + }, + } + + exec := newWaitTLSExec(t, waitTLSCfg(t, node, sec, nil)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionComplete)) +} + +func TestWaitForSidecarTLSSecret_SecretMissing_StaysRunning(t *testing.T) { + g := NewWithT(t) + node := waitTLSNode() + + exec := newWaitTLSExec(t, waitTLSCfg(t, node, nil, nil)) + + // cert-manager hasn't issued yet — Secret absent, Certificate absent. + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) +} + +func TestWaitForSidecarTLSSecret_TLSCertEmpty_StaysRunning(t *testing.T) { + g := NewWithT(t) + node := waitTLSNode() + sec := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: noderesource.SidecarTLSSecretName(node), + Namespace: node.Namespace, + }, + Type: corev1.SecretTypeTLS, + Data: map[string][]byte{ + corev1.TLSCertKey: []byte(""), + corev1.TLSPrivateKeyKey: []byte("k"), + }, + } + + exec := newWaitTLSExec(t, waitTLSCfg(t, node, sec, nil)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) +} + +func TestWaitForSidecarTLSSecret_TLSKeyEmpty_StaysRunning(t *testing.T) { + // Defensive: a Secret with tls.crt but no tls.key would crash-loop + // the proxy at boot. Both must be present. + g := NewWithT(t) + node := waitTLSNode() + sec := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: noderesource.SidecarTLSSecretName(node), + Namespace: node.Namespace, + }, + Type: corev1.SecretTypeTLS, + Data: map[string][]byte{ + corev1.TLSCertKey: []byte("-----BEGIN CERTIFICATE-----\n..."), + }, + } + + exec := newWaitTLSExec(t, waitTLSCfg(t, node, sec, nil)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) +} + +func TestWaitForSidecarTLSSecret_CertificateFailed_ReturnsTerminal(t *testing.T) { + g := NewWithT(t) + node := waitTLSNode() + cert := certManagerCertificate( + noderesource.SidecarTLSSecretName(node), + node.Namespace, + "False", "Failed", + "referenced ClusterIssuer not found", + ) + + exec := newWaitTLSExec(t, waitTLSCfg(t, node, nil, cert)) + + err := exec.Execute(context.Background()) + g.Expect(err).To(HaveOccurred()) + var te *TerminalError + g.Expect(errors.As(err, &te)).To(BeTrue(), "must be a TerminalError so the plan fails fast") + g.Expect(err.Error()).To(ContainSubstring("cert-manager Certificate")) + g.Expect(err.Error()).To(ContainSubstring("referenced ClusterIssuer not found")) +} + +func TestWaitForSidecarTLSSecret_CertificateIssuing_StaysRunning(t *testing.T) { + // Transient False reasons (Issuing, etc.) must NOT terminal the plan. + g := NewWithT(t) + node := waitTLSNode() + cert := certManagerCertificate( + noderesource.SidecarTLSSecretName(node), + node.Namespace, + "False", "Issuing", + "issuance in progress", + ) + + exec := newWaitTLSExec(t, waitTLSCfg(t, node, nil, cert)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) +} + +func TestWaitForSidecarTLSSecret_DeserializeEmptyParams(t *testing.T) { + g := NewWithT(t) + node := waitTLSNode() + cfg := waitTLSCfg(t, node, nil, nil) + + exec, err := deserializeWaitForSidecarTLSSecret("wait-empty", nil, cfg) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(exec).NotTo(BeNil()) +} diff --git a/manifests/sei.io_seinodedeployments.yaml b/manifests/sei.io_seinodedeployments.yaml index 329ad8db..e2f7be4d 100644 --- a/manifests/sei.io_seinodedeployments.yaml +++ b/manifests/sei.io_seinodedeployments.yaml @@ -612,8 +612,9 @@ spec: tls: description: |- TLS, if set, fronts the sidecar API with kube-rbac-proxy on - :8443 backed by a cert-manager-issued cert. Init-only: - toggling on a Running SeiNode requires recreation. See seictl#165. + :8443 backed by a cert-manager-issued cert. Toggling on a + Running SeiNode triggers a NodeUpdate plan that provisions + the Certificate + ConfigMap and cycles the pod. properties: issuerKind: default: ClusterIssuer diff --git a/manifests/sei.io_seinodes.yaml b/manifests/sei.io_seinodes.yaml index dad2a1cb..1ee879b7 100644 --- a/manifests/sei.io_seinodes.yaml +++ b/manifests/sei.io_seinodes.yaml @@ -467,8 +467,9 @@ spec: tls: description: |- TLS, if set, fronts the sidecar API with kube-rbac-proxy on - :8443 backed by a cert-manager-issued cert. Init-only: - toggling on a Running SeiNode requires recreation. See seictl#165. + :8443 backed by a cert-manager-issued cert. Toggling on a + Running SeiNode triggers a NodeUpdate plan that provisions + the Certificate + ConfigMap and cycles the pod. properties: issuerKind: default: ClusterIssuer @@ -839,6 +840,25 @@ spec: Parent controllers compare this against spec.image to determine whether a spec change has been fully actuated. type: string + currentSidecarTLS: + description: |- + CurrentSidecarTLS captures the sidecar TLS configuration observed + running on the owned StatefulSet. Stamped by the ObserveSidecarTLS + task when a TLS-toggle NodeUpdate plan completes. Compared against + spec.sidecar.tls to detect drift. Nil means TLS is not enabled on + the running pod. + properties: + issuerKind: + enum: + - Issuer + - ClusterIssuer + type: string + issuerName: + type: string + required: + - issuerKind + - issuerName + type: object externalAddress: description: |- ExternalAddress is the routable P2P address (host:port) for this node,