From 77a28c1a17e432c45f10ddf2d09ee9194e16a242 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Sun, 17 May 2026 18:14:08 -0700 Subject: [PATCH 1/3] feat(seinode): in-place TLS enable on Running SeiNodes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Builds on #258's externalized-cert foundation. Operators can now add spec.sidecar.tls.secretName to an existing Running SeiNode; the controller composes a NodeUpdate-style plan that cycles the pod with the proxy container mount. Disable and swap remain CEL-blocked. API: - Relax spec.sidecar.tls CEL: allow nil → set transition, still block set → nil and set → different. Set-once semantics; immutability is only loosened in the direction that has a coherent rollout path. - Add Status.CurrentSidecarTLSSecretName: live transport-mode source of truth, distinct from the platform-tooling contract published in Status.SidecarTLS. Transport-mode consistency: - SidecarURLForNode now reads from Status.CurrentSidecarTLSSecretName rather than Spec.Sidecar.TLS. Mid-rollout (spec set, pod not yet cycled), the controller's sidecar client keeps using HTTP until ObserveSidecarTLS stamps the status field — this is the windowing bug that bit PR #254. Test covers the mid-rollout state explicitly. Planner: - sidecarTLSEnableDrift: narrow detector (spec.tls set + status CurrentSidecarTLSSecretName empty + preflight Ready). - buildRunningPlan composes image-drift + TLS-enable drift into one pod cycle. - buildNodeUpdatePlan emits ApplyRBACProxyConfig pre-task + the ObserveSidecarTLS observer when TLS-enable drift fires; co-composes with ObserveImage when both drift. - Init plans (buildBasePlan + buildBootstrapPlan + buildGenesisPlan) inject ObserveSidecarTLS between ApplyStatefulSet/Service and the first sidecar HTTP task so the transport-mode flip happens before ConfigApply / GenerateIdentity / etc. fire. Task: - New ObserveSidecarTLS task. Polls the StatefulSet for rollout convergence (mirrors ObserveImage), then stamps Status.CurrentSidecarTLSSecretName from spec. - Registered in task.go deserializer + paramsForTaskType. classifyPlan emits new labels: tls-enable, node-update+tls-enable. Tests: - Drift detector matrix (fires / converged / not-ready / disabled). - Plan composition across (image-only, tls-only, both). - Init plan ObserveSidecarTLS placement before the first sidecar HTTP task (validates the transport-mode-flip ordering invariant). - SidecarURLForNode mid-rollout regression: spec-set + status-empty keeps HTTP. - Condition lifecycle on TLS-enable plans. Closes #262. Co-Authored-By: Claude Opus 4.7 (1M context) --- api/v1alpha1/seinode_types.go | 9 +- config/crd/sei.io_seinodedeployments.yaml | 9 +- config/crd/sei.io_seinodes.yaml | 17 +- internal/noderesource/noderesource.go | 12 +- internal/planner/bootstrap.go | 15 ++ internal/planner/node_update_test.go | 2 +- internal/planner/planner.go | 93 ++++++-- internal/planner/sidecar_tls_enable_test.go | 239 ++++++++++++++++++++ internal/planner/sidecar_tls_test.go | 20 ++ internal/task/observe_sidecar_tls.go | 81 +++++++ internal/task/observe_sidecar_tls_test.go | 120 ++++++++++ internal/task/task.go | 1 + manifests/sei.io_seinodedeployments.yaml | 9 +- manifests/sei.io_seinodes.yaml | 17 +- 14 files changed, 605 insertions(+), 39 deletions(-) create mode 100644 internal/planner/sidecar_tls_enable_test.go create mode 100644 internal/task/observe_sidecar_tls.go create mode 100644 internal/task/observe_sidecar_tls_test.go diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index e9444c7..8cc8c82 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -10,7 +10,7 @@ import ( // the populated field determines the node's operating mode. // +kubebuilder:validation:XValidation:rule="(has(self.fullNode) ? 1 : 0) + (has(self.archive) ? 1 : 0) + (has(self.replayer) ? 1 : 0) + (has(self.validator) ? 1 : 0) == 1",message="exactly one of fullNode, archive, replayer, or validator must be set" // +kubebuilder:validation:XValidation:rule="!has(self.replayer) || (has(self.peers) && size(self.peers) > 0)",message="peers is required when replayer mode is set" -// +kubebuilder:validation:XValidation:rule="(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? (!has(self.sidecar) || !has(self.sidecar.tls)) : (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls == oldSelf.sidecar.tls)",message="spec.sidecar.tls is immutable; delete + recreate the SeiNode to change TLS configuration" +// +kubebuilder:validation:XValidation:rule="(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? true : (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls == oldSelf.sidecar.tls)",message="spec.sidecar.tls is set-once: enabling on an existing SeiNode is allowed, but disabling or changing requires delete + recreate" type SeiNodeSpec struct { // ChainID of the chain this node belongs to. // +kubebuilder:validation:MinLength=1 @@ -362,6 +362,13 @@ type SeiNodeStatus struct { // SidecarTLS is set when spec.sidecar.tls is non-nil. // +optional SidecarTLS *SidecarTLSStatus `json:"sidecarTLS,omitempty"` + + // CurrentSidecarTLSSecretName is the spec.sidecar.tls.secretName + // observed on the live pod after the rollout converged. Drives the + // controller's sidecar client transport-mode selection (HTTP vs + // HTTPS). Empty when no TLS rollout has converged yet. + // +optional + CurrentSidecarTLSSecretName string `json:"currentSidecarTLSSecretName,omitempty"` } // SidecarTLSStatus is the controller's view of the referenced TLS Secret. diff --git a/config/crd/sei.io_seinodedeployments.yaml b/config/crd/sei.io_seinodedeployments.yaml index 3f564f2..7760100 100644 --- a/config/crd/sei.io_seinodedeployments.yaml +++ b/config/crd/sei.io_seinodedeployments.yaml @@ -895,11 +895,12 @@ spec: - message: peers is required when replayer mode is set rule: '!has(self.replayer) || (has(self.peers) && size(self.peers) > 0)' - - message: spec.sidecar.tls is immutable; delete + recreate the - SeiNode to change TLS configuration + - message: 'spec.sidecar.tls is set-once: enabling on an existing + SeiNode is allowed, but disabling or changing requires delete + + recreate' rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) - ? (!has(self.sidecar) || !has(self.sidecar.tls)) : (has(self.sidecar) - && has(self.sidecar.tls) && self.sidecar.tls == oldSelf.sidecar.tls)' + ? true : (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls + == oldSelf.sidecar.tls)' required: - spec type: object diff --git a/config/crd/sei.io_seinodes.yaml b/config/crd/sei.io_seinodes.yaml index 421e1c4..dd848aa 100644 --- a/config/crd/sei.io_seinodes.yaml +++ b/config/crd/sei.io_seinodes.yaml @@ -766,11 +766,11 @@ spec: - message: peers is required when replayer mode is set rule: '!has(self.replayer) || (has(self.peers) && size(self.peers) > 0)' - - message: spec.sidecar.tls is immutable; delete + recreate the SeiNode - to change TLS configuration - rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? (!has(self.sidecar) - || !has(self.sidecar.tls)) : (has(self.sidecar) && has(self.sidecar.tls) - && self.sidecar.tls == oldSelf.sidecar.tls)' + - message: 'spec.sidecar.tls is set-once: enabling on an existing SeiNode + is allowed, but disabling or changing requires delete + recreate' + rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? true : + (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls == + oldSelf.sidecar.tls)' status: description: SeiNodeStatus defines the observed state of a SeiNode. properties: @@ -841,6 +841,13 @@ spec: Parent controllers compare this against spec.image to determine whether a spec change has been fully actuated. type: string + currentSidecarTLSSecretName: + description: |- + CurrentSidecarTLSSecretName is the spec.sidecar.tls.secretName + observed on the live pod after the rollout converged. Drives the + controller's sidecar client transport-mode selection (HTTP vs + HTTPS). Empty when no TLS rollout has converged yet. + type: string externalAddress: description: |- ExternalAddress is the routable P2P address (host:port) for this node, diff --git a/internal/noderesource/noderesource.go b/internal/noderesource/noderesource.go index 17f502a..30913c4 100644 --- a/internal/noderesource/noderesource.go +++ b/internal/noderesource/noderesource.go @@ -1156,14 +1156,14 @@ func GenerateRBACProxyConfigMap(node *seiv1alpha1.SeiNode) *corev1.ConfigMap { } // SidecarURLForNode builds the in-cluster URL the controller uses to -// reach a node's sidecar. TLS-enabled nodes route through the -// kube-rbac-proxy on HTTPS :8443; otherwise plain HTTP on the -// sidecar's own port. Single source of truth — both the planner -// (for plan-execution clients) and deployment-await tasks call -// through here. +// reach a node's sidecar. Routes through kube-rbac-proxy on HTTPS :8443 +// when the live pod is serving TLS; otherwise plain HTTP on the sidecar +// port. Reads the OBSERVED state (Status.CurrentSidecarTLSSecretName), +// not spec, so a mid-rollout SeiNode (spec.tls just set, pod not yet +// cycled) keeps using HTTP until the new pod is up. func SidecarURLForNode(node *seiv1alpha1.SeiNode) string { scheme, port := "http", SidecarPort(node) - if SidecarTLSEnabled(node) { + if node.Status.CurrentSidecarTLSSecretName != "" { scheme, port = "https", RBACProxyPort } return fmt.Sprintf("%s://%s-0.%s.%s.svc.cluster.local:%d", diff --git a/internal/planner/bootstrap.go b/internal/planner/bootstrap.go index be1853b..4da0122 100644 --- a/internal/planner/bootstrap.go +++ b/internal/planner/bootstrap.go @@ -112,6 +112,14 @@ func buildBootstrapPlan( &task.ApplyServiceParams{NodeName: node.Name, Namespace: node.Namespace}); err != nil { return nil, err } + if noderesource.SidecarTLSEnabled(node) { + // Inject before sidecar HTTP tasks so SidecarURLForNode picks up + // TLS transport (via status.currentSidecarTLSSecretName) first. + if err := appendTask(task.TaskTypeObserveSidecarTLS, + &task.ObserveSidecarTLSParams{NodeName: node.Name, Namespace: node.Namespace}); err != nil { + return nil, err + } + } // Phase 5: Post-bootstrap config on StatefulSet pod for _, taskType := range postProg { @@ -188,6 +196,13 @@ func buildGenesisPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, + ) + if noderesource.SidecarTLSEnabled(node) { + // Inject before sidecar HTTP tasks so SidecarURLForNode picks up + // TLS transport before the first sidecar call. + prog = append(prog, task.TaskTypeObserveSidecarTLS) + } + prog = append(prog, TaskGenerateIdentity, TaskGenerateGentx, TaskUploadGenesisArtifacts, diff --git a/internal/planner/node_update_test.go b/internal/planner/node_update_test.go index 333bc66..fa04268 100644 --- a/internal/planner/node_update_test.go +++ b/internal/planner/node_update_test.go @@ -97,7 +97,7 @@ func TestResolvePlan_NodeUpdate_SetsCondition(t *testing.T) { g.Expect(cond).NotTo(BeNil(), "NodeUpdateInProgress condition should be set") g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) g.Expect(cond.Reason).To(Equal("UpdateStarted")) - g.Expect(cond.Message).To(ContainSubstring("image drift detected")) + g.Expect(cond.Message).To(ContainSubstring("image drift")) } func TestResolvePlan_CompletedPlan_ClearsCondition(t *testing.T) { diff --git a/internal/planner/planner.go b/internal/planner/planner.go index 08a7b25..ccbc120 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -220,14 +220,28 @@ func setNodeUpdateCondition(node *seiv1alpha1.SeiNode, status metav1.ConditionSt // classifyPlan returns the plan type for metrics. func classifyPlan(plan *seiv1alpha1.TaskPlan) string { + var hasImage, hasTLS, hasInit bool for _, t := range plan.Tasks { switch t.Type { case task.TaskTypeObserveImage: - return "node-update" + hasImage = true + case task.TaskTypeObserveSidecarTLS: + hasTLS = true case task.TaskTypeEnsureDataPVC: - return "init" + hasInit = true } } + if hasInit { + return "init" + } + switch { + case hasImage && hasTLS: + return "node-update+tls-enable" + case hasImage: + return "node-update" + case hasTLS: + return "tls-enable" + } if len(plan.Tasks) == 1 && plan.Tasks[0].Type == sidecar.TaskTypeMarkReady { return "mark-ready-reapply" } @@ -544,6 +558,13 @@ func buildBasePlan( prog = append(prog, task.TaskTypeApplyRBACProxyConfig) } prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService) + if noderesource.SidecarTLSEnabled(node) { + // Insert before the sidecar HTTP progression so SidecarURLForNode + // picks up the TLS transport mode (via the in-memory mutation of + // status.currentSidecarTLSSecretName) before the first sidecar + // HTTP call (SnapshotRestore / ConfigApply / MarkReady) fires. + prog = append(prog, task.TaskTypeObserveSidecarTLS) + } prog = append(prog, sidecarProg...) planID := uuid.New().String() @@ -588,6 +609,8 @@ func paramsForTaskType( return &task.ReplacePodParams{NodeName: node.Name, Namespace: node.Namespace} case task.TaskTypeObserveImage: return &task.ObserveImageParams{NodeName: node.Name, Namespace: node.Namespace} + case task.TaskTypeObserveSidecarTLS: + return &task.ObserveSidecarTLSParams{NodeName: node.Name, Namespace: node.Namespace} case task.TaskTypeValidateSigningKey: return validateSigningKeyParams(node) case task.TaskTypeValidateNodeKey: @@ -711,10 +734,10 @@ func commonOverrides(node *seiv1alpha1.SeiNode) map[string]string { } // buildRunningPlan returns a steady-state drift plan, or nil if no drift. -// Image drift is checked first — its plan ends with MarkReady, so it also -// resolves any stale sidecar. +// Image drift and TLS-enable drift co-compose into one NodeUpdate plan so +// a single pod cycle covers both. func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { - if node.Spec.Image != node.Status.CurrentImage { + if node.Spec.Image != node.Status.CurrentImage || sidecarTLSEnableDrift(node) { return buildNodeUpdatePlan(node) } if sidecarNeedsReapproval(node) { @@ -723,6 +746,21 @@ func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) return nil, nil } +// sidecarTLSEnableDrift reports whether the operator added spec.sidecar.tls +// post-creation and the live pod hasn't yet been rolled with TLS. +// Returns false unless the preflight has validated the Secret — the +// ResolvePlan gate also blocks plan creation when the condition is False, +// so this is belt-and-suspenders. +func sidecarTLSEnableDrift(node *seiv1alpha1.SeiNode) bool { + if !noderesource.SidecarTLSEnabled(node) { + return false + } + if node.Status.CurrentSidecarTLSSecretName == node.Spec.Sidecar.TLS.SecretName { + return false + } + return sidecarTLSSecretReady(node) +} + func sidecarNeedsReapproval(node *seiv1alpha1.SeiNode) bool { cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionSidecarReady) return cond != nil && cond.Status == metav1.ConditionFalse && cond.Reason == "NotReady" @@ -742,23 +780,39 @@ func buildMarkReadyPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error }, nil } -// buildNodeUpdatePlan constructs a plan to roll out an image update on a -// Running node. The plan applies the new StatefulSet spec, waits for the -// rollout to complete, then re-initializes the sidecar. +// buildNodeUpdatePlan constructs a plan to roll out an image update, +// a TLS-enable transition, or both on a Running node. The plan applies +// the regenerated StatefulSet spec, cycles the pod, observes whichever +// fields drifted, then re-initializes the sidecar. // // FailedPhase is deliberately empty: a failure retries on the next reconcile // rather than transitioning the node out of Running. func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { + imageDrift := node.Spec.Image != node.Status.CurrentImage + tlsEnable := sidecarTLSEnableDrift(node) setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", - fmt.Sprintf("image drift detected: spec=%s current=%s", node.Spec.Image, node.Status.CurrentImage)) - - prog := []string{ + nodeUpdateMessage(node, imageDrift, tlsEnable)) + + var prog []string + if tlsEnable { + // The kube-rbac-proxy authz ConfigMap is controller-owned and + // must exist before ApplyStatefulSet schedules the pod with the + // proxy container mount. The TLS Secret itself is operator- + // provisioned and already validated by preflight. + prog = append(prog, task.TaskTypeApplyRBACProxyConfig) + } + prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, task.TaskTypeReplacePod, - task.TaskTypeObserveImage, - sidecar.TaskTypeMarkReady, + ) + if imageDrift { + prog = append(prog, task.TaskTypeObserveImage) + } + if tlsEnable { + prog = append(prog, task.TaskTypeObserveSidecarTLS) } + prog = append(prog, sidecar.TaskTypeMarkReady) planID := uuid.New().String() tasks := make([]seiv1alpha1.PlannedTask, len(prog)) @@ -777,6 +831,19 @@ func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, erro }, nil } +func nodeUpdateMessage(node *seiv1alpha1.SeiNode, imageDrift, tlsEnable bool) string { + switch { + case imageDrift && tlsEnable: + return fmt.Sprintf("image drift: spec=%s current=%s; tls enable: secretName=%s", + node.Spec.Image, node.Status.CurrentImage, node.Spec.Sidecar.TLS.SecretName) + case tlsEnable: + return fmt.Sprintf("tls enable: secretName=%s", node.Spec.Sidecar.TLS.SecretName) + default: + return fmt.Sprintf("image drift: spec=%s current=%s", + node.Spec.Image, node.Status.CurrentImage) + } +} + // mergeOverrides combines controller-generated overrides with user-specified // overrides. User overrides take precedence. func mergeOverrides(controllerOverrides, userOverrides map[string]string) map[string]string { diff --git a/internal/planner/sidecar_tls_enable_test.go b/internal/planner/sidecar_tls_enable_test.go new file mode 100644 index 0000000..cbbb826 --- /dev/null +++ b/internal/planner/sidecar_tls_enable_test.go @@ -0,0 +1,239 @@ +package planner + +import ( + "slices" + "testing" + + . "github.com/onsi/gomega" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/task" +) + +const ( + tlsEnableSecretName = "validator-0-tls" + tlsImageV1 = "sei:v1.0.0" + tlsImageV2 = "sei:v2.0.0" +) + +func tlsEnableRunningNode() *seiv1alpha1.SeiNode { + return &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: sourceChainID, Generation: 1}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: sourceChainID, + Image: tlsImageV1, + FullNode: &seiv1alpha1.FullNodeSpec{}, + Sidecar: &seiv1alpha1.SidecarConfig{TLS: &seiv1alpha1.SidecarTLSSpec{SecretName: tlsEnableSecretName}}, + }, + Status: seiv1alpha1.SeiNodeStatus{ + Phase: seiv1alpha1.PhaseRunning, + CurrentImage: tlsImageV1, + // CurrentSidecarTLSSecretName intentionally empty — TLS just enabled. + }, + } +} + +func setTLSSecretReady2(node *seiv1alpha1.SeiNode, status metav1.ConditionStatus) { + apimeta.SetStatusCondition(&node.Status.Conditions, metav1.Condition{ + Type: seiv1alpha1.ConditionSidecarTLSSecretReady, Status: status, Reason: seiv1alpha1.ReasonTLSSecretReady, + }) +} + +// --- sidecarTLSEnableDrift detector --- + +func TestSidecarTLSEnableDrift_FiresWhenSpecSetAndStatusEmptyAndReady(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + setTLSSecretReady2(node, metav1.ConditionTrue) + g.Expect(sidecarTLSEnableDrift(node)).To(BeTrue()) +} + +func TestSidecarTLSEnableDrift_FalseWhenAlreadyConverged(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + node.Status.CurrentSidecarTLSSecretName = tlsEnableSecretName + setTLSSecretReady2(node, metav1.ConditionTrue) + g.Expect(sidecarTLSEnableDrift(node)).To(BeFalse()) +} + +func TestSidecarTLSEnableDrift_FalseWhenPreflightNotReady(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + setTLSSecretReady2(node, metav1.ConditionFalse) + g.Expect(sidecarTLSEnableDrift(node)).To(BeFalse(), + "detector must wait for preflight to validate the Secret before triggering a plan") +} + +func TestSidecarTLSEnableDrift_FalseWhenTLSDisabled(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + node.Spec.Sidecar = nil + g.Expect(sidecarTLSEnableDrift(node)).To(BeFalse()) +} + +// --- buildRunningPlan composition --- + +func TestBuildRunningPlan_TLSEnableOnly(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + setTLSSecretReady2(node, metav1.ConditionTrue) + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + types := taskTypes(plan) + g.Expect(types).To(Equal([]string{ + task.TaskTypeApplyRBACProxyConfig, + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + task.TaskTypeReplacePod, + task.TaskTypeObserveSidecarTLS, + TaskMarkReady, + })) +} + +func TestBuildRunningPlan_ImageAndTLSEnable_CoCompose(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + node.Spec.Image = tlsImageV2 // image drift on top of TLS enable + setTLSSecretReady2(node, metav1.ConditionTrue) + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + types := taskTypes(plan) + g.Expect(types).To(Equal([]string{ + task.TaskTypeApplyRBACProxyConfig, + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + task.TaskTypeObserveSidecarTLS, + TaskMarkReady, + }), "both observers run in one plan; single pod cycle covers both drifts") +} + +func TestBuildRunningPlan_ImageOnlyDrift_TLSAlreadyConverged(t *testing.T) { + // TLS is currently live; image drift fires. ObserveSidecarTLS is + // still injected (idempotent) so SidecarURLForNode stays correct + // after the pod cycle re-mounts the Secret on the new pod. + g := NewWithT(t) + node := tlsEnableRunningNode() + node.Status.CurrentSidecarTLSSecretName = tlsEnableSecretName // already converged + node.Spec.Image = tlsImageV2 // image drift only + setTLSSecretReady2(node, metav1.ConditionTrue) + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + types := taskTypes(plan) + // No TLS-enable drift here — ObserveSidecarTLS not added because + // drift detector returned false (status already mirrors spec). + g.Expect(types).To(Equal([]string{ + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + })) +} + +// --- ResolvePlan condition lifecycle on TLS enable --- + +func TestResolvePlan_TLSEnable_SetsAndClearsCondition(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + setTLSSecretReady2(node, metav1.ConditionTrue) + + err := (&NodeResolver{}).ResolvePlan(t.Context(), node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(node.Status.Plan).NotTo(BeNil()) + + cond := apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) + g.Expect(cond.Message).To(ContainSubstring("tls enable")) + + // Simulate plan completion + observer stamping the status mirror. + node.Status.Plan.Phase = seiv1alpha1.TaskPlanComplete + node.Status.CurrentSidecarTLSSecretName = tlsEnableSecretName + + err = (&NodeResolver{}).ResolvePlan(t.Context(), node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(node.Status.Plan).To(BeNil()) + + cond = apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal("UpdateComplete")) +} + +// --- classifyPlan labels for the new variants --- + +func TestClassifyPlan_TLSEnableOnly(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + setTLSSecretReady2(node, metav1.ConditionTrue) + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(classifyPlan(plan)).To(Equal("tls-enable")) +} + +func TestClassifyPlan_ImageAndTLSEnable(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + node.Spec.Image = tlsImageV2 + setTLSSecretReady2(node, metav1.ConditionTrue) + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(classifyPlan(plan)).To(Equal("node-update+tls-enable")) +} + +// --- init-plan ObserveSidecarTLS insertion --- + +func TestBuildBasePlan_InsertsObserveSidecarTLSBeforeSidecarProgressionWhenTLSEnabled(t *testing.T) { + g := NewWithT(t) + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: sourceChainID, Generation: 1}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: sourceChainID, + Image: tlsImageV1, + FullNode: &seiv1alpha1.FullNodeSpec{}, + Sidecar: &seiv1alpha1.SidecarConfig{TLS: &seiv1alpha1.SidecarTLSSpec{SecretName: tlsEnableSecretName}}, + }, + } + plan, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + types := taskTypes(plan) + obsIdx := slices.Index(types, task.TaskTypeObserveSidecarTLS) + stsIdx := slices.Index(types, task.TaskTypeApplyStatefulSet) + configIdx := slices.Index(types, TaskConfigApply) + + g.Expect(obsIdx).To(BeNumerically(">=", 0), "observe-sidecar-tls must appear") + g.Expect(obsIdx).To(BeNumerically(">", stsIdx), "observe-sidecar-tls must follow ApplyStatefulSet") + g.Expect(obsIdx).To(BeNumerically("<", configIdx), "observe-sidecar-tls must precede the first sidecar HTTP task (config-apply)") +} + +func TestBuildBasePlan_NoObserveSidecarTLSWhenTLSDisabled(t *testing.T) { + g := NewWithT(t) + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: sourceChainID, Generation: 1}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: sourceChainID, + Image: tlsImageV1, + FullNode: &seiv1alpha1.FullNodeSpec{}, + }, + } + plan, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + + g.Expect(slices.Index(taskTypes(plan), task.TaskTypeObserveSidecarTLS)).To(Equal(-1), + "observe-sidecar-tls must NOT appear when TLS is disabled") +} diff --git a/internal/planner/sidecar_tls_test.go b/internal/planner/sidecar_tls_test.go index 954b744..852b37c 100644 --- a/internal/planner/sidecar_tls_test.go +++ b/internal/planner/sidecar_tls_test.go @@ -29,6 +29,7 @@ func TestSidecarURLForNode_TLSRoutesThroughProxyHTTPS(t *testing.T) { Spec: seiv1alpha1.SeiNodeSpec{Sidecar: &seiv1alpha1.SidecarConfig{ TLS: &seiv1alpha1.SidecarTLSSpec{SecretName: testValidatorName + "-tls"}, }}, + Status: seiv1alpha1.SeiNodeStatus{CurrentSidecarTLSSecretName: testValidatorName + "-tls"}, } got := SidecarURLForNode(node) want := fmt.Sprintf("https://%s-0.%s.sei.svc.cluster.local:8443", testValidatorName, testValidatorName) @@ -37,6 +38,25 @@ func TestSidecarURLForNode_TLSRoutesThroughProxyHTTPS(t *testing.T) { } } +// TestSidecarURLForNode_TLSSpecButNotYetObserved exercises the mid-rollout +// transport state: spec.sidecar.tls is set but the pod hasn't been cycled +// (status.currentSidecarTLSSecretName empty). Controller MUST use plain +// HTTP against the still-HTTP pod until the rollout converges. +func TestSidecarURLForNode_TLSSpecButNotYetObserved(t *testing.T) { + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: "sei"}, + Spec: seiv1alpha1.SeiNodeSpec{Sidecar: &seiv1alpha1.SidecarConfig{ + TLS: &seiv1alpha1.SidecarTLSSpec{SecretName: testValidatorName + "-tls"}, + }}, + // Status.CurrentSidecarTLSSecretName intentionally empty. + } + got := SidecarURLForNode(node) + want := fmt.Sprintf("http://%s-0.%s.sei.svc.cluster.local:7777", testValidatorName, testValidatorName) + if got != want { + t.Errorf("mid-rollout URL = %q, want %q (must stay HTTP until ObserveSidecarTLS stamps status)", got, want) + } +} + const ( testValidatorName = "validator-0" testSeidImage = "seid:v6.4.1" diff --git a/internal/task/observe_sidecar_tls.go b/internal/task/observe_sidecar_tls.go new file mode 100644 index 0000000..2aed704 --- /dev/null +++ b/internal/task/observe_sidecar_tls.go @@ -0,0 +1,81 @@ +package task + +import ( + "context" + "encoding/json" + "fmt" + + appsv1 "k8s.io/api/apps/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/noderesource" +) + +const TaskTypeObserveSidecarTLS = "observe-sidecar-tls" + +// ObserveSidecarTLSParams identifies the node whose StatefulSet rollout +// to observe. +type ObserveSidecarTLSParams struct { + NodeName string `json:"nodeName"` + Namespace string `json:"namespace"` +} + +type observeSidecarTLSExecution struct { + taskBase + params ObserveSidecarTLSParams + cfg ExecutionConfig +} + +func deserializeObserveSidecarTLS(id string, params json.RawMessage, cfg ExecutionConfig) (TaskExecution, error) { + var p ObserveSidecarTLSParams + if len(params) > 0 { + if err := json.Unmarshal(params, &p); err != nil { + return nil, fmt.Errorf("deserializing observe-sidecar-tls params: %w", err) + } + } + return &observeSidecarTLSExecution{ + taskBase: taskBase{id: id, status: ExecutionRunning}, + params: p, + cfg: cfg, + }, nil +} + +// Execute polls the StatefulSet rollout. Once converged, stamps +// status.currentSidecarTLSSecretName from spec so the controller's +// sidecar client transport-mode selection (SidecarURLForNode) picks +// up the new live state before the plan's MarkReady call fires. +func (e *observeSidecarTLSExecution) Execute(ctx context.Context) error { + node, err := ResourceAs[*seiv1alpha1.SeiNode](e.cfg) + if err != nil { + return Terminal(err) + } + if !noderesource.SidecarTLSEnabled(node) { + return Terminal(fmt.Errorf("observe-sidecar-tls scheduled but spec.sidecar.tls is nil")) + } + + sts := &appsv1.StatefulSet{} + key := types.NamespacedName{Name: node.Name, Namespace: node.Namespace} + if err := e.cfg.APIReader.Get(ctx, key, sts); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("getting statefulset: %w", err) + } + + if sts.Status.ObservedGeneration < sts.Generation { + return nil + } + if sts.Spec.Replicas == nil || sts.Status.UpdatedReplicas < *sts.Spec.Replicas { + return nil + } + + node.Status.CurrentSidecarTLSSecretName = node.Spec.Sidecar.TLS.SecretName + e.complete() + return nil +} + +func (e *observeSidecarTLSExecution) Status(_ context.Context) ExecutionStatus { + return e.DefaultStatus() +} diff --git a/internal/task/observe_sidecar_tls_test.go b/internal/task/observe_sidecar_tls_test.go new file mode 100644 index 0000000..6c998ff --- /dev/null +++ b/internal/task/observe_sidecar_tls_test.go @@ -0,0 +1,120 @@ +package task + +import ( + "context" + "encoding/json" + "errors" + "testing" + + . "github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" +) + +func observeTLSNode(secretName string) *seiv1alpha1.SeiNode { + n := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testReplaceSTS, Namespace: testReplaceNs}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: opkChainID, + Image: opkImage, + FullNode: &seiv1alpha1.FullNodeSpec{}, + }, + } + if secretName != "" { + n.Spec.Sidecar = &seiv1alpha1.SidecarConfig{TLS: &seiv1alpha1.SidecarTLSSpec{SecretName: secretName}} + } + return n +} + +func observeTLSCfg(t *testing.T, node *seiv1alpha1.SeiNode, sts *appsv1.StatefulSet) ExecutionConfig { + t.Helper() + s := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(s); err != nil { + t.Fatal(err) + } + if err := seiv1alpha1.AddToScheme(s); err != nil { + t.Fatal(err) + } + builder := fake.NewClientBuilder().WithScheme(s) + if sts != nil { + builder = builder.WithObjects(sts) + } + c := builder.Build() + return ExecutionConfig{ + KubeClient: c, + APIReader: c, + Scheme: s, + Resource: node, + } +} + +func newObserveTLSExec(t *testing.T, cfg ExecutionConfig) TaskExecution { + t.Helper() + params := ObserveSidecarTLSParams{NodeName: testReplaceSTS, Namespace: testReplaceNs} + raw, _ := json.Marshal(params) + exec, err := deserializeObserveSidecarTLS("obs-tls-1", raw, cfg) + if err != nil { + t.Fatal(err) + } + return exec +} + +func TestObserveSidecarTLS_RolloutComplete_StampsCurrent(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode("my-tls-cert") + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ObservedGeneration: 2, UpdatedReplicas: 1, Replicas: 1}, + } + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, sts)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionComplete)) + g.Expect(node.Status.CurrentSidecarTLSSecretName).To(Equal("my-tls-cert")) +} + +func TestObserveSidecarTLS_RolloutInProgress_StaysRunning(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode("my-tls-cert") + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ObservedGeneration: 2, UpdatedReplicas: 0, Replicas: 1}, + } + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, sts)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) + g.Expect(node.Status.CurrentSidecarTLSSecretName).To(BeEmpty()) +} + +func TestObserveSidecarTLS_StatefulSetNotFound_StaysRunning(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode("my-tls-cert") + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, nil)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) +} + +func TestObserveSidecarTLS_TLSDisabled_TerminalError(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode("") // TLS not enabled + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, nil)) + + err := exec.Execute(context.Background()) + g.Expect(err).To(HaveOccurred()) + var te *TerminalError + g.Expect(errors.As(err, &te)).To(BeTrue(), + "scheduling observe-sidecar-tls without spec.sidecar.tls is a planner bug; surface as Terminal") +} diff --git a/internal/task/task.go b/internal/task/task.go index ff3b7ef..bebd72f 100644 --- a/internal/task/task.go +++ b/internal/task/task.go @@ -221,6 +221,7 @@ var registry = map[string]taskDeserializer{ TaskTypeApplyRBACProxyConfig: deserializeApplyRBACProxyConfig, TaskTypeReplacePod: deserializeReplacePod, TaskTypeObserveImage: deserializeObserveImage, + TaskTypeObserveSidecarTLS: deserializeObserveSidecarTLS, TaskTypeValidateSigningKey: deserializeValidateSigningKey, TaskTypeValidateNodeKey: deserializeValidateNodeKey, TaskTypeValidateOperatorKeyring: deserializeValidateOperatorKeyring, diff --git a/manifests/sei.io_seinodedeployments.yaml b/manifests/sei.io_seinodedeployments.yaml index 3f564f2..7760100 100644 --- a/manifests/sei.io_seinodedeployments.yaml +++ b/manifests/sei.io_seinodedeployments.yaml @@ -895,11 +895,12 @@ spec: - message: peers is required when replayer mode is set rule: '!has(self.replayer) || (has(self.peers) && size(self.peers) > 0)' - - message: spec.sidecar.tls is immutable; delete + recreate the - SeiNode to change TLS configuration + - message: 'spec.sidecar.tls is set-once: enabling on an existing + SeiNode is allowed, but disabling or changing requires delete + + recreate' rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) - ? (!has(self.sidecar) || !has(self.sidecar.tls)) : (has(self.sidecar) - && has(self.sidecar.tls) && self.sidecar.tls == oldSelf.sidecar.tls)' + ? true : (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls + == oldSelf.sidecar.tls)' required: - spec type: object diff --git a/manifests/sei.io_seinodes.yaml b/manifests/sei.io_seinodes.yaml index 421e1c4..dd848aa 100644 --- a/manifests/sei.io_seinodes.yaml +++ b/manifests/sei.io_seinodes.yaml @@ -766,11 +766,11 @@ spec: - message: peers is required when replayer mode is set rule: '!has(self.replayer) || (has(self.peers) && size(self.peers) > 0)' - - message: spec.sidecar.tls is immutable; delete + recreate the SeiNode - to change TLS configuration - rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? (!has(self.sidecar) - || !has(self.sidecar.tls)) : (has(self.sidecar) && has(self.sidecar.tls) - && self.sidecar.tls == oldSelf.sidecar.tls)' + - message: 'spec.sidecar.tls is set-once: enabling on an existing SeiNode + is allowed, but disabling or changing requires delete + recreate' + rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? true : + (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls == + oldSelf.sidecar.tls)' status: description: SeiNodeStatus defines the observed state of a SeiNode. properties: @@ -841,6 +841,13 @@ spec: Parent controllers compare this against spec.image to determine whether a spec change has been fully actuated. type: string + currentSidecarTLSSecretName: + description: |- + CurrentSidecarTLSSecretName is the spec.sidecar.tls.secretName + observed on the live pod after the rollout converged. Drives the + controller's sidecar client transport-mode selection (HTTP vs + HTTPS). Empty when no TLS rollout has converged yet. + type: string externalAddress: description: |- ExternalAddress is the routable P2P address (host:port) for this node, From e28c92b47de31ac7a4372f167c7754c1f0eb2723 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Mon, 18 May 2026 08:17:21 -0700 Subject: [PATCH 2/3] fixup(seinode): round-1 cross-review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Blockers: - cmd/main.go: newSidecarClient now keys the TLS Doer on Status.CurrentSidecarTLSSecretName (not spec). Symmetric source of truth with SidecarURLForNode. Closes the mid-rollout window where mTLS-attempting transport would hit plain HTTP and the bootstrap-pod Phase-2 case where the bootstrap pod has no proxy. k8s-flagged blocker — corrected platform's "non-blocker" framing. - LLD §0.1(b), §4, §5, §6 rewritten to reflect set-once semantics (enable-on-existing allowed; disable + swap CEL-rejected). Doc contradiction with the implementation was the platform + security blocker; design history captured so future readers see the spec is set-once-not-immutable and why. - Document load-bearing trust assumption: Status.CurrentSidecarTLS- SecretName drives controller transport; manifests/role.yaml scopes seinodes/status patch to the controller SA only. Followup tracked. Non-blockers / nits: - ObserveSidecarTLS now requires ReadyReplicas >= *Replicas (k8s). Stricter than ObserveImage: stamping status flips SidecarURLForNode to HTTPS, so a crashlooping proxy must not flip transport prematurely. New test covers the crashloop scenario. - classifyPlan label node-update+tls-enable → node-update-tls-enable (security). Hyphen avoids PromQL/dashboard regex-escape footguns. - sidecarTLSEnableDrift now has an inline nil-guard (k8s). Removes implicit dependency on SidecarTLSEnabled invariants. - buildBasePlan: consolidate two adjacent SidecarTLSEnabled blocks with ordering comment (platform). - buildBootstrapPlan: clarify ObserveSidecarTLS comment now that the Phase-2 (bootstrap-pod, HTTP) vs Phase-5 (production-pod, HTTPS) split is explicit (k8s). - Test comment contradiction fixed (k8s). - New SidecarURLForNode test: status-set-without-spec quadrant. Deferred (tracked): - ValidatingAdmissionPolicy on seinodes/status writes (security #263). - envtest harness for CEL transitions (k8s #264). - Shared rollout-readiness primitive ObserveImage/ObserveSidecarTLS (platform — defer to when a third observer lands). Co-Authored-By: Claude Opus 4.7 (1M context) --- cmd/main.go | 7 +- docs/design-seinode-sidecar-tls-toggle-lld.md | 69 ++++++++++--------- internal/planner/bootstrap.go | 10 ++- internal/planner/planner.go | 18 ++--- internal/planner/sidecar_tls_enable_test.go | 14 ++-- internal/planner/sidecar_tls_test.go | 17 +++++ internal/task/observe_sidecar_tls.go | 10 ++- internal/task/observe_sidecar_tls_test.go | 28 +++++++- 8 files changed, 117 insertions(+), 56 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index ed67e56..5cf132c 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -176,7 +176,12 @@ func main() { newSidecarClient := func(node *seiv1alpha1.SeiNode) (*sidecar.SidecarClient, error) { url := noderesource.SidecarURLForNode(node) - if noderesource.SidecarTLSEnabled(node) { + // Key the transport on the same observed-state field that + // SidecarURLForNode uses, not on spec. Mid-rollout (spec set, + // pod not yet cycled) the URL is still HTTP and the transport + // must match — otherwise mTLS-attempting doer hits an HTTP + // endpoint and the handshake fails. + if node.Status.CurrentSidecarTLSSecretName != "" { return sidecar.NewSidecarClient(url, sidecar.WithHTTPDoer(sharedTLSDoer)) } return sidecar.NewSidecarClient(url) diff --git a/docs/design-seinode-sidecar-tls-toggle-lld.md b/docs/design-seinode-sidecar-tls-toggle-lld.md index 6bc5816..89074c3 100644 --- a/docs/design-seinode-sidecar-tls-toggle-lld.md +++ b/docs/design-seinode-sidecar-tls-toggle-lld.md @@ -15,28 +15,29 @@ The first attempt at supporting TLS toggle on Running nodes (PR #254) added a dr This revision drops that approach entirely. The cert is no longer the controller's concern; it becomes platform-provisioned, mirroring how `spec.validator.signingKey`, `spec.validator.nodeKey`, `spec.validator.operatorKeyring`, and `spec.dataVolume.import.pvcName` already work. -## 0.1 Design choice: externalize cert ownership; spec is immutable +## 0.1 Design choice: externalize cert ownership; spec is set-once -Three coupled decisions, each enabling the next: +Three coupled decisions: **a) The controller does not provision the cert.** Operator/platform tooling (cert-manager `Certificate` in GitOps, or any other source) creates the `kubernetes.io/tls` Secret. The controller references it by name and gates progression on presence + cert validity. This matches every other security-sensitive resource in the SeiNode spec (`signingKey`, `nodeKey`, `operatorKeyring`, imported `PVC`). -**b) `spec.sidecar.tls` is immutable post-creation.** Toggling TLS on an existing SeiNode is *not* a controller-orchestrated transition — it's a delete + recreate. The data PVC is retained externally (today via `spec.dataVolume.import.pvcName` pre-detach; future-cleaner via `spec.dataVolume.retainOnDelete`). At ~10 nodes for the foreseeable internal fleet, manual cycling is operationally fine and gives each transition a clean Pending → Running boundary instead of a mid-flight transport-mode flip. +**b) `spec.sidecar.tls` is set-once.** Once set, the value is immutable: disable (`set → nil`) and swap (`A → B`) are rejected at admission via CEL. The one allowed transition is **enable post-creation** (`nil → set`), which triggers a NodeUpdate-style plan that cycles the pod with the proxy mount. This decision evolved across two PRs: #258 made the spec fully immutable on the rationale that delete + recreate at ~10 nodes was operationally fine; #262 un-deferred the enable case after operator validation that per-node delete + recreate is more friction than warranted, but kept disable and swap blocked because they remain genuinely irreversible (swap in particular implies a new trust anchor, which warrants a clean Pending → Running boundary). The transport-mode-flip-during-cycle concern is addressed by sourcing transport selection from `status.currentSidecarTLSSecretName` (observed) rather than spec. **c) The cert→SeiNode contract is machine-readable, not docs-only.** The controller publishes `status.sidecarTLS.{secretName, requiredDNSNames}` whenever TLS is enabled. Platform tooling reads this directly rather than re-deriving the K8s service DNS convention from documentation. The controller validates the Secret's cert SANs against this list before allowing the pod to schedule. -What collapses under these decisions: -- The drift detector (`sidecarTLSDrift`) — gone; immutable spec means no drift on Running nodes. -- The status mirror (`status.currentSidecarTLS`) — gone; immutable spec means status = spec for TLS. -- The condition reason expansion (`TLSToggleStarted`, `UpdateAndTLSToggleStarted`) — gone; toggle isn't a NodeUpdate. -- `classifyPlan` discrimination — back to its pre-PR shape. -- The `ApplySidecarCert` task and `GenerateSidecarCertificate` resource generator — gone; cert is external. -- The `ObserveSidecarTLS` task — gone; nothing to observe on a Running node. -- The Issuer trust-anchor security surface (`SidecarTLSSpec.IssuerName`/`IssuerKind`) — gone; trust-anchor selection happens entirely in platform tooling, outside the controller's blast radius. +**d) Status is the source of truth for transport selection.** `SidecarURLForNode` reads `status.currentSidecarTLSSecretName` (observed), not `spec.sidecar.tls` (desired). The controller's sidecar client transport (cmd/main.go `newSidecarClient`) keys off the same observed field. Mid-rollout, the controller talks HTTP to the still-HTTP pod; after `ObserveSidecarTLS` stamps the mirror, both URL and transport flip together. This is the load-bearing fix for the windowing bug that bit the original PR #254 attempt. -What stays: -- `noderesource` pod/service generation with the existing `if SidecarTLSEnabled(node) { ... add proxy }` branching. -- `ApplyRBACProxyConfig` task (the kube-rbac-proxy authz ConfigMap is controller-owned and depends on SeiNode namespace/name). +**Load-bearing trust assumption:** the controller's `SidecarURLForNode` consumes `status.currentSidecarTLSSecretName`. Any actor with `seinodes/status` patch permission could flip the controller's transport selection (DoS on the reconcile loop, not data exfil — the sidecar still enforces its own auth). The existing RBAC (`manifests/role.yaml`) scopes status-write to the controller's own ServiceAccount only. This is canonical for controller-runtime managers but is now load-bearing; an admission-time check (ValidatingAdmissionPolicy) preventing non-controller writes to this field would be defense in depth. + +What lives in the controller under these decisions: +- Drift detector `sidecarTLSEnableDrift` — narrow: spec set + status empty + preflight `SidecarTLSSecretReady=True`. Disable and swap aren't detector-handled because CEL blocks them at admission. +- Status mirror `status.currentSidecarTLSSecretName` — drives transport selection; stamped only on rollout convergence (`StatefulSet.Status.ReadyReplicas >= *Replicas`) so a crashlooping proxy doesn't flip transport prematurely. +- `ObserveSidecarTLS` task — polls StatefulSet rollout and stamps the mirror; injected before any sidecar HTTP task in every plan that brings up a TLS-enabled pod. +- `ApplyRBACProxyConfig` task — controller-owned (the ConfigMap's SAR depends on namespace/name). + +What stays externalized: +- The `kubernetes.io/tls` Secret containing cert + key. +- Trust-anchor selection (Issuer / ClusterIssuer reference) — happens entirely in whatever produces the Secret. ## 1. CRD schema changes @@ -45,13 +46,9 @@ What stays: // SidecarTLSSpec, if set, fronts the sidecar API with kube-rbac-proxy // on :8443 using TLS material from a Secret in the SeiNode's -// namespace. The Secret is operator-provisioned (e.g., via a -// cert-manager Certificate in the platform GitOps repo); this -// controller does not create it. -// -// Immutable post-creation. Toggling TLS on an existing SeiNode is a -// delete + recreate operation; data persists via the SeiNode's PVC -// retention mechanism. +// namespace. Operator-provisioned; the controller does not create it. +// Set-once: enabling on an existing SeiNode is allowed (triggers a +// NodeUpdate-style pod cycle); disable and swap are CEL-rejected. // // +optional TLS *SidecarTLSSpec `json:"tls,omitempty"` @@ -191,33 +188,37 @@ There is no `WaitForSidecarTLSSecret` task in the plan — its job is absorbed i ## 4. Running-state behavior -`buildRunningPlan` reverts to its pre-PR-#254 shape: image drift + sidecar re-approval, no TLS handling. There is no TLS drift to detect. +`buildRunningPlan` detects two drifts: image drift (existing) and TLS-enable drift (added via #262). Both compose into one NodeUpdate plan when they co-fire; a single pod cycle covers both. -If an operator attempts to mutate `spec.sidecar.tls`, the CRD CEL rejects the API request — no controller code runs. +TLS-enable drift fires only on the `nil → set` transition (operator added `spec.sidecar.tls` to an existing SeiNode that previously had none). Disable and swap are CEL-rejected at admission; the detector never sees them. The plan composes `ApplyRBACProxyConfig → ApplyStatefulSet → ApplyService → ReplacePod → [ObserveImage if image drift] → ObserveSidecarTLS → MarkReady`. `ObserveSidecarTLS` polls StatefulSet rollout convergence (requires `ReadyReplicas >= *Replicas` so a crashlooping proxy doesn't flip transport prematurely), then stamps `status.currentSidecarTLSSecretName`. If the externally-provisioned Secret rotates in place (cert-manager renewal with the same SANs): kube-rbac-proxy's existing `--tls-reload-interval=30s` flag picks up the new material from the same Secret mount; no pod restart needed; no controller action. Pre-flight re-runs each reconcile and continues stamping `SidecarTLSSecretReady=True`. -If the Secret SAN coverage breaks (wrong SANs after a misconfigured re-issuance, Secret deleted, etc.): pre-flight flips the condition to `SidecarTLSSecretReady=False`. Plan creation is gated — any plan that fires under a broken Secret would eventually cycle the pod (image-drift NodeUpdate plans always do; even a mark-ready replan ultimately retries the sidecar HTTP call through the proxy), and a cycled pod with a missing or mis-SAN'd Secret won't bind cleanly. The Running node keeps serving on whatever cert kube-rbac-proxy is already bound to (no controller action can force a reload-from-bad-Secret), so the user-visible effect is "running pod stays as-is, no new rollouts until the operator fixes the Secret." +If the Secret SAN coverage breaks (wrong SANs after a misconfigured re-issuance, Secret deleted, etc.): pre-flight flips the condition to `SidecarTLSSecretReady=False`. Plan creation is gated for any TLS-required SeiNode whose condition is not Ready. The Running node keeps serving on whatever cert kube-rbac-proxy is already bound to (no controller action can force a reload-from-bad-Secret), so the user-visible effect is "running pod stays as-is, no new rollouts until the operator fixes the Secret." + +**Mid-rollout transport invariant:** `SidecarURLForNode` reads `status.currentSidecarTLSSecretName`; cmd/main.go's `newSidecarClient` keys its HTTP-doer choice on the same field. The two move in lockstep. During the enable rollout, both stay on HTTP while the old pod still listens on `:7777`; once `ObserveSidecarTLS` stamps the mirror, both flip to HTTPS for the new pod. + +## 5. Operator workflow -## 5. Operator workflow for "enable TLS on existing fleet" +**Enable TLS at SeiNode creation:** apply a SeiNode with `spec.sidecar.tls.secretName` referencing a pre-provisioned `kubernetes.io/tls` Secret. SeiNode stays `Pending` with `SidecarTLSSecretReady=False/NotFound` until the Secret resolves; init plan runs once the condition flips to True. -The arctic-1 / atlantic-2 / pacific-1 use case: +**Enable TLS on an existing Running SeiNode** (the arctic-1 / atlantic-2 / pacific-1 fleet path): -1. Platform tooling pre-provisions a `kubernetes.io/tls` Secret per SeiNode (cert-manager `Certificate` resources, GitOps-managed) with SANs matching the published `status.sidecarTLS.requiredDNSNames` — readable in advance from a dry-run of the target spec or from a known template. -2. Per SeiNode (one at a time, governed by ops policy): - a. Detach the data PVC (existing `spec.dataVolume.import.pvcName` flow, or set `spec.dataVolume.retainOnDelete: true` if that lands first). - b. Delete the SeiNode. - c. Create a new SeiNode with the same name, `spec.sidecar.tls.secretName` set, and `spec.dataVolume.import.pvcName` pointing at the retained PVC. - d. Wait for `Phase=Running`. +1. Platform tooling provisions a `kubernetes.io/tls` Secret with SANs matching `status.sidecarTLS.requiredDNSNames` (pre-computable as `{name}.{ns}.svc.cluster.local` + `{name}-0.{name}.{ns}.svc.cluster.local`). +2. Operator patches `spec.sidecar.tls.secretName` on the running SeiNode. +3. Preflight validates the Secret on the next reconcile; condition flips to True. +4. `sidecarTLSEnableDrift` fires; planner builds a NodeUpdate plan composing the proxy `ConfigMap` apply + StatefulSet/Service regen + pod cycle + observer + MarkReady. +5. SeiNode stays in `Phase=Running` throughout; transport selection lags spec until `ObserveSidecarTLS` confirms `ReadyReplicas`. -At ~10 nodes total, this is a one-time script. No SND-level orchestration changes needed; the SND already supports its existing rollout primitives. +**Disable or swap:** delete + recreate the SeiNode. PVC retention via existing `spec.dataVolume.import.pvcName` (pre-detach) or future `spec.dataVolume.retainOnDelete`. Disable and swap remain CEL-rejected on existing objects because they require a clean Pending → Running boundary that mid-flight orchestration cannot provide. ## 6. Out of scope (deferred or unrelated) - **`spec.dataVolume.retainOnDelete: bool`** as a one-step PVC retention primitive (replaces the two-step pre-detach workflow). Small follow-up issue. Not blocking. - **SND-level retain-on-delete** ("delete the SND, keep the SeiNodes") — independently useful as a "drop the orchestrator, keep the fleet" escape hatch but unrelated to this LLD. Separate follow-up if desired. -- **In-place TLS toggle on Running SeiNodes** — explicitly rejected in §0.1(b). Re-open as a separate design if the fleet scale or operational model changes such that delete + recreate is no longer feasible. +- **In-place TLS disable or swap on Running SeiNodes** — CEL-rejected. Re-open as a separate design only if a coherent rollout path exists; today neither has one (disable would leave the proxy in place mid-flight serving stale state; swap implies a trust-anchor change that warrants Pending → Running). - **CA rotation observability** (cert fingerprint, SecretResourceVersion in status) — moot under the externalized-cert model; cert content is not the controller's concern. +- **Admission-policy hardening of `status.currentSidecarTLSSecretName` writes.** The current trust assumption is that only the controller's ServiceAccount has `seinodes/status` patch permission. A ValidatingAdmissionPolicy preventing non-controller writes to this specific field would be defense in depth. Tracked separately. ## 7. Cross-cutting (operational notes) diff --git a/internal/planner/bootstrap.go b/internal/planner/bootstrap.go index 4da0122..d0645a7 100644 --- a/internal/planner/bootstrap.go +++ b/internal/planner/bootstrap.go @@ -113,8 +113,14 @@ func buildBootstrapPlan( return nil, err } if noderesource.SidecarTLSEnabled(node) { - // Inject before sidecar HTTP tasks so SidecarURLForNode picks up - // TLS transport (via status.currentSidecarTLSSecretName) first. + // Inject between the production-pod StatefulSet apply and the + // Phase-5 post-bootstrap sidecar progression. Phase-2 bootstrap + // sidecar tasks run earlier against the bootstrap pod (plain + // HTTP, no proxy); status.CurrentSidecarTLSSecretName is empty + // at that point so SidecarURLForNode and newSidecarClient both + // use HTTP transport. After this task stamps the status mirror, + // Phase-5 sidecar tasks switch to HTTPS against the production + // pod's kube-rbac-proxy. if err := appendTask(task.TaskTypeObserveSidecarTLS, &task.ObserveSidecarTLSParams{NodeName: node.Name, Namespace: node.Namespace}); err != nil { return nil, err diff --git a/internal/planner/planner.go b/internal/planner/planner.go index ccbc120..dc9a526 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -236,7 +236,7 @@ func classifyPlan(plan *seiv1alpha1.TaskPlan) string { } switch { case hasImage && hasTLS: - return "node-update+tls-enable" + return "node-update-tls-enable" case hasImage: return "node-update" case hasTLS: @@ -555,15 +555,15 @@ func buildBasePlan( prog = append(prog, task.TaskTypeValidateOperatorKeyring) } if noderesource.SidecarTLSEnabled(node) { + // ApplyRBACProxyConfig must precede ApplyStatefulSet (proxy + // mounts the ConfigMap). ObserveSidecarTLS must precede the + // sidecar HTTP progression so SidecarURLForNode picks up the + // TLS transport before SnapshotRestore / ConfigApply / MarkReady. prog = append(prog, task.TaskTypeApplyRBACProxyConfig) - } - prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService) - if noderesource.SidecarTLSEnabled(node) { - // Insert before the sidecar HTTP progression so SidecarURLForNode - // picks up the TLS transport mode (via the in-memory mutation of - // status.currentSidecarTLSSecretName) before the first sidecar - // HTTP call (SnapshotRestore / ConfigApply / MarkReady) fires. + prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService) prog = append(prog, task.TaskTypeObserveSidecarTLS) + } else { + prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService) } prog = append(prog, sidecarProg...) @@ -752,7 +752,7 @@ func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) // ResolvePlan gate also blocks plan creation when the condition is False, // so this is belt-and-suspenders. func sidecarTLSEnableDrift(node *seiv1alpha1.SeiNode) bool { - if !noderesource.SidecarTLSEnabled(node) { + if node.Spec.Sidecar == nil || node.Spec.Sidecar.TLS == nil { return false } if node.Status.CurrentSidecarTLSSecretName == node.Spec.Sidecar.TLS.SecretName { diff --git a/internal/planner/sidecar_tls_enable_test.go b/internal/planner/sidecar_tls_enable_test.go index cbbb826..76a33f8 100644 --- a/internal/planner/sidecar_tls_enable_test.go +++ b/internal/planner/sidecar_tls_enable_test.go @@ -118,9 +118,10 @@ func TestBuildRunningPlan_ImageAndTLSEnable_CoCompose(t *testing.T) { } func TestBuildRunningPlan_ImageOnlyDrift_TLSAlreadyConverged(t *testing.T) { - // TLS is currently live; image drift fires. ObserveSidecarTLS is - // still injected (idempotent) so SidecarURLForNode stays correct - // after the pod cycle re-mounts the Secret on the new pod. + // Image drift on a node whose TLS is already converged. TLS-enable + // drift detector returns false (status mirrors spec), so the plan + // omits ApplyRBACProxyConfig and ObserveSidecarTLS — the existing + // ConfigMap and status mirror stand from the prior plan. g := NewWithT(t) node := tlsEnableRunningNode() node.Status.CurrentSidecarTLSSecretName = tlsEnableSecretName // already converged @@ -131,10 +132,7 @@ func TestBuildRunningPlan_ImageOnlyDrift_TLSAlreadyConverged(t *testing.T) { g.Expect(err).NotTo(HaveOccurred()) g.Expect(plan).NotTo(BeNil()) - types := taskTypes(plan) - // No TLS-enable drift here — ObserveSidecarTLS not added because - // drift detector returned false (status already mirrors spec). - g.Expect(types).To(Equal([]string{ + g.Expect(taskTypes(plan)).To(Equal([]string{ task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, task.TaskTypeReplacePod, @@ -191,7 +189,7 @@ func TestClassifyPlan_ImageAndTLSEnable(t *testing.T) { setTLSSecretReady2(node, metav1.ConditionTrue) plan, err := buildRunningPlan(node) g.Expect(err).NotTo(HaveOccurred()) - g.Expect(classifyPlan(plan)).To(Equal("node-update+tls-enable")) + g.Expect(classifyPlan(plan)).To(Equal("node-update-tls-enable")) } // --- init-plan ObserveSidecarTLS insertion --- diff --git a/internal/planner/sidecar_tls_test.go b/internal/planner/sidecar_tls_test.go index 852b37c..0034039 100644 --- a/internal/planner/sidecar_tls_test.go +++ b/internal/planner/sidecar_tls_test.go @@ -57,6 +57,23 @@ func TestSidecarURLForNode_TLSSpecButNotYetObserved(t *testing.T) { } } +// TestSidecarURLForNode_StatusSetWithoutSpec exercises an edge case where +// the status mirror is non-empty but spec.sidecar.tls is nil (e.g., a +// hypothetical future disable transition mid-flight). The URL function +// reads observed state only, so the controller still routes through TLS +// until the next observer cycle clears the status mirror. +func TestSidecarURLForNode_StatusSetWithoutSpec(t *testing.T) { + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: "sei"}, + Status: seiv1alpha1.SeiNodeStatus{CurrentSidecarTLSSecretName: testValidatorName + "-tls"}, + } + got := SidecarURLForNode(node) + want := fmt.Sprintf("https://%s-0.%s.sei.svc.cluster.local:8443", testValidatorName, testValidatorName) + if got != want { + t.Errorf("URL = %q, want %q (status mirror is the source of truth)", got, want) + } +} + const ( testValidatorName = "validator-0" testSeidImage = "seid:v6.4.1" diff --git a/internal/task/observe_sidecar_tls.go b/internal/task/observe_sidecar_tls.go index 2aed704..9437d04 100644 --- a/internal/task/observe_sidecar_tls.go +++ b/internal/task/observe_sidecar_tls.go @@ -67,7 +67,15 @@ func (e *observeSidecarTLSExecution) Execute(ctx context.Context) error { if sts.Status.ObservedGeneration < sts.Generation { return nil } - if sts.Spec.Replicas == nil || sts.Status.UpdatedReplicas < *sts.Spec.Replicas { + if sts.Spec.Replicas == nil { + return nil + } + // Stricter than ObserveImage: require ReadyReplicas (kube-rbac-proxy + // passed its readiness probe), not just UpdatedReplicas (template + // applied). Stamping the status mirror flips SidecarURLForNode to + // HTTPS; if the proxy is still crash-looping, MarkReady would hit + // connection-refused. Gate on actual proxy serving. + if sts.Status.UpdatedReplicas < *sts.Spec.Replicas || sts.Status.ReadyReplicas < *sts.Spec.Replicas { return nil } diff --git a/internal/task/observe_sidecar_tls_test.go b/internal/task/observe_sidecar_tls_test.go index 6c998ff..0574abf 100644 --- a/internal/task/observe_sidecar_tls_test.go +++ b/internal/task/observe_sidecar_tls_test.go @@ -70,7 +70,9 @@ func TestObserveSidecarTLS_RolloutComplete_StampsCurrent(t *testing.T) { sts := &appsv1.StatefulSet{ ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, - Status: appsv1.StatefulSetStatus{ObservedGeneration: 2, UpdatedReplicas: 1, Replicas: 1}, + Status: appsv1.StatefulSetStatus{ + ObservedGeneration: 2, UpdatedReplicas: 1, ReadyReplicas: 1, Replicas: 1, + }, } exec := newObserveTLSExec(t, observeTLSCfg(t, node, sts)) @@ -80,6 +82,30 @@ func TestObserveSidecarTLS_RolloutComplete_StampsCurrent(t *testing.T) { g.Expect(node.Status.CurrentSidecarTLSSecretName).To(Equal("my-tls-cert")) } +// TestObserveSidecarTLS_TemplateAppliedButProxyNotReady_StaysRunning is the +// crashlooping-proxy scenario: kubelet has applied the new pod template +// (UpdatedReplicas=1) but the proxy container's readiness probe is failing +// (ReadyReplicas=0). The observer must NOT stamp status — otherwise +// SidecarURLForNode would flip to HTTPS and MarkReady would hit +// connection-refused against the unready proxy. +func TestObserveSidecarTLS_TemplateAppliedButProxyNotReady_StaysRunning(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode("my-tls-cert") + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ + ObservedGeneration: 2, UpdatedReplicas: 1, ReadyReplicas: 0, Replicas: 1, + }, + } + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, sts)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) + g.Expect(node.Status.CurrentSidecarTLSSecretName).To(BeEmpty()) +} + func TestObserveSidecarTLS_RolloutInProgress_StaysRunning(t *testing.T) { g := NewWithT(t) node := observeTLSNode("my-tls-cert") From 1f3367379cd03bd3a19772708d976357a477029e Mon Sep 17 00:00:00 2001 From: bdchatham Date: Mon, 18 May 2026 08:23:26 -0700 Subject: [PATCH 3/3] fixup(seinode): trim comments Strip rationale paragraphs, cross-references, historical framing, and restatements of what the code obviously does. Keep present-tense one-liners where the WHY is non-obvious. Co-Authored-By: Claude Opus 4.7 (1M context) --- api/v1alpha1/seinode_types.go | 8 ++--- cmd/main.go | 6 +--- internal/noderesource/noderesource.go | 9 ++--- internal/planner/bootstrap.go | 12 ++----- internal/planner/planner.go | 39 +++++++-------------- internal/planner/sidecar_tls_enable_test.go | 14 +++----- internal/planner/sidecar_tls_test.go | 17 ++++----- internal/task/observe_sidecar_tls.go | 13 +++---- internal/task/observe_sidecar_tls_test.go | 8 ++--- 9 files changed, 38 insertions(+), 88 deletions(-) diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index 8cc8c82..b19b797 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -10,7 +10,7 @@ import ( // the populated field determines the node's operating mode. // +kubebuilder:validation:XValidation:rule="(has(self.fullNode) ? 1 : 0) + (has(self.archive) ? 1 : 0) + (has(self.replayer) ? 1 : 0) + (has(self.validator) ? 1 : 0) == 1",message="exactly one of fullNode, archive, replayer, or validator must be set" // +kubebuilder:validation:XValidation:rule="!has(self.replayer) || (has(self.peers) && size(self.peers) > 0)",message="peers is required when replayer mode is set" -// +kubebuilder:validation:XValidation:rule="(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? true : (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls == oldSelf.sidecar.tls)",message="spec.sidecar.tls is set-once: enabling on an existing SeiNode is allowed, but disabling or changing requires delete + recreate" +// +kubebuilder:validation:XValidation:rule="(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? true : (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls == oldSelf.sidecar.tls)",message="spec.sidecar.tls is set-once; disable and swap require delete + recreate" type SeiNodeSpec struct { // ChainID of the chain this node belongs to. // +kubebuilder:validation:MinLength=1 @@ -363,10 +363,8 @@ type SeiNodeStatus struct { // +optional SidecarTLS *SidecarTLSStatus `json:"sidecarTLS,omitempty"` - // CurrentSidecarTLSSecretName is the spec.sidecar.tls.secretName - // observed on the live pod after the rollout converged. Drives the - // controller's sidecar client transport-mode selection (HTTP vs - // HTTPS). Empty when no TLS rollout has converged yet. + // CurrentSidecarTLSSecretName is the secretName observed on the + // live pod. Empty when TLS hasn't rolled out yet. // +optional CurrentSidecarTLSSecretName string `json:"currentSidecarTLSSecretName,omitempty"` } diff --git a/cmd/main.go b/cmd/main.go index 5cf132c..b67196b 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -176,11 +176,7 @@ func main() { newSidecarClient := func(node *seiv1alpha1.SeiNode) (*sidecar.SidecarClient, error) { url := noderesource.SidecarURLForNode(node) - // Key the transport on the same observed-state field that - // SidecarURLForNode uses, not on spec. Mid-rollout (spec set, - // pod not yet cycled) the URL is still HTTP and the transport - // must match — otherwise mTLS-attempting doer hits an HTTP - // endpoint and the handshake fails. + // Same observed-state predicate as SidecarURLForNode. if node.Status.CurrentSidecarTLSSecretName != "" { return sidecar.NewSidecarClient(url, sidecar.WithHTTPDoer(sharedTLSDoer)) } diff --git a/internal/noderesource/noderesource.go b/internal/noderesource/noderesource.go index 30913c4..0da2d31 100644 --- a/internal/noderesource/noderesource.go +++ b/internal/noderesource/noderesource.go @@ -1155,12 +1155,9 @@ func GenerateRBACProxyConfigMap(node *seiv1alpha1.SeiNode) *corev1.ConfigMap { } } -// SidecarURLForNode builds the in-cluster URL the controller uses to -// reach a node's sidecar. Routes through kube-rbac-proxy on HTTPS :8443 -// when the live pod is serving TLS; otherwise plain HTTP on the sidecar -// port. Reads the OBSERVED state (Status.CurrentSidecarTLSSecretName), -// not spec, so a mid-rollout SeiNode (spec.tls just set, pod not yet -// cycled) keeps using HTTP until the new pod is up. +// SidecarURLForNode returns the in-cluster URL for a node's sidecar. +// HTTPS :8443 via kube-rbac-proxy when status reports TLS live; +// otherwise plain HTTP on the sidecar port. func SidecarURLForNode(node *seiv1alpha1.SeiNode) string { scheme, port := "http", SidecarPort(node) if node.Status.CurrentSidecarTLSSecretName != "" { diff --git a/internal/planner/bootstrap.go b/internal/planner/bootstrap.go index d0645a7..b49f79b 100644 --- a/internal/planner/bootstrap.go +++ b/internal/planner/bootstrap.go @@ -113,14 +113,8 @@ func buildBootstrapPlan( return nil, err } if noderesource.SidecarTLSEnabled(node) { - // Inject between the production-pod StatefulSet apply and the - // Phase-5 post-bootstrap sidecar progression. Phase-2 bootstrap - // sidecar tasks run earlier against the bootstrap pod (plain - // HTTP, no proxy); status.CurrentSidecarTLSSecretName is empty - // at that point so SidecarURLForNode and newSidecarClient both - // use HTTP transport. After this task stamps the status mirror, - // Phase-5 sidecar tasks switch to HTTPS against the production - // pod's kube-rbac-proxy. + // Phase-2 tasks ran against the bootstrap pod (HTTP, no proxy); + // this stamps the status mirror so Phase-5 tasks use HTTPS. if err := appendTask(task.TaskTypeObserveSidecarTLS, &task.ObserveSidecarTLSParams{NodeName: node.Name, Namespace: node.Namespace}); err != nil { return nil, err @@ -204,8 +198,6 @@ func buildGenesisPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) task.TaskTypeApplyService, ) if noderesource.SidecarTLSEnabled(node) { - // Inject before sidecar HTTP tasks so SidecarURLForNode picks up - // TLS transport before the first sidecar call. prog = append(prog, task.TaskTypeObserveSidecarTLS) } prog = append(prog, diff --git a/internal/planner/planner.go b/internal/planner/planner.go index dc9a526..bb68ad9 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -555,13 +555,14 @@ func buildBasePlan( prog = append(prog, task.TaskTypeValidateOperatorKeyring) } if noderesource.SidecarTLSEnabled(node) { - // ApplyRBACProxyConfig must precede ApplyStatefulSet (proxy - // mounts the ConfigMap). ObserveSidecarTLS must precede the - // sidecar HTTP progression so SidecarURLForNode picks up the - // TLS transport before SnapshotRestore / ConfigApply / MarkReady. - prog = append(prog, task.TaskTypeApplyRBACProxyConfig) - prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService) - prog = append(prog, task.TaskTypeObserveSidecarTLS) + // ConfigMap before StatefulSet (proxy mounts it); observer + // before sidecar HTTP tasks (transport flip). + prog = append(prog, + task.TaskTypeApplyRBACProxyConfig, + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + task.TaskTypeObserveSidecarTLS, + ) } else { prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService) } @@ -733,9 +734,7 @@ func commonOverrides(node *seiv1alpha1.SeiNode) map[string]string { return out } -// buildRunningPlan returns a steady-state drift plan, or nil if no drift. -// Image drift and TLS-enable drift co-compose into one NodeUpdate plan so -// a single pod cycle covers both. +// buildRunningPlan returns a drift plan, or nil if none applies. func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { if node.Spec.Image != node.Status.CurrentImage || sidecarTLSEnableDrift(node) { return buildNodeUpdatePlan(node) @@ -746,11 +745,8 @@ func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) return nil, nil } -// sidecarTLSEnableDrift reports whether the operator added spec.sidecar.tls -// post-creation and the live pod hasn't yet been rolled with TLS. -// Returns false unless the preflight has validated the Secret — the -// ResolvePlan gate also blocks plan creation when the condition is False, -// so this is belt-and-suspenders. +// sidecarTLSEnableDrift fires when spec.sidecar.tls is set, the pod +// hasn't been rolled with TLS yet, and preflight has validated the Secret. func sidecarTLSEnableDrift(node *seiv1alpha1.SeiNode) bool { if node.Spec.Sidecar == nil || node.Spec.Sidecar.TLS == nil { return false @@ -780,13 +776,8 @@ func buildMarkReadyPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error }, nil } -// buildNodeUpdatePlan constructs a plan to roll out an image update, -// a TLS-enable transition, or both on a Running node. The plan applies -// the regenerated StatefulSet spec, cycles the pod, observes whichever -// fields drifted, then re-initializes the sidecar. -// -// FailedPhase is deliberately empty: a failure retries on the next reconcile -// rather than transitioning the node out of Running. +// buildNodeUpdatePlan composes image-drift, TLS-enable drift, or both. +// FailedPhase is empty: failures retry, not transition out of Running. func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { imageDrift := node.Spec.Image != node.Status.CurrentImage tlsEnable := sidecarTLSEnableDrift(node) @@ -795,10 +786,6 @@ func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, erro var prog []string if tlsEnable { - // The kube-rbac-proxy authz ConfigMap is controller-owned and - // must exist before ApplyStatefulSet schedules the pod with the - // proxy container mount. The TLS Secret itself is operator- - // provisioned and already validated by preflight. prog = append(prog, task.TaskTypeApplyRBACProxyConfig) } prog = append(prog, diff --git a/internal/planner/sidecar_tls_enable_test.go b/internal/planner/sidecar_tls_enable_test.go index 76a33f8..86c2f69 100644 --- a/internal/planner/sidecar_tls_enable_test.go +++ b/internal/planner/sidecar_tls_enable_test.go @@ -30,7 +30,6 @@ func tlsEnableRunningNode() *seiv1alpha1.SeiNode { Status: seiv1alpha1.SeiNodeStatus{ Phase: seiv1alpha1.PhaseRunning, CurrentImage: tlsImageV1, - // CurrentSidecarTLSSecretName intentionally empty — TLS just enabled. }, } } @@ -98,7 +97,7 @@ func TestBuildRunningPlan_TLSEnableOnly(t *testing.T) { func TestBuildRunningPlan_ImageAndTLSEnable_CoCompose(t *testing.T) { g := NewWithT(t) node := tlsEnableRunningNode() - node.Spec.Image = tlsImageV2 // image drift on top of TLS enable + node.Spec.Image = tlsImageV2 setTLSSecretReady2(node, metav1.ConditionTrue) plan, err := buildRunningPlan(node) @@ -114,18 +113,14 @@ func TestBuildRunningPlan_ImageAndTLSEnable_CoCompose(t *testing.T) { task.TaskTypeObserveImage, task.TaskTypeObserveSidecarTLS, TaskMarkReady, - }), "both observers run in one plan; single pod cycle covers both drifts") + })) } func TestBuildRunningPlan_ImageOnlyDrift_TLSAlreadyConverged(t *testing.T) { - // Image drift on a node whose TLS is already converged. TLS-enable - // drift detector returns false (status mirrors spec), so the plan - // omits ApplyRBACProxyConfig and ObserveSidecarTLS — the existing - // ConfigMap and status mirror stand from the prior plan. g := NewWithT(t) node := tlsEnableRunningNode() - node.Status.CurrentSidecarTLSSecretName = tlsEnableSecretName // already converged - node.Spec.Image = tlsImageV2 // image drift only + node.Status.CurrentSidecarTLSSecretName = tlsEnableSecretName + node.Spec.Image = tlsImageV2 setTLSSecretReady2(node, metav1.ConditionTrue) plan, err := buildRunningPlan(node) @@ -157,7 +152,6 @@ func TestResolvePlan_TLSEnable_SetsAndClearsCondition(t *testing.T) { g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) g.Expect(cond.Message).To(ContainSubstring("tls enable")) - // Simulate plan completion + observer stamping the status mirror. node.Status.Plan.Phase = seiv1alpha1.TaskPlanComplete node.Status.CurrentSidecarTLSSecretName = tlsEnableSecretName diff --git a/internal/planner/sidecar_tls_test.go b/internal/planner/sidecar_tls_test.go index 0034039..e70d15a 100644 --- a/internal/planner/sidecar_tls_test.go +++ b/internal/planner/sidecar_tls_test.go @@ -38,10 +38,8 @@ func TestSidecarURLForNode_TLSRoutesThroughProxyHTTPS(t *testing.T) { } } -// TestSidecarURLForNode_TLSSpecButNotYetObserved exercises the mid-rollout -// transport state: spec.sidecar.tls is set but the pod hasn't been cycled -// (status.currentSidecarTLSSecretName empty). Controller MUST use plain -// HTTP against the still-HTTP pod until the rollout converges. +// TestSidecarURLForNode_TLSSpecButNotYetObserved: spec set, status empty +// → HTTP until ObserveSidecarTLS stamps the mirror. func TestSidecarURLForNode_TLSSpecButNotYetObserved(t *testing.T) { node := &seiv1alpha1.SeiNode{ ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: "sei"}, @@ -53,15 +51,12 @@ func TestSidecarURLForNode_TLSSpecButNotYetObserved(t *testing.T) { got := SidecarURLForNode(node) want := fmt.Sprintf("http://%s-0.%s.sei.svc.cluster.local:7777", testValidatorName, testValidatorName) if got != want { - t.Errorf("mid-rollout URL = %q, want %q (must stay HTTP until ObserveSidecarTLS stamps status)", got, want) + t.Errorf("URL = %q, want %q", got, want) } } -// TestSidecarURLForNode_StatusSetWithoutSpec exercises an edge case where -// the status mirror is non-empty but spec.sidecar.tls is nil (e.g., a -// hypothetical future disable transition mid-flight). The URL function -// reads observed state only, so the controller still routes through TLS -// until the next observer cycle clears the status mirror. +// TestSidecarURLForNode_StatusSetWithoutSpec: status drives transport, +// not spec. func TestSidecarURLForNode_StatusSetWithoutSpec(t *testing.T) { node := &seiv1alpha1.SeiNode{ ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: "sei"}, @@ -70,7 +65,7 @@ func TestSidecarURLForNode_StatusSetWithoutSpec(t *testing.T) { got := SidecarURLForNode(node) want := fmt.Sprintf("https://%s-0.%s.sei.svc.cluster.local:8443", testValidatorName, testValidatorName) if got != want { - t.Errorf("URL = %q, want %q (status mirror is the source of truth)", got, want) + t.Errorf("URL = %q, want %q", got, want) } } diff --git a/internal/task/observe_sidecar_tls.go b/internal/task/observe_sidecar_tls.go index 9437d04..fc067a2 100644 --- a/internal/task/observe_sidecar_tls.go +++ b/internal/task/observe_sidecar_tls.go @@ -42,10 +42,8 @@ func deserializeObserveSidecarTLS(id string, params json.RawMessage, cfg Executi }, nil } -// Execute polls the StatefulSet rollout. Once converged, stamps -// status.currentSidecarTLSSecretName from spec so the controller's -// sidecar client transport-mode selection (SidecarURLForNode) picks -// up the new live state before the plan's MarkReady call fires. +// Execute polls StatefulSet rollout convergence and stamps +// status.currentSidecarTLSSecretName from spec on success. func (e *observeSidecarTLSExecution) Execute(ctx context.Context) error { node, err := ResourceAs[*seiv1alpha1.SeiNode](e.cfg) if err != nil { @@ -70,11 +68,8 @@ func (e *observeSidecarTLSExecution) Execute(ctx context.Context) error { if sts.Spec.Replicas == nil { return nil } - // Stricter than ObserveImage: require ReadyReplicas (kube-rbac-proxy - // passed its readiness probe), not just UpdatedReplicas (template - // applied). Stamping the status mirror flips SidecarURLForNode to - // HTTPS; if the proxy is still crash-looping, MarkReady would hit - // connection-refused. Gate on actual proxy serving. + // Require ReadyReplicas (proxy passed its readiness probe), not + // just UpdatedReplicas — stamping flips transport to HTTPS. if sts.Status.UpdatedReplicas < *sts.Spec.Replicas || sts.Status.ReadyReplicas < *sts.Spec.Replicas { return nil } diff --git a/internal/task/observe_sidecar_tls_test.go b/internal/task/observe_sidecar_tls_test.go index 0574abf..d5c21e8 100644 --- a/internal/task/observe_sidecar_tls_test.go +++ b/internal/task/observe_sidecar_tls_test.go @@ -82,12 +82,8 @@ func TestObserveSidecarTLS_RolloutComplete_StampsCurrent(t *testing.T) { g.Expect(node.Status.CurrentSidecarTLSSecretName).To(Equal("my-tls-cert")) } -// TestObserveSidecarTLS_TemplateAppliedButProxyNotReady_StaysRunning is the -// crashlooping-proxy scenario: kubelet has applied the new pod template -// (UpdatedReplicas=1) but the proxy container's readiness probe is failing -// (ReadyReplicas=0). The observer must NOT stamp status — otherwise -// SidecarURLForNode would flip to HTTPS and MarkReady would hit -// connection-refused against the unready proxy. +// TestObserveSidecarTLS_TemplateAppliedButProxyNotReady_StaysRunning: +// UpdatedReplicas=1 but ReadyReplicas=0 must not stamp status. func TestObserveSidecarTLS_TemplateAppliedButProxyNotReady_StaysRunning(t *testing.T) { g := NewWithT(t) node := observeTLSNode("my-tls-cert")