diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index e9444c7..b19b797 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -10,7 +10,7 @@ import ( // the populated field determines the node's operating mode. // +kubebuilder:validation:XValidation:rule="(has(self.fullNode) ? 1 : 0) + (has(self.archive) ? 1 : 0) + (has(self.replayer) ? 1 : 0) + (has(self.validator) ? 1 : 0) == 1",message="exactly one of fullNode, archive, replayer, or validator must be set" // +kubebuilder:validation:XValidation:rule="!has(self.replayer) || (has(self.peers) && size(self.peers) > 0)",message="peers is required when replayer mode is set" -// +kubebuilder:validation:XValidation:rule="(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? (!has(self.sidecar) || !has(self.sidecar.tls)) : (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls == oldSelf.sidecar.tls)",message="spec.sidecar.tls is immutable; delete + recreate the SeiNode to change TLS configuration" +// +kubebuilder:validation:XValidation:rule="(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? true : (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls == oldSelf.sidecar.tls)",message="spec.sidecar.tls is set-once; disable and swap require delete + recreate" type SeiNodeSpec struct { // ChainID of the chain this node belongs to. // +kubebuilder:validation:MinLength=1 @@ -362,6 +362,11 @@ type SeiNodeStatus struct { // SidecarTLS is set when spec.sidecar.tls is non-nil. // +optional SidecarTLS *SidecarTLSStatus `json:"sidecarTLS,omitempty"` + + // CurrentSidecarTLSSecretName is the secretName observed on the + // live pod. Empty when TLS hasn't rolled out yet. + // +optional + CurrentSidecarTLSSecretName string `json:"currentSidecarTLSSecretName,omitempty"` } // SidecarTLSStatus is the controller's view of the referenced TLS Secret. diff --git a/cmd/main.go b/cmd/main.go index ed67e56..b67196b 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -176,7 +176,8 @@ func main() { newSidecarClient := func(node *seiv1alpha1.SeiNode) (*sidecar.SidecarClient, error) { url := noderesource.SidecarURLForNode(node) - if noderesource.SidecarTLSEnabled(node) { + // Same observed-state predicate as SidecarURLForNode. + if node.Status.CurrentSidecarTLSSecretName != "" { return sidecar.NewSidecarClient(url, sidecar.WithHTTPDoer(sharedTLSDoer)) } return sidecar.NewSidecarClient(url) diff --git a/config/crd/sei.io_seinodedeployments.yaml b/config/crd/sei.io_seinodedeployments.yaml index 3f564f2..7760100 100644 --- a/config/crd/sei.io_seinodedeployments.yaml +++ b/config/crd/sei.io_seinodedeployments.yaml @@ -895,11 +895,12 @@ spec: - message: peers is required when replayer mode is set rule: '!has(self.replayer) || (has(self.peers) && size(self.peers) > 0)' - - message: spec.sidecar.tls is immutable; delete + recreate the - SeiNode to change TLS configuration + - message: 'spec.sidecar.tls is set-once: enabling on an existing + SeiNode is allowed, but disabling or changing requires delete + + recreate' rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) - ? (!has(self.sidecar) || !has(self.sidecar.tls)) : (has(self.sidecar) - && has(self.sidecar.tls) && self.sidecar.tls == oldSelf.sidecar.tls)' + ? true : (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls + == oldSelf.sidecar.tls)' required: - spec type: object diff --git a/config/crd/sei.io_seinodes.yaml b/config/crd/sei.io_seinodes.yaml index 421e1c4..dd848aa 100644 --- a/config/crd/sei.io_seinodes.yaml +++ b/config/crd/sei.io_seinodes.yaml @@ -766,11 +766,11 @@ spec: - message: peers is required when replayer mode is set rule: '!has(self.replayer) || (has(self.peers) && size(self.peers) > 0)' - - message: spec.sidecar.tls is immutable; delete + recreate the SeiNode - to change TLS configuration - rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? (!has(self.sidecar) - || !has(self.sidecar.tls)) : (has(self.sidecar) && has(self.sidecar.tls) - && self.sidecar.tls == oldSelf.sidecar.tls)' + - message: 'spec.sidecar.tls is set-once: enabling on an existing SeiNode + is allowed, but disabling or changing requires delete + recreate' + rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? true : + (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls == + oldSelf.sidecar.tls)' status: description: SeiNodeStatus defines the observed state of a SeiNode. properties: @@ -841,6 +841,13 @@ spec: Parent controllers compare this against spec.image to determine whether a spec change has been fully actuated. type: string + currentSidecarTLSSecretName: + description: |- + CurrentSidecarTLSSecretName is the spec.sidecar.tls.secretName + observed on the live pod after the rollout converged. Drives the + controller's sidecar client transport-mode selection (HTTP vs + HTTPS). Empty when no TLS rollout has converged yet. + type: string externalAddress: description: |- ExternalAddress is the routable P2P address (host:port) for this node, diff --git a/docs/design-seinode-sidecar-tls-toggle-lld.md b/docs/design-seinode-sidecar-tls-toggle-lld.md index 6bc5816..89074c3 100644 --- a/docs/design-seinode-sidecar-tls-toggle-lld.md +++ b/docs/design-seinode-sidecar-tls-toggle-lld.md @@ -15,28 +15,29 @@ The first attempt at supporting TLS toggle on Running nodes (PR #254) added a dr This revision drops that approach entirely. The cert is no longer the controller's concern; it becomes platform-provisioned, mirroring how `spec.validator.signingKey`, `spec.validator.nodeKey`, `spec.validator.operatorKeyring`, and `spec.dataVolume.import.pvcName` already work. -## 0.1 Design choice: externalize cert ownership; spec is immutable +## 0.1 Design choice: externalize cert ownership; spec is set-once -Three coupled decisions, each enabling the next: +Three coupled decisions: **a) The controller does not provision the cert.** Operator/platform tooling (cert-manager `Certificate` in GitOps, or any other source) creates the `kubernetes.io/tls` Secret. The controller references it by name and gates progression on presence + cert validity. This matches every other security-sensitive resource in the SeiNode spec (`signingKey`, `nodeKey`, `operatorKeyring`, imported `PVC`). -**b) `spec.sidecar.tls` is immutable post-creation.** Toggling TLS on an existing SeiNode is *not* a controller-orchestrated transition — it's a delete + recreate. The data PVC is retained externally (today via `spec.dataVolume.import.pvcName` pre-detach; future-cleaner via `spec.dataVolume.retainOnDelete`). At ~10 nodes for the foreseeable internal fleet, manual cycling is operationally fine and gives each transition a clean Pending → Running boundary instead of a mid-flight transport-mode flip. +**b) `spec.sidecar.tls` is set-once.** Once set, the value is immutable: disable (`set → nil`) and swap (`A → B`) are rejected at admission via CEL. The one allowed transition is **enable post-creation** (`nil → set`), which triggers a NodeUpdate-style plan that cycles the pod with the proxy mount. This decision evolved across two PRs: #258 made the spec fully immutable on the rationale that delete + recreate at ~10 nodes was operationally fine; #262 un-deferred the enable case after operator validation that per-node delete + recreate is more friction than warranted, but kept disable and swap blocked because they remain genuinely irreversible (swap in particular implies a new trust anchor, which warrants a clean Pending → Running boundary). The transport-mode-flip-during-cycle concern is addressed by sourcing transport selection from `status.currentSidecarTLSSecretName` (observed) rather than spec. **c) The cert→SeiNode contract is machine-readable, not docs-only.** The controller publishes `status.sidecarTLS.{secretName, requiredDNSNames}` whenever TLS is enabled. Platform tooling reads this directly rather than re-deriving the K8s service DNS convention from documentation. The controller validates the Secret's cert SANs against this list before allowing the pod to schedule. -What collapses under these decisions: -- The drift detector (`sidecarTLSDrift`) — gone; immutable spec means no drift on Running nodes. -- The status mirror (`status.currentSidecarTLS`) — gone; immutable spec means status = spec for TLS. -- The condition reason expansion (`TLSToggleStarted`, `UpdateAndTLSToggleStarted`) — gone; toggle isn't a NodeUpdate. -- `classifyPlan` discrimination — back to its pre-PR shape. -- The `ApplySidecarCert` task and `GenerateSidecarCertificate` resource generator — gone; cert is external. -- The `ObserveSidecarTLS` task — gone; nothing to observe on a Running node. -- The Issuer trust-anchor security surface (`SidecarTLSSpec.IssuerName`/`IssuerKind`) — gone; trust-anchor selection happens entirely in platform tooling, outside the controller's blast radius. +**d) Status is the source of truth for transport selection.** `SidecarURLForNode` reads `status.currentSidecarTLSSecretName` (observed), not `spec.sidecar.tls` (desired). The controller's sidecar client transport (cmd/main.go `newSidecarClient`) keys off the same observed field. Mid-rollout, the controller talks HTTP to the still-HTTP pod; after `ObserveSidecarTLS` stamps the mirror, both URL and transport flip together. This is the load-bearing fix for the windowing bug that bit the original PR #254 attempt. -What stays: -- `noderesource` pod/service generation with the existing `if SidecarTLSEnabled(node) { ... add proxy }` branching. -- `ApplyRBACProxyConfig` task (the kube-rbac-proxy authz ConfigMap is controller-owned and depends on SeiNode namespace/name). +**Load-bearing trust assumption:** the controller's `SidecarURLForNode` consumes `status.currentSidecarTLSSecretName`. Any actor with `seinodes/status` patch permission could flip the controller's transport selection (DoS on the reconcile loop, not data exfil — the sidecar still enforces its own auth). The existing RBAC (`manifests/role.yaml`) scopes status-write to the controller's own ServiceAccount only. This is canonical for controller-runtime managers but is now load-bearing; an admission-time check (ValidatingAdmissionPolicy) preventing non-controller writes to this field would be defense in depth. + +What lives in the controller under these decisions: +- Drift detector `sidecarTLSEnableDrift` — narrow: spec set + status empty + preflight `SidecarTLSSecretReady=True`. Disable and swap aren't detector-handled because CEL blocks them at admission. +- Status mirror `status.currentSidecarTLSSecretName` — drives transport selection; stamped only on rollout convergence (`StatefulSet.Status.ReadyReplicas >= *Replicas`) so a crashlooping proxy doesn't flip transport prematurely. +- `ObserveSidecarTLS` task — polls StatefulSet rollout and stamps the mirror; injected before any sidecar HTTP task in every plan that brings up a TLS-enabled pod. +- `ApplyRBACProxyConfig` task — controller-owned (the ConfigMap's SAR depends on namespace/name). + +What stays externalized: +- The `kubernetes.io/tls` Secret containing cert + key. +- Trust-anchor selection (Issuer / ClusterIssuer reference) — happens entirely in whatever produces the Secret. ## 1. CRD schema changes @@ -45,13 +46,9 @@ What stays: // SidecarTLSSpec, if set, fronts the sidecar API with kube-rbac-proxy // on :8443 using TLS material from a Secret in the SeiNode's -// namespace. The Secret is operator-provisioned (e.g., via a -// cert-manager Certificate in the platform GitOps repo); this -// controller does not create it. -// -// Immutable post-creation. Toggling TLS on an existing SeiNode is a -// delete + recreate operation; data persists via the SeiNode's PVC -// retention mechanism. +// namespace. Operator-provisioned; the controller does not create it. +// Set-once: enabling on an existing SeiNode is allowed (triggers a +// NodeUpdate-style pod cycle); disable and swap are CEL-rejected. // // +optional TLS *SidecarTLSSpec `json:"tls,omitempty"` @@ -191,33 +188,37 @@ There is no `WaitForSidecarTLSSecret` task in the plan — its job is absorbed i ## 4. Running-state behavior -`buildRunningPlan` reverts to its pre-PR-#254 shape: image drift + sidecar re-approval, no TLS handling. There is no TLS drift to detect. +`buildRunningPlan` detects two drifts: image drift (existing) and TLS-enable drift (added via #262). Both compose into one NodeUpdate plan when they co-fire; a single pod cycle covers both. -If an operator attempts to mutate `spec.sidecar.tls`, the CRD CEL rejects the API request — no controller code runs. +TLS-enable drift fires only on the `nil → set` transition (operator added `spec.sidecar.tls` to an existing SeiNode that previously had none). Disable and swap are CEL-rejected at admission; the detector never sees them. The plan composes `ApplyRBACProxyConfig → ApplyStatefulSet → ApplyService → ReplacePod → [ObserveImage if image drift] → ObserveSidecarTLS → MarkReady`. `ObserveSidecarTLS` polls StatefulSet rollout convergence (requires `ReadyReplicas >= *Replicas` so a crashlooping proxy doesn't flip transport prematurely), then stamps `status.currentSidecarTLSSecretName`. If the externally-provisioned Secret rotates in place (cert-manager renewal with the same SANs): kube-rbac-proxy's existing `--tls-reload-interval=30s` flag picks up the new material from the same Secret mount; no pod restart needed; no controller action. Pre-flight re-runs each reconcile and continues stamping `SidecarTLSSecretReady=True`. -If the Secret SAN coverage breaks (wrong SANs after a misconfigured re-issuance, Secret deleted, etc.): pre-flight flips the condition to `SidecarTLSSecretReady=False`. Plan creation is gated — any plan that fires under a broken Secret would eventually cycle the pod (image-drift NodeUpdate plans always do; even a mark-ready replan ultimately retries the sidecar HTTP call through the proxy), and a cycled pod with a missing or mis-SAN'd Secret won't bind cleanly. The Running node keeps serving on whatever cert kube-rbac-proxy is already bound to (no controller action can force a reload-from-bad-Secret), so the user-visible effect is "running pod stays as-is, no new rollouts until the operator fixes the Secret." +If the Secret SAN coverage breaks (wrong SANs after a misconfigured re-issuance, Secret deleted, etc.): pre-flight flips the condition to `SidecarTLSSecretReady=False`. Plan creation is gated for any TLS-required SeiNode whose condition is not Ready. The Running node keeps serving on whatever cert kube-rbac-proxy is already bound to (no controller action can force a reload-from-bad-Secret), so the user-visible effect is "running pod stays as-is, no new rollouts until the operator fixes the Secret." + +**Mid-rollout transport invariant:** `SidecarURLForNode` reads `status.currentSidecarTLSSecretName`; cmd/main.go's `newSidecarClient` keys its HTTP-doer choice on the same field. The two move in lockstep. During the enable rollout, both stay on HTTP while the old pod still listens on `:7777`; once `ObserveSidecarTLS` stamps the mirror, both flip to HTTPS for the new pod. + +## 5. Operator workflow -## 5. Operator workflow for "enable TLS on existing fleet" +**Enable TLS at SeiNode creation:** apply a SeiNode with `spec.sidecar.tls.secretName` referencing a pre-provisioned `kubernetes.io/tls` Secret. SeiNode stays `Pending` with `SidecarTLSSecretReady=False/NotFound` until the Secret resolves; init plan runs once the condition flips to True. -The arctic-1 / atlantic-2 / pacific-1 use case: +**Enable TLS on an existing Running SeiNode** (the arctic-1 / atlantic-2 / pacific-1 fleet path): -1. Platform tooling pre-provisions a `kubernetes.io/tls` Secret per SeiNode (cert-manager `Certificate` resources, GitOps-managed) with SANs matching the published `status.sidecarTLS.requiredDNSNames` — readable in advance from a dry-run of the target spec or from a known template. -2. Per SeiNode (one at a time, governed by ops policy): - a. Detach the data PVC (existing `spec.dataVolume.import.pvcName` flow, or set `spec.dataVolume.retainOnDelete: true` if that lands first). - b. Delete the SeiNode. - c. Create a new SeiNode with the same name, `spec.sidecar.tls.secretName` set, and `spec.dataVolume.import.pvcName` pointing at the retained PVC. - d. Wait for `Phase=Running`. +1. Platform tooling provisions a `kubernetes.io/tls` Secret with SANs matching `status.sidecarTLS.requiredDNSNames` (pre-computable as `{name}.{ns}.svc.cluster.local` + `{name}-0.{name}.{ns}.svc.cluster.local`). +2. Operator patches `spec.sidecar.tls.secretName` on the running SeiNode. +3. Preflight validates the Secret on the next reconcile; condition flips to True. +4. `sidecarTLSEnableDrift` fires; planner builds a NodeUpdate plan composing the proxy `ConfigMap` apply + StatefulSet/Service regen + pod cycle + observer + MarkReady. +5. SeiNode stays in `Phase=Running` throughout; transport selection lags spec until `ObserveSidecarTLS` confirms `ReadyReplicas`. -At ~10 nodes total, this is a one-time script. No SND-level orchestration changes needed; the SND already supports its existing rollout primitives. +**Disable or swap:** delete + recreate the SeiNode. PVC retention via existing `spec.dataVolume.import.pvcName` (pre-detach) or future `spec.dataVolume.retainOnDelete`. Disable and swap remain CEL-rejected on existing objects because they require a clean Pending → Running boundary that mid-flight orchestration cannot provide. ## 6. Out of scope (deferred or unrelated) - **`spec.dataVolume.retainOnDelete: bool`** as a one-step PVC retention primitive (replaces the two-step pre-detach workflow). Small follow-up issue. Not blocking. - **SND-level retain-on-delete** ("delete the SND, keep the SeiNodes") — independently useful as a "drop the orchestrator, keep the fleet" escape hatch but unrelated to this LLD. Separate follow-up if desired. -- **In-place TLS toggle on Running SeiNodes** — explicitly rejected in §0.1(b). Re-open as a separate design if the fleet scale or operational model changes such that delete + recreate is no longer feasible. +- **In-place TLS disable or swap on Running SeiNodes** — CEL-rejected. Re-open as a separate design only if a coherent rollout path exists; today neither has one (disable would leave the proxy in place mid-flight serving stale state; swap implies a trust-anchor change that warrants Pending → Running). - **CA rotation observability** (cert fingerprint, SecretResourceVersion in status) — moot under the externalized-cert model; cert content is not the controller's concern. +- **Admission-policy hardening of `status.currentSidecarTLSSecretName` writes.** The current trust assumption is that only the controller's ServiceAccount has `seinodes/status` patch permission. A ValidatingAdmissionPolicy preventing non-controller writes to this specific field would be defense in depth. Tracked separately. ## 7. Cross-cutting (operational notes) diff --git a/internal/noderesource/noderesource.go b/internal/noderesource/noderesource.go index 17f502a..0da2d31 100644 --- a/internal/noderesource/noderesource.go +++ b/internal/noderesource/noderesource.go @@ -1155,15 +1155,12 @@ func GenerateRBACProxyConfigMap(node *seiv1alpha1.SeiNode) *corev1.ConfigMap { } } -// SidecarURLForNode builds the in-cluster URL the controller uses to -// reach a node's sidecar. TLS-enabled nodes route through the -// kube-rbac-proxy on HTTPS :8443; otherwise plain HTTP on the -// sidecar's own port. Single source of truth — both the planner -// (for plan-execution clients) and deployment-await tasks call -// through here. +// SidecarURLForNode returns the in-cluster URL for a node's sidecar. +// HTTPS :8443 via kube-rbac-proxy when status reports TLS live; +// otherwise plain HTTP on the sidecar port. func SidecarURLForNode(node *seiv1alpha1.SeiNode) string { scheme, port := "http", SidecarPort(node) - if SidecarTLSEnabled(node) { + if node.Status.CurrentSidecarTLSSecretName != "" { scheme, port = "https", RBACProxyPort } return fmt.Sprintf("%s://%s-0.%s.%s.svc.cluster.local:%d", diff --git a/internal/planner/bootstrap.go b/internal/planner/bootstrap.go index be1853b..b49f79b 100644 --- a/internal/planner/bootstrap.go +++ b/internal/planner/bootstrap.go @@ -112,6 +112,14 @@ func buildBootstrapPlan( &task.ApplyServiceParams{NodeName: node.Name, Namespace: node.Namespace}); err != nil { return nil, err } + if noderesource.SidecarTLSEnabled(node) { + // Phase-2 tasks ran against the bootstrap pod (HTTP, no proxy); + // this stamps the status mirror so Phase-5 tasks use HTTPS. + if err := appendTask(task.TaskTypeObserveSidecarTLS, + &task.ObserveSidecarTLSParams{NodeName: node.Name, Namespace: node.Namespace}); err != nil { + return nil, err + } + } // Phase 5: Post-bootstrap config on StatefulSet pod for _, taskType := range postProg { @@ -188,6 +196,11 @@ func buildGenesisPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, + ) + if noderesource.SidecarTLSEnabled(node) { + prog = append(prog, task.TaskTypeObserveSidecarTLS) + } + prog = append(prog, TaskGenerateIdentity, TaskGenerateGentx, TaskUploadGenesisArtifacts, diff --git a/internal/planner/node_update_test.go b/internal/planner/node_update_test.go index 333bc66..fa04268 100644 --- a/internal/planner/node_update_test.go +++ b/internal/planner/node_update_test.go @@ -97,7 +97,7 @@ func TestResolvePlan_NodeUpdate_SetsCondition(t *testing.T) { g.Expect(cond).NotTo(BeNil(), "NodeUpdateInProgress condition should be set") g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) g.Expect(cond.Reason).To(Equal("UpdateStarted")) - g.Expect(cond.Message).To(ContainSubstring("image drift detected")) + g.Expect(cond.Message).To(ContainSubstring("image drift")) } func TestResolvePlan_CompletedPlan_ClearsCondition(t *testing.T) { diff --git a/internal/planner/planner.go b/internal/planner/planner.go index 08a7b25..bb68ad9 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -220,14 +220,28 @@ func setNodeUpdateCondition(node *seiv1alpha1.SeiNode, status metav1.ConditionSt // classifyPlan returns the plan type for metrics. func classifyPlan(plan *seiv1alpha1.TaskPlan) string { + var hasImage, hasTLS, hasInit bool for _, t := range plan.Tasks { switch t.Type { case task.TaskTypeObserveImage: - return "node-update" + hasImage = true + case task.TaskTypeObserveSidecarTLS: + hasTLS = true case task.TaskTypeEnsureDataPVC: - return "init" + hasInit = true } } + if hasInit { + return "init" + } + switch { + case hasImage && hasTLS: + return "node-update-tls-enable" + case hasImage: + return "node-update" + case hasTLS: + return "tls-enable" + } if len(plan.Tasks) == 1 && plan.Tasks[0].Type == sidecar.TaskTypeMarkReady { return "mark-ready-reapply" } @@ -541,9 +555,17 @@ func buildBasePlan( prog = append(prog, task.TaskTypeValidateOperatorKeyring) } if noderesource.SidecarTLSEnabled(node) { - prog = append(prog, task.TaskTypeApplyRBACProxyConfig) + // ConfigMap before StatefulSet (proxy mounts it); observer + // before sidecar HTTP tasks (transport flip). + prog = append(prog, + task.TaskTypeApplyRBACProxyConfig, + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + task.TaskTypeObserveSidecarTLS, + ) + } else { + prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService) } - prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService) prog = append(prog, sidecarProg...) planID := uuid.New().String() @@ -588,6 +610,8 @@ func paramsForTaskType( return &task.ReplacePodParams{NodeName: node.Name, Namespace: node.Namespace} case task.TaskTypeObserveImage: return &task.ObserveImageParams{NodeName: node.Name, Namespace: node.Namespace} + case task.TaskTypeObserveSidecarTLS: + return &task.ObserveSidecarTLSParams{NodeName: node.Name, Namespace: node.Namespace} case task.TaskTypeValidateSigningKey: return validateSigningKeyParams(node) case task.TaskTypeValidateNodeKey: @@ -710,11 +734,9 @@ func commonOverrides(node *seiv1alpha1.SeiNode) map[string]string { return out } -// buildRunningPlan returns a steady-state drift plan, or nil if no drift. -// Image drift is checked first — its plan ends with MarkReady, so it also -// resolves any stale sidecar. +// buildRunningPlan returns a drift plan, or nil if none applies. func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { - if node.Spec.Image != node.Status.CurrentImage { + if node.Spec.Image != node.Status.CurrentImage || sidecarTLSEnableDrift(node) { return buildNodeUpdatePlan(node) } if sidecarNeedsReapproval(node) { @@ -723,6 +745,18 @@ func buildRunningPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) return nil, nil } +// sidecarTLSEnableDrift fires when spec.sidecar.tls is set, the pod +// hasn't been rolled with TLS yet, and preflight has validated the Secret. +func sidecarTLSEnableDrift(node *seiv1alpha1.SeiNode) bool { + if node.Spec.Sidecar == nil || node.Spec.Sidecar.TLS == nil { + return false + } + if node.Status.CurrentSidecarTLSSecretName == node.Spec.Sidecar.TLS.SecretName { + return false + } + return sidecarTLSSecretReady(node) +} + func sidecarNeedsReapproval(node *seiv1alpha1.SeiNode) bool { cond := meta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionSidecarReady) return cond != nil && cond.Status == metav1.ConditionFalse && cond.Reason == "NotReady" @@ -742,23 +776,30 @@ func buildMarkReadyPlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error }, nil } -// buildNodeUpdatePlan constructs a plan to roll out an image update on a -// Running node. The plan applies the new StatefulSet spec, waits for the -// rollout to complete, then re-initializes the sidecar. -// -// FailedPhase is deliberately empty: a failure retries on the next reconcile -// rather than transitioning the node out of Running. +// buildNodeUpdatePlan composes image-drift, TLS-enable drift, or both. +// FailedPhase is empty: failures retry, not transition out of Running. func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, error) { + imageDrift := node.Spec.Image != node.Status.CurrentImage + tlsEnable := sidecarTLSEnableDrift(node) setNodeUpdateCondition(node, metav1.ConditionTrue, "UpdateStarted", - fmt.Sprintf("image drift detected: spec=%s current=%s", node.Spec.Image, node.Status.CurrentImage)) + nodeUpdateMessage(node, imageDrift, tlsEnable)) - prog := []string{ + var prog []string + if tlsEnable { + prog = append(prog, task.TaskTypeApplyRBACProxyConfig) + } + prog = append(prog, task.TaskTypeApplyStatefulSet, task.TaskTypeApplyService, task.TaskTypeReplacePod, - task.TaskTypeObserveImage, - sidecar.TaskTypeMarkReady, + ) + if imageDrift { + prog = append(prog, task.TaskTypeObserveImage) + } + if tlsEnable { + prog = append(prog, task.TaskTypeObserveSidecarTLS) } + prog = append(prog, sidecar.TaskTypeMarkReady) planID := uuid.New().String() tasks := make([]seiv1alpha1.PlannedTask, len(prog)) @@ -777,6 +818,19 @@ func buildNodeUpdatePlan(node *seiv1alpha1.SeiNode) (*seiv1alpha1.TaskPlan, erro }, nil } +func nodeUpdateMessage(node *seiv1alpha1.SeiNode, imageDrift, tlsEnable bool) string { + switch { + case imageDrift && tlsEnable: + return fmt.Sprintf("image drift: spec=%s current=%s; tls enable: secretName=%s", + node.Spec.Image, node.Status.CurrentImage, node.Spec.Sidecar.TLS.SecretName) + case tlsEnable: + return fmt.Sprintf("tls enable: secretName=%s", node.Spec.Sidecar.TLS.SecretName) + default: + return fmt.Sprintf("image drift: spec=%s current=%s", + node.Spec.Image, node.Status.CurrentImage) + } +} + // mergeOverrides combines controller-generated overrides with user-specified // overrides. User overrides take precedence. func mergeOverrides(controllerOverrides, userOverrides map[string]string) map[string]string { diff --git a/internal/planner/sidecar_tls_enable_test.go b/internal/planner/sidecar_tls_enable_test.go new file mode 100644 index 0000000..86c2f69 --- /dev/null +++ b/internal/planner/sidecar_tls_enable_test.go @@ -0,0 +1,231 @@ +package planner + +import ( + "slices" + "testing" + + . "github.com/onsi/gomega" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/task" +) + +const ( + tlsEnableSecretName = "validator-0-tls" + tlsImageV1 = "sei:v1.0.0" + tlsImageV2 = "sei:v2.0.0" +) + +func tlsEnableRunningNode() *seiv1alpha1.SeiNode { + return &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: sourceChainID, Generation: 1}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: sourceChainID, + Image: tlsImageV1, + FullNode: &seiv1alpha1.FullNodeSpec{}, + Sidecar: &seiv1alpha1.SidecarConfig{TLS: &seiv1alpha1.SidecarTLSSpec{SecretName: tlsEnableSecretName}}, + }, + Status: seiv1alpha1.SeiNodeStatus{ + Phase: seiv1alpha1.PhaseRunning, + CurrentImage: tlsImageV1, + }, + } +} + +func setTLSSecretReady2(node *seiv1alpha1.SeiNode, status metav1.ConditionStatus) { + apimeta.SetStatusCondition(&node.Status.Conditions, metav1.Condition{ + Type: seiv1alpha1.ConditionSidecarTLSSecretReady, Status: status, Reason: seiv1alpha1.ReasonTLSSecretReady, + }) +} + +// --- sidecarTLSEnableDrift detector --- + +func TestSidecarTLSEnableDrift_FiresWhenSpecSetAndStatusEmptyAndReady(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + setTLSSecretReady2(node, metav1.ConditionTrue) + g.Expect(sidecarTLSEnableDrift(node)).To(BeTrue()) +} + +func TestSidecarTLSEnableDrift_FalseWhenAlreadyConverged(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + node.Status.CurrentSidecarTLSSecretName = tlsEnableSecretName + setTLSSecretReady2(node, metav1.ConditionTrue) + g.Expect(sidecarTLSEnableDrift(node)).To(BeFalse()) +} + +func TestSidecarTLSEnableDrift_FalseWhenPreflightNotReady(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + setTLSSecretReady2(node, metav1.ConditionFalse) + g.Expect(sidecarTLSEnableDrift(node)).To(BeFalse(), + "detector must wait for preflight to validate the Secret before triggering a plan") +} + +func TestSidecarTLSEnableDrift_FalseWhenTLSDisabled(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + node.Spec.Sidecar = nil + g.Expect(sidecarTLSEnableDrift(node)).To(BeFalse()) +} + +// --- buildRunningPlan composition --- + +func TestBuildRunningPlan_TLSEnableOnly(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + setTLSSecretReady2(node, metav1.ConditionTrue) + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + types := taskTypes(plan) + g.Expect(types).To(Equal([]string{ + task.TaskTypeApplyRBACProxyConfig, + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + task.TaskTypeReplacePod, + task.TaskTypeObserveSidecarTLS, + TaskMarkReady, + })) +} + +func TestBuildRunningPlan_ImageAndTLSEnable_CoCompose(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + node.Spec.Image = tlsImageV2 + setTLSSecretReady2(node, metav1.ConditionTrue) + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + types := taskTypes(plan) + g.Expect(types).To(Equal([]string{ + task.TaskTypeApplyRBACProxyConfig, + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + task.TaskTypeObserveSidecarTLS, + TaskMarkReady, + })) +} + +func TestBuildRunningPlan_ImageOnlyDrift_TLSAlreadyConverged(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + node.Status.CurrentSidecarTLSSecretName = tlsEnableSecretName + node.Spec.Image = tlsImageV2 + setTLSSecretReady2(node, metav1.ConditionTrue) + + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + g.Expect(taskTypes(plan)).To(Equal([]string{ + task.TaskTypeApplyStatefulSet, + task.TaskTypeApplyService, + task.TaskTypeReplacePod, + task.TaskTypeObserveImage, + TaskMarkReady, + })) +} + +// --- ResolvePlan condition lifecycle on TLS enable --- + +func TestResolvePlan_TLSEnable_SetsAndClearsCondition(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + setTLSSecretReady2(node, metav1.ConditionTrue) + + err := (&NodeResolver{}).ResolvePlan(t.Context(), node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(node.Status.Plan).NotTo(BeNil()) + + cond := apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionTrue)) + g.Expect(cond.Message).To(ContainSubstring("tls enable")) + + node.Status.Plan.Phase = seiv1alpha1.TaskPlanComplete + node.Status.CurrentSidecarTLSSecretName = tlsEnableSecretName + + err = (&NodeResolver{}).ResolvePlan(t.Context(), node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(node.Status.Plan).To(BeNil()) + + cond = apimeta.FindStatusCondition(node.Status.Conditions, seiv1alpha1.ConditionNodeUpdateInProgress) + g.Expect(cond).NotTo(BeNil()) + g.Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(cond.Reason).To(Equal("UpdateComplete")) +} + +// --- classifyPlan labels for the new variants --- + +func TestClassifyPlan_TLSEnableOnly(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + setTLSSecretReady2(node, metav1.ConditionTrue) + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(classifyPlan(plan)).To(Equal("tls-enable")) +} + +func TestClassifyPlan_ImageAndTLSEnable(t *testing.T) { + g := NewWithT(t) + node := tlsEnableRunningNode() + node.Spec.Image = tlsImageV2 + setTLSSecretReady2(node, metav1.ConditionTrue) + plan, err := buildRunningPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(classifyPlan(plan)).To(Equal("node-update-tls-enable")) +} + +// --- init-plan ObserveSidecarTLS insertion --- + +func TestBuildBasePlan_InsertsObserveSidecarTLSBeforeSidecarProgressionWhenTLSEnabled(t *testing.T) { + g := NewWithT(t) + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: sourceChainID, Generation: 1}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: sourceChainID, + Image: tlsImageV1, + FullNode: &seiv1alpha1.FullNodeSpec{}, + Sidecar: &seiv1alpha1.SidecarConfig{TLS: &seiv1alpha1.SidecarTLSSpec{SecretName: tlsEnableSecretName}}, + }, + } + plan, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(plan).NotTo(BeNil()) + + types := taskTypes(plan) + obsIdx := slices.Index(types, task.TaskTypeObserveSidecarTLS) + stsIdx := slices.Index(types, task.TaskTypeApplyStatefulSet) + configIdx := slices.Index(types, TaskConfigApply) + + g.Expect(obsIdx).To(BeNumerically(">=", 0), "observe-sidecar-tls must appear") + g.Expect(obsIdx).To(BeNumerically(">", stsIdx), "observe-sidecar-tls must follow ApplyStatefulSet") + g.Expect(obsIdx).To(BeNumerically("<", configIdx), "observe-sidecar-tls must precede the first sidecar HTTP task (config-apply)") +} + +func TestBuildBasePlan_NoObserveSidecarTLSWhenTLSDisabled(t *testing.T) { + g := NewWithT(t) + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: sourceChainID, Generation: 1}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: sourceChainID, + Image: tlsImageV1, + FullNode: &seiv1alpha1.FullNodeSpec{}, + }, + } + plan, err := (&fullNodePlanner{}).BuildPlan(node) + g.Expect(err).NotTo(HaveOccurred()) + + g.Expect(slices.Index(taskTypes(plan), task.TaskTypeObserveSidecarTLS)).To(Equal(-1), + "observe-sidecar-tls must NOT appear when TLS is disabled") +} diff --git a/internal/planner/sidecar_tls_test.go b/internal/planner/sidecar_tls_test.go index 954b744..e70d15a 100644 --- a/internal/planner/sidecar_tls_test.go +++ b/internal/planner/sidecar_tls_test.go @@ -29,6 +29,38 @@ func TestSidecarURLForNode_TLSRoutesThroughProxyHTTPS(t *testing.T) { Spec: seiv1alpha1.SeiNodeSpec{Sidecar: &seiv1alpha1.SidecarConfig{ TLS: &seiv1alpha1.SidecarTLSSpec{SecretName: testValidatorName + "-tls"}, }}, + Status: seiv1alpha1.SeiNodeStatus{CurrentSidecarTLSSecretName: testValidatorName + "-tls"}, + } + got := SidecarURLForNode(node) + want := fmt.Sprintf("https://%s-0.%s.sei.svc.cluster.local:8443", testValidatorName, testValidatorName) + if got != want { + t.Errorf("URL = %q, want %q", got, want) + } +} + +// TestSidecarURLForNode_TLSSpecButNotYetObserved: spec set, status empty +// → HTTP until ObserveSidecarTLS stamps the mirror. +func TestSidecarURLForNode_TLSSpecButNotYetObserved(t *testing.T) { + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: "sei"}, + Spec: seiv1alpha1.SeiNodeSpec{Sidecar: &seiv1alpha1.SidecarConfig{ + TLS: &seiv1alpha1.SidecarTLSSpec{SecretName: testValidatorName + "-tls"}, + }}, + // Status.CurrentSidecarTLSSecretName intentionally empty. + } + got := SidecarURLForNode(node) + want := fmt.Sprintf("http://%s-0.%s.sei.svc.cluster.local:7777", testValidatorName, testValidatorName) + if got != want { + t.Errorf("URL = %q, want %q", got, want) + } +} + +// TestSidecarURLForNode_StatusSetWithoutSpec: status drives transport, +// not spec. +func TestSidecarURLForNode_StatusSetWithoutSpec(t *testing.T) { + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testValidatorName, Namespace: "sei"}, + Status: seiv1alpha1.SeiNodeStatus{CurrentSidecarTLSSecretName: testValidatorName + "-tls"}, } got := SidecarURLForNode(node) want := fmt.Sprintf("https://%s-0.%s.sei.svc.cluster.local:8443", testValidatorName, testValidatorName) diff --git a/internal/task/observe_sidecar_tls.go b/internal/task/observe_sidecar_tls.go new file mode 100644 index 0000000..fc067a2 --- /dev/null +++ b/internal/task/observe_sidecar_tls.go @@ -0,0 +1,84 @@ +package task + +import ( + "context" + "encoding/json" + "fmt" + + appsv1 "k8s.io/api/apps/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" + "github.com/sei-protocol/sei-k8s-controller/internal/noderesource" +) + +const TaskTypeObserveSidecarTLS = "observe-sidecar-tls" + +// ObserveSidecarTLSParams identifies the node whose StatefulSet rollout +// to observe. +type ObserveSidecarTLSParams struct { + NodeName string `json:"nodeName"` + Namespace string `json:"namespace"` +} + +type observeSidecarTLSExecution struct { + taskBase + params ObserveSidecarTLSParams + cfg ExecutionConfig +} + +func deserializeObserveSidecarTLS(id string, params json.RawMessage, cfg ExecutionConfig) (TaskExecution, error) { + var p ObserveSidecarTLSParams + if len(params) > 0 { + if err := json.Unmarshal(params, &p); err != nil { + return nil, fmt.Errorf("deserializing observe-sidecar-tls params: %w", err) + } + } + return &observeSidecarTLSExecution{ + taskBase: taskBase{id: id, status: ExecutionRunning}, + params: p, + cfg: cfg, + }, nil +} + +// Execute polls StatefulSet rollout convergence and stamps +// status.currentSidecarTLSSecretName from spec on success. +func (e *observeSidecarTLSExecution) Execute(ctx context.Context) error { + node, err := ResourceAs[*seiv1alpha1.SeiNode](e.cfg) + if err != nil { + return Terminal(err) + } + if !noderesource.SidecarTLSEnabled(node) { + return Terminal(fmt.Errorf("observe-sidecar-tls scheduled but spec.sidecar.tls is nil")) + } + + sts := &appsv1.StatefulSet{} + key := types.NamespacedName{Name: node.Name, Namespace: node.Namespace} + if err := e.cfg.APIReader.Get(ctx, key, sts); err != nil { + if apierrors.IsNotFound(err) { + return nil + } + return fmt.Errorf("getting statefulset: %w", err) + } + + if sts.Status.ObservedGeneration < sts.Generation { + return nil + } + if sts.Spec.Replicas == nil { + return nil + } + // Require ReadyReplicas (proxy passed its readiness probe), not + // just UpdatedReplicas — stamping flips transport to HTTPS. + if sts.Status.UpdatedReplicas < *sts.Spec.Replicas || sts.Status.ReadyReplicas < *sts.Spec.Replicas { + return nil + } + + node.Status.CurrentSidecarTLSSecretName = node.Spec.Sidecar.TLS.SecretName + e.complete() + return nil +} + +func (e *observeSidecarTLSExecution) Status(_ context.Context) ExecutionStatus { + return e.DefaultStatus() +} diff --git a/internal/task/observe_sidecar_tls_test.go b/internal/task/observe_sidecar_tls_test.go new file mode 100644 index 0000000..d5c21e8 --- /dev/null +++ b/internal/task/observe_sidecar_tls_test.go @@ -0,0 +1,142 @@ +package task + +import ( + "context" + "encoding/json" + "errors" + "testing" + + . "github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" +) + +func observeTLSNode(secretName string) *seiv1alpha1.SeiNode { + n := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: testReplaceSTS, Namespace: testReplaceNs}, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: opkChainID, + Image: opkImage, + FullNode: &seiv1alpha1.FullNodeSpec{}, + }, + } + if secretName != "" { + n.Spec.Sidecar = &seiv1alpha1.SidecarConfig{TLS: &seiv1alpha1.SidecarTLSSpec{SecretName: secretName}} + } + return n +} + +func observeTLSCfg(t *testing.T, node *seiv1alpha1.SeiNode, sts *appsv1.StatefulSet) ExecutionConfig { + t.Helper() + s := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(s); err != nil { + t.Fatal(err) + } + if err := seiv1alpha1.AddToScheme(s); err != nil { + t.Fatal(err) + } + builder := fake.NewClientBuilder().WithScheme(s) + if sts != nil { + builder = builder.WithObjects(sts) + } + c := builder.Build() + return ExecutionConfig{ + KubeClient: c, + APIReader: c, + Scheme: s, + Resource: node, + } +} + +func newObserveTLSExec(t *testing.T, cfg ExecutionConfig) TaskExecution { + t.Helper() + params := ObserveSidecarTLSParams{NodeName: testReplaceSTS, Namespace: testReplaceNs} + raw, _ := json.Marshal(params) + exec, err := deserializeObserveSidecarTLS("obs-tls-1", raw, cfg) + if err != nil { + t.Fatal(err) + } + return exec +} + +func TestObserveSidecarTLS_RolloutComplete_StampsCurrent(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode("my-tls-cert") + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ + ObservedGeneration: 2, UpdatedReplicas: 1, ReadyReplicas: 1, Replicas: 1, + }, + } + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, sts)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionComplete)) + g.Expect(node.Status.CurrentSidecarTLSSecretName).To(Equal("my-tls-cert")) +} + +// TestObserveSidecarTLS_TemplateAppliedButProxyNotReady_StaysRunning: +// UpdatedReplicas=1 but ReadyReplicas=0 must not stamp status. +func TestObserveSidecarTLS_TemplateAppliedButProxyNotReady_StaysRunning(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode("my-tls-cert") + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ + ObservedGeneration: 2, UpdatedReplicas: 1, ReadyReplicas: 0, Replicas: 1, + }, + } + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, sts)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) + g.Expect(node.Status.CurrentSidecarTLSSecretName).To(BeEmpty()) +} + +func TestObserveSidecarTLS_RolloutInProgress_StaysRunning(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode("my-tls-cert") + sts := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{Name: node.Name, Namespace: node.Namespace, Generation: 2}, + Spec: appsv1.StatefulSetSpec{Replicas: int32Ptr(1)}, + Status: appsv1.StatefulSetStatus{ObservedGeneration: 2, UpdatedReplicas: 0, Replicas: 1}, + } + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, sts)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) + g.Expect(node.Status.CurrentSidecarTLSSecretName).To(BeEmpty()) +} + +func TestObserveSidecarTLS_StatefulSetNotFound_StaysRunning(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode("my-tls-cert") + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, nil)) + + g.Expect(exec.Execute(context.Background())).To(Succeed()) + g.Expect(exec.Status(context.Background())).To(Equal(ExecutionRunning)) +} + +func TestObserveSidecarTLS_TLSDisabled_TerminalError(t *testing.T) { + g := NewWithT(t) + node := observeTLSNode("") // TLS not enabled + + exec := newObserveTLSExec(t, observeTLSCfg(t, node, nil)) + + err := exec.Execute(context.Background()) + g.Expect(err).To(HaveOccurred()) + var te *TerminalError + g.Expect(errors.As(err, &te)).To(BeTrue(), + "scheduling observe-sidecar-tls without spec.sidecar.tls is a planner bug; surface as Terminal") +} diff --git a/internal/task/task.go b/internal/task/task.go index ff3b7ef..bebd72f 100644 --- a/internal/task/task.go +++ b/internal/task/task.go @@ -221,6 +221,7 @@ var registry = map[string]taskDeserializer{ TaskTypeApplyRBACProxyConfig: deserializeApplyRBACProxyConfig, TaskTypeReplacePod: deserializeReplacePod, TaskTypeObserveImage: deserializeObserveImage, + TaskTypeObserveSidecarTLS: deserializeObserveSidecarTLS, TaskTypeValidateSigningKey: deserializeValidateSigningKey, TaskTypeValidateNodeKey: deserializeValidateNodeKey, TaskTypeValidateOperatorKeyring: deserializeValidateOperatorKeyring, diff --git a/manifests/sei.io_seinodedeployments.yaml b/manifests/sei.io_seinodedeployments.yaml index 3f564f2..7760100 100644 --- a/manifests/sei.io_seinodedeployments.yaml +++ b/manifests/sei.io_seinodedeployments.yaml @@ -895,11 +895,12 @@ spec: - message: peers is required when replayer mode is set rule: '!has(self.replayer) || (has(self.peers) && size(self.peers) > 0)' - - message: spec.sidecar.tls is immutable; delete + recreate the - SeiNode to change TLS configuration + - message: 'spec.sidecar.tls is set-once: enabling on an existing + SeiNode is allowed, but disabling or changing requires delete + + recreate' rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) - ? (!has(self.sidecar) || !has(self.sidecar.tls)) : (has(self.sidecar) - && has(self.sidecar.tls) && self.sidecar.tls == oldSelf.sidecar.tls)' + ? true : (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls + == oldSelf.sidecar.tls)' required: - spec type: object diff --git a/manifests/sei.io_seinodes.yaml b/manifests/sei.io_seinodes.yaml index 421e1c4..dd848aa 100644 --- a/manifests/sei.io_seinodes.yaml +++ b/manifests/sei.io_seinodes.yaml @@ -766,11 +766,11 @@ spec: - message: peers is required when replayer mode is set rule: '!has(self.replayer) || (has(self.peers) && size(self.peers) > 0)' - - message: spec.sidecar.tls is immutable; delete + recreate the SeiNode - to change TLS configuration - rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? (!has(self.sidecar) - || !has(self.sidecar.tls)) : (has(self.sidecar) && has(self.sidecar.tls) - && self.sidecar.tls == oldSelf.sidecar.tls)' + - message: 'spec.sidecar.tls is set-once: enabling on an existing SeiNode + is allowed, but disabling or changing requires delete + recreate' + rule: '(!has(oldSelf.sidecar) || !has(oldSelf.sidecar.tls)) ? true : + (has(self.sidecar) && has(self.sidecar.tls) && self.sidecar.tls == + oldSelf.sidecar.tls)' status: description: SeiNodeStatus defines the observed state of a SeiNode. properties: @@ -841,6 +841,13 @@ spec: Parent controllers compare this against spec.image to determine whether a spec change has been fully actuated. type: string + currentSidecarTLSSecretName: + description: |- + CurrentSidecarTLSSecretName is the spec.sidecar.tls.secretName + observed on the live pod after the rollout converged. Drives the + controller's sidecar client transport-mode selection (HTTP vs + HTTPS). Empty when no TLS rollout has converged yet. + type: string externalAddress: description: |- ExternalAddress is the routable P2P address (host:port) for this node,