From 8e84a1721923fd2d8ecbf5021805199f1456f577 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 15 May 2026 15:41:48 -0700 Subject: [PATCH 1/5] feat(noderesource): add cosmos-exporter sidecar for in-pod Cosmos metrics (#248) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an opt-in sei-cosmos-exporter sidecar to the seid pod so the in-pod Cosmos gRPC + Tendermint RPC surface stays observable on K8s, matching the legacy sei-infra systemd-unit deployment. Discovery is label-based: a single platform-owned PodMonitor (per chain, lives in platform repo) selects on `monitoring.sei.io/cosmos-exporter: enabled`. Scope: container + pod labels only. The PodMonitor manifest is a follow-up PR in the platform repo. Image consumed from `SEI_COSMOS_EXPORTER_IMAGE` on the operator Deployment (separate PR publishes that image on sei-cosmos-exporter). Pod labels (stamped unconditionally on every seid pod): - sei.io/chain=<chainID> → lifted into `chain_id` metric label - sei.io/role=<role> → lifted into `component` metric label - monitoring.sei.io/cosmos-exporter=enabled (only when opted in) → PodMonitor selector Container details: - Sidecar in spec.Containers (regular container, not init-sidecar) — cosmos-exporter polls localhost so seid readiness order doesn't matter - Fixed port 9300, named `cosmos-metrics` (PodMonitor targets the name) - sei-only args: --denom usei --denom-coefficient 1000000 --bech-prefix sei - TCP startup probe on seid's gRPC (9090) — gates exporter start so it doesn't log.Fatal() on initial dial during seid bring-up - /tmp emptyDir mount (defensive against EROFS under ReadOnlyRootFilesystem) - Resources: 50m/64Mi req, 384Mi mem limit, no CPU limit (CPU limits turn scrape-time gRPC pulls into visible scrape gaps) - Hard-fails GenerateStatefulSet when no image is configured anywhere (per-node Image override OR platform CosmosExporterImage env) — mirrors the KubeRBACProxyImage check pattern CRD: `CosmosExporterConfig` is intentionally minimal 
— Image (debug knob) + Resources only. Port is not user-configurable because the PodMonitor selects via named container port; per-node port overrides would silently break discovery. Closes #248 --- api/v1alpha1/common_types.go | 34 +++ api/v1alpha1/seinode_types.go | 9 + api/v1alpha1/zz_generated.deepcopy.go | 25 ++ cmd/main.go | 1 + config/crd/sei.io_seinodedeployments.yaml | 84 ++++++ config/crd/sei.io_seinodes.yaml | 83 ++++++ internal/noderesource/noderesource.go | 198 +++++++++++++- internal/noderesource/noderesource_test.go | 302 ++++++++++++++++++++- internal/platform/platform.go | 8 + internal/platform/platformtest/config.go | 3 + manifests/sei.io_seinodedeployments.yaml | 84 ++++++ manifests/sei.io_seinodes.yaml | 83 ++++++ 12 files changed, 899 insertions(+), 15 deletions(-) diff --git a/api/v1alpha1/common_types.go b/api/v1alpha1/common_types.go index cc4c9dd..4016cbb 100644 --- a/api/v1alpha1/common_types.go +++ b/api/v1alpha1/common_types.go @@ -199,6 +199,40 @@ type SidecarConfig struct { TLS *SidecarTLSSpec `json:"tls,omitempty"` } +// CosmosExporterConfig enables an in-pod sei-cosmos-exporter container +// that polls seid's local Cosmos gRPC + Tendermint RPC and exposes +// Prometheus metrics on a fixed node-local port. Presence of this +// struct (even empty) enables the container; absence leaves the pod +// alone. +// +// Designed as a sidecar in the same pod so the exporter's +// localhost-bound RPC assumption holds, per-pod identity (`pod` label) +// flows through into metrics, and Karpenter consolidation never +// collides on the listen port (which a DaemonSet would). +// +// The exporter always binds the upstream default port (9300) and the +// container exposes it as the named port `cosmos-metrics`. Platform +// PodMonitors target the named port, so per-node port overrides have +// no use — the field is intentionally absent here. +type CosmosExporterConfig struct { + // Image overrides the cosmos-exporter container image. 
When unset, + // the controller falls back to the SEI_COSMOS_EXPORTER_IMAGE + // platform default (mirrors how Sidecar.Image works). Primarily a + // debug/canary knob — production deployments rely on the platform + // default so image bumps are a single env-var change on the + // operator Deployment. + // +optional + Image string `json:"image,omitempty"` + + // Resources defines CPU/memory requests and limits for the + // cosmos-exporter container. When unset, the controller applies + // scrape-friendly defaults (50m CPU req, 64Mi mem req, 384Mi mem + // limit, no CPU limit — CPU limits turn scrape-time pulls into + // visible scrape gaps under throttling). + // +optional + Resources *corev1.ResourceRequirements `json:"resources,omitempty"` +} + // SidecarTLSSpec configures the cert-manager-issued serving cert for // the kube-rbac-proxy fronting. type SidecarTLSSpec struct { diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index e8ad503..324f43f 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -45,6 +45,15 @@ type SeiNodeSpec struct { // +optional Sidecar *SidecarConfig `json:"sidecar,omitempty"` + // CosmosExporter, if set, runs sei-cosmos-exporter as an in-pod + // sidecar container scraping seid's local Cosmos gRPC + Tendermint + // RPC and exposing Prometheus metrics on :9300. The companion + // PodMonitor (emitted by the SeiNodeDeployment controller when its + // .spec.monitoring.cosmosExporter is also set) discovers and + // scrapes these endpoints. + // +optional + CosmosExporter *CosmosExporterConfig `json:"cosmosExporter,omitempty"` + // PodLabels are additional labels merged into the StatefulSet pod template. // The controller always sets sei.io/node; these are additive and applied // first so that system labels take precedence. 
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index efa7970..8e30744 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -31,6 +31,26 @@ func (in *ArchiveSpec) DeepCopy() *ArchiveSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CosmosExporterConfig) DeepCopyInto(out *CosmosExporterConfig) { + *out = *in + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(v1.ResourceRequirements) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CosmosExporterConfig. +func (in *CosmosExporterConfig) DeepCopy() *CosmosExporterConfig { + if in == nil { + return nil + } + out := new(CosmosExporterConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *DataVolumeImport) DeepCopyInto(out *DataVolumeImport) { *out = *in @@ -910,6 +930,11 @@ func (in *SeiNodeSpec) DeepCopyInto(out *SeiNodeSpec) { *out = new(SidecarConfig) (*in).DeepCopyInto(*out) } + if in.CosmosExporter != nil { + in, out := &in.CosmosExporter, &out.CosmosExporter + *out = new(CosmosExporterConfig) + (*in).DeepCopyInto(*out) + } if in.PodLabels != nil { in, out := &in.PodLabels, &out.PodLabels *out = make(map[string]string, len(*in)) diff --git a/cmd/main.go b/cmd/main.go index 0ef6dd8..ed67e56 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -158,6 +158,7 @@ func main() { GatewayPublicDomain: os.Getenv("SEI_GATEWAY_PUBLIC_DOMAIN"), KubeRBACProxyImage: os.Getenv("SEI_KUBE_RBAC_PROXY_IMAGE"), SidecarImage: os.Getenv("SEI_SIDECAR_IMAGE"), + CosmosExporterImage: os.Getenv("SEI_COSMOS_EXPORTER_IMAGE"), } if err := platformCfg.Validate(); err != nil { diff --git a/config/crd/sei.io_seinodedeployments.yaml b/config/crd/sei.io_seinodedeployments.yaml index 329ad8d..60a8523 100644 --- a/config/crd/sei.io_seinodedeployments.yaml +++ b/config/crd/sei.io_seinodedeployments.yaml @@ -225,6 +225,90 @@ spec: description: ChainID of the chain this node belongs to. minLength: 1 type: string + cosmosExporter: + description: |- + CosmosExporter, if set, runs sei-cosmos-exporter as an in-pod + sidecar container scraping seid's local Cosmos gRPC + Tendermint + RPC and exposing Prometheus metrics on :9300. The companion + PodMonitor (emitted by the SeiNodeDeployment controller when its + .spec.monitoring.cosmosExporter is also set) discovers and + scrapes these endpoints. + properties: + image: + description: |- + Image overrides the cosmos-exporter container image. When unset, + the controller falls back to the SEI_COSMOS_EXPORTER_IMAGE + platform default (mirrors how Sidecar.Image works). Primarily a + debug/canary knob — production deployments rely on the platform + default so image bumps are a single env-var change on the + operator Deployment. 
+ type: string + resources: + description: |- + Resources defines CPU/memory requests and limits for the + cosmos-exporter container. When unset, the controller applies + scrape-friendly defaults (50m CPU req, 64Mi mem req, 384Mi mem + limit, no CPU limit — CPU limits turn scrape-time pulls into + visible scrape gaps under throttling). + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This field depends on the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry + in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. diff --git a/config/crd/sei.io_seinodes.yaml b/config/crd/sei.io_seinodes.yaml index dad2a1c..79bb1b6 100644 --- a/config/crd/sei.io_seinodes.yaml +++ b/config/crd/sei.io_seinodes.yaml @@ -87,6 +87,89 @@ spec: description: ChainID of the chain this node belongs to. minLength: 1 type: string + cosmosExporter: + description: |- + CosmosExporter, if set, runs sei-cosmos-exporter as an in-pod + sidecar container scraping seid's local Cosmos gRPC + Tendermint + RPC and exposing Prometheus metrics on :9300. The companion + PodMonitor (emitted by the SeiNodeDeployment controller when its + .spec.monitoring.cosmosExporter is also set) discovers and + scrapes these endpoints. + properties: + image: + description: |- + Image overrides the cosmos-exporter container image. When unset, + the controller falls back to the SEI_COSMOS_EXPORTER_IMAGE + platform default (mirrors how Sidecar.Image works). Primarily a + debug/canary knob — production deployments rely on the platform + default so image bumps are a single env-var change on the + operator Deployment. 
+ type: string + resources: + description: |- + Resources defines CPU/memory requests and limits for the + cosmos-exporter container. When unset, the controller applies + scrape-friendly defaults (50m CPU req, 64Mi mem req, 384Mi mem + limit, no CPU limit — CPU limits turn scrape-time pulls into + visible scrape gaps under throttling). + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This field depends on the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. diff --git a/internal/noderesource/noderesource.go b/internal/noderesource/noderesource.go index c54fbbf..41a6b1c 100644 --- a/internal/noderesource/noderesource.go +++ b/internal/noderesource/noderesource.go @@ -26,6 +26,19 @@ const ( // NodeLabel is the standard label key used on all SeiNode-owned resources. NodeLabel = "sei.io/node" + // chainLabel is the per-pod label carrying the SeiNode's ChainID. + // Platform-owned ServiceMonitors and PodMonitors lift this into the + // `chain_id` metric label via __meta_kubernetes_pod_label_sei_io_chain + // relabeling. NOT in the StatefulSet selector — chain doesn't change + // on a running SeiNode and pod-template-only stamping keeps the + // selector immutable. + chainLabel = "sei.io/chain" + + // roleLabel is the per-pod label carrying the node mode. Values: + // validator, archive, replayer, node. Same lift-via-relabel pattern + // as chainLabel; surfaces as the `component` metric label. 
+ roleLabel = "sei.io/role" + dataDir = platform.DataDir // homeVarRef is the K8s VariableReference form of HOME, substituted from @@ -45,6 +58,7 @@ const ( containerNameSeid = "seid" containerNameSidecar = "sei-sidecar" containerNameRBACProxy = "kube-rbac-proxy" + containerNameCosmosExporter = "cosmos-exporter" servicePortNameAPI = "api" rbacProxyConfigVolumeName = "rbac-proxy-config" sidecarTLSVolumeName = "sidecar-tls" @@ -78,6 +92,21 @@ const ( // chainguard static-debian12 base images. Pod-level fsGroup matches so // the non-root sidecar can read kubelet-projected 0o400 Secret files. sidecarNonRootUID int64 = 65532 + + // defaultCosmosExporterPort matches sei-cosmos-exporter's upstream + // default (main.go:302) and the legacy EC2 + // install_sei_cosmos_exporter.sh:9300. Fixed — no per-node override: + // the platform PodMonitor selects via the named port + // `cosmos-metrics`, so the value is opaque to consumers. + defaultCosmosExporterPort int32 = 9300 + + // cosmosExporterScrapeLabel is the pod label stamped on every + // pod opting into cosmos-exporter scraping. A single cluster-wide + // PodMonitor in the platform repo (clusters/<cluster>/monitoring/) + // selects on this label and discovers all opted-in pods across + // namespaces. + cosmosExporterScrapeLabel = "monitoring.sei.io/cosmos-exporter" + cosmosExporterScrapeLabelValue = "enabled" ) // PlatformConfig is an alias for platform.Config. @@ -101,15 +130,56 @@ func SelectorLabels(node *seiv1alpha1.SeiNode) map[string]string { } // ResourceLabels returns labels for the StatefulSet pod template. -// User-provided podLabels are applied first; the system sei.io/node label -// is set last so it cannot be overridden. +// User-provided podLabels are applied first; system labels (sei.io/node +// and observability discovery labels) are set last so they cannot be +// overridden. 
func ResourceLabels(node *seiv1alpha1.SeiNode) map[string]string { - labels := make(map[string]string, len(node.Spec.PodLabels)+1) + labels := make(map[string]string, len(node.Spec.PodLabels)+4) maps.Copy(labels, node.Spec.PodLabels) labels[NodeLabel] = node.Name + // sei.io/chain and sei.io/role are observability identity labels: + // platform-owned PodMonitors and ServiceMonitors lift them into + // metric labels (chain_id, component) via + // __meta_kubernetes_pod_label_* relabeling instead of carrying the + // values in their own YAML. Stamped unconditionally so every seid + // pod is queryable by chain and role regardless of which exporter + // is enabled. + if node.Spec.ChainID != "" { + labels[chainLabel] = node.Spec.ChainID + } + if role := deriveRole(node); role != "" { + labels[roleLabel] = role + } + // Platform-owned discovery label: a single PodMonitor per chain + // in the observability stack selects on this and scrapes the + // cosmos-exporter sidecar's /metrics/* endpoints. Stamping the + // label here keeps the per-pod opt-in concern in the controller + // while the scrape policy stays in the platform repo (no per-SND + // PodMonitor reconcile loop). + if CosmosExporterEnabled(node) { + labels[cosmosExporterScrapeLabel] = cosmosExporterScrapeLabelValue + } return labels } +// deriveRole returns the role label value for the node mode, matching +// the values nodedeployment.deriveComponent emits for ServiceMonitor +// relabeling. Centralized here so the pod-label and the relabel-output +// stay in lock-step. Empty string when no mode is set. +func deriveRole(node *seiv1alpha1.SeiNode) string { + switch { + case node.Spec.Validator != nil: + return "validator" + case node.Spec.Archive != nil: + return "archive" + case node.Spec.Replayer != nil: + return "replayer" + case node.Spec.FullNode != nil: + return "node" + } + return "" +} + // NodeMode returns the sei-config mode string for the node based on which // sub-spec is populated. 
Falls back to "full" if none is set. func NodeMode(node *seiv1alpha1.SeiNode) string { @@ -181,7 +251,10 @@ func GenerateStatefulSet(node *seiv1alpha1.SeiNode, p PlatformConfig) (*appsv1.S } one := int32(1) labels := ResourceLabels(node) - podSpec := buildNodePodSpec(node, p) + podSpec, err := buildNodePodSpec(node, p) + if err != nil { + return nil, err + } if err := assertNoOperatorKeyringOnSeidContainers(node, &podSpec); err != nil { return nil, err @@ -364,7 +437,7 @@ func ServicePorts() []corev1.ServicePort { // Internal helpers // --------------------------------------------------------------------------- -func buildNodePodSpec(node *seiv1alpha1.SeiNode, p PlatformConfig) corev1.PodSpec { +func buildNodePodSpec(node *seiv1alpha1.SeiNode, p PlatformConfig) (corev1.PodSpec, error) { dataVolume := corev1.Volume{ Name: "data", VolumeSource: corev1.VolumeSource{ @@ -444,9 +517,17 @@ func buildNodePodSpec(node *seiv1alpha1.SeiNode, p PlatformConfig) corev1.PodSpe initContainers = append(initContainers, buildRBACProxyContainer(node, p)) } spec.InitContainers = initContainers - spec.Containers = []corev1.Container{buildSidecarMainContainer(node, p)} + containers := []corev1.Container{buildSidecarMainContainer(node, p)} + if CosmosExporterEnabled(node) { + ceContainer, err := buildCosmosExporterContainer(node, p) + if err != nil { + return corev1.PodSpec{}, err + } + containers = append(containers, ceContainer) + } + spec.Containers = containers - return spec + return spec, nil } func sidecarImage(node *seiv1alpha1.SeiNode, p PlatformConfig) string { @@ -553,6 +634,109 @@ func buildSidecarMainContainer(node *seiv1alpha1.SeiNode, p PlatformConfig) core return container } +// CosmosExporterEnabled reports whether the SeiNode opts into running +// the sei-cosmos-exporter sidecar via spec.cosmosExporter (any non-nil +// value, even an empty struct, enables it). 
+func CosmosExporterEnabled(node *seiv1alpha1.SeiNode) bool { + return node.Spec.CosmosExporter != nil +} + +func cosmosExporterImage(node *seiv1alpha1.SeiNode, p PlatformConfig) string { + if node.Spec.CosmosExporter != nil && node.Spec.CosmosExporter.Image != "" { + return node.Spec.CosmosExporter.Image + } + return p.CosmosExporterImage +} + +// defaultCosmosExporterResources returns scrape-friendly defaults. No +// CPU limit on purpose: cosmos-exporter pulls from seid's local gRPC +// on every scrape, and CPU throttling at the limit boundary turns those +// pulls into visible scrape gaps. Memory ceiling caps blast radius. +// 384Mi target headroom for /metrics/validators paginated delegation +// pulls on large-validator-set chains (Limit defaults to 1000 in +// upstream cosmos-exporter at main.go:305). +func defaultCosmosExporterResources() corev1.ResourceRequirements { + return corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("64Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceMemory: resource.MustParse("384Mi"), + }, + } +} + +// buildCosmosExporterContainer renders the sei-cosmos-exporter sidecar +// that polls seid's local Cosmos gRPC (localhost:9090) + Tendermint RPC +// (http://localhost:26657) and exposes Prometheus metrics on :9300. +// Discovered by a platform-owned PodMonitor selecting on the +// monitoring.sei.io/cosmos-exporter pod label. +// +// Args track sei-infra's legacy systemd unit +// (sei-cosmos-exporter/install-sei-cosmos-exporter.sh): usei bond denom, +// 1_000_000 coefficient (6-decimal display), `sei` bech32 prefix. +// +// Returns an error when no image source is configured (neither the +// per-node Image override nor the platform-level SEI_COSMOS_EXPORTER_IMAGE +// env): an empty Image field would surface as a confusing ErrImagePull +// at pod scheduling time. 
Mirrors the KubeRBACProxyImage fail-fast +// pattern in GenerateStatefulSet. +func buildCosmosExporterContainer(node *seiv1alpha1.SeiNode, p PlatformConfig) (corev1.Container, error) { + image := cosmosExporterImage(node, p) + if image == "" { + return corev1.Container{}, fmt.Errorf("cosmos-exporter image is required: set SEI_COSMOS_EXPORTER_IMAGE on the operator Deployment or override .spec.cosmosExporter.image on the SeiNode") + } + c := corev1.Container{ + Name: containerNameCosmosExporter, + Image: image, + Args: []string{ + "--denom", "usei", + "--denom-coefficient", "1000000", + "--bech-prefix", "sei", + "--listen-address", fmt.Sprintf(":%d", defaultCosmosExporterPort), + // --node defaults to localhost:9090 (Cosmos gRPC). + // --tendermint-rpc defaults to http://localhost:26657. + // Both are pod-local because the exporter runs in the same + // pod as seid; no override needed. + }, + Ports: []corev1.ContainerPort{ + {Name: "cosmos-metrics", ContainerPort: defaultCosmosExporterPort, Protocol: corev1.ProtocolTCP}, + }, + SecurityContext: sidecarSecurityContext(), + Resources: defaultCosmosExporterResources(), + // Distroless + ReadOnlyRootFilesystem means /tmp would EROFS if + // the Go runtime or gRPC stack ever stages there. Reuse the + // sidecar-tmp emptyDir already in the pod's volume list at + // buildNodePodSpec — cheap insurance. + VolumeMounts: []corev1.VolumeMount{ + {Name: sidecarTmpVolumeName, MountPath: "/tmp"}, + }, + // cosmos-exporter calls setChainID() + setDenom() against seid's + // gRPC at startup (main.go:162-163) and log.Fatal()s on first + // dial failure (main.go:159), so it can restart until seid's + // gRPC is up — cosmetic but noisy in dashboards. NOTE(review): + // a startupProbe does not delay process start; it only holds + // back liveness/readiness checks and marks the container + // started once seid's gRPC port accepts TCP. seiconfig.PortGRPC + // is unconditionally exposed by buildNodeMainContainer via ContainerPorts(). 
+ StartupProbe: &corev1.Probe{ + ProbeHandler: corev1.ProbeHandler{ + TCPSocket: &corev1.TCPSocketAction{ + Port: intstr.FromInt32(seiconfig.PortGRPC), + }, + }, + InitialDelaySeconds: 5, + PeriodSeconds: 5, + FailureThreshold: 60, + }, + } + if node.Spec.CosmosExporter != nil && node.Spec.CosmosExporter.Resources != nil { + c.Resources = *node.Spec.CosmosExporter.Resources + } + return c, nil +} + func sidecarWaitCommand(node *seiv1alpha1.SeiNode) (command []string, args []string) { // Canonical seid invocation; spec.Entrypoint is silently ignored as of // HOME-based path resolution. "$HOME" (shell-expanded inside bash -c) diff --git a/internal/noderesource/noderesource_test.go b/internal/noderesource/noderesource_test.go index 07fb0de..f5e320a 100644 --- a/internal/noderesource/noderesource_test.go +++ b/internal/noderesource/noderesource_test.go @@ -91,12 +91,18 @@ func mustGenerateStatefulSet(t *testing.T, node *seiv1alpha1.SeiNode, p Platform // --- Pod labels --- -func TestResourceLabelsForNode_DefaultsToNodeOnly(t *testing.T) { +func TestResourceLabelsForNode_DefaultsToSystemLabels(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("snap-0", "default") labels := ResourceLabels(node) - g.Expect(labels).To(Equal(map[string]string{NodeLabel: "snap-0"})) + // newSnapshotNode sets ChainID="sei-test" + FullNode mode, so chain + // + role labels are stamped alongside sei.io/node. 
+ g.Expect(labels).To(Equal(map[string]string{ + NodeLabel: "snap-0", + "sei.io/chain": "sei-test", + "sei.io/role": "node", + })) } func TestResourceLabelsForNode_MergesPodLabels(t *testing.T) { @@ -110,8 +116,10 @@ func TestResourceLabelsForNode_MergesPodLabels(t *testing.T) { g.Expect(labels).To(Equal(map[string]string{ NodeLabel: "snap-0", - "sei.io/nodedeployment": "my-group", - "team": "platform", + "sei.io/chain": "sei-test", + "sei.io/role": "node", + "sei.io/nodedeployment": "my-group", + "team": "platform", })) } @@ -185,7 +193,8 @@ func TestBuildNodePodSpec_Genesis_MountsExistingPVC(t *testing.T) { g := NewWithT(t) node := newGenesisNode("mynet-0", "default") - spec := buildNodePodSpec(node, platformtest.Config()) + spec, err := buildNodePodSpec(node, platformtest.Config()) + g.Expect(err).NotTo(HaveOccurred()) g.Expect(spec.ServiceAccountName).To(Equal(platformtest.Config().ServiceAccount)) g.Expect(spec.Volumes).To(HaveLen(2)) // data PVC + sidecar-tmp emptyDir @@ -198,7 +207,8 @@ func TestBuildNodePodSpec_Snapshot_MountsNodePVC(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("snap-0", "default") - spec := buildNodePodSpec(node, platformtest.Config()) + spec, err := buildNodePodSpec(node, platformtest.Config()) + g.Expect(err).NotTo(HaveOccurred()) g.Expect(spec.Volumes[0].PersistentVolumeClaim.ClaimName).To(Equal("data-snap-0")) } @@ -718,7 +728,8 @@ func TestBuildNodePodSpec_Archive_SchedulesOnArchiveNodepool(t *testing.T) { g := NewWithT(t) node := newArchiveNode("archive-0", "pacific-1") - spec := buildNodePodSpec(node, platformtest.Config()) + spec, err := buildNodePodSpec(node, platformtest.Config()) + g.Expect(err).NotTo(HaveOccurred()) g.Expect(spec.Tolerations).To(HaveLen(1)) g.Expect(spec.Tolerations[0].Key).To(Equal("sei.io/workload")) @@ -735,7 +746,8 @@ func TestBuildNodePodSpec_FullNode_SchedulesOnDefaultNodepool(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("syncer-0", "pacific-1") - spec := buildNodePodSpec(node, 
platformtest.Config()) + spec, err := buildNodePodSpec(node, platformtest.Config()) + g.Expect(err).NotTo(HaveOccurred()) g.Expect(spec.Tolerations[0].Value).To(Equal("sei-node")) @@ -1279,3 +1291,277 @@ func TestGenerateStatefulSet_ProductionPodSpec_PassesGuard(t *testing.T) { _, err := GenerateStatefulSet(node, platformtest.Config()) g.Expect(err).NotTo(HaveOccurred()) } + +// --- Cosmos exporter --- + +func TestCosmosExporter_AbsentByDefault(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + + g.Expect(findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter)).To(BeNil()) + g.Expect(sts.Spec.Template.Spec.Containers).To(HaveLen(1)) +} + +func TestCosmosExporter_PresentWhenOptedIn(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + + ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) + g.Expect(ce).NotTo(BeNil()) + g.Expect(sts.Spec.Template.Spec.Containers).To(HaveLen(2)) +} + +func TestCosmosExporter_DefaultImage(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) + + g.Expect(ce.Image).To(Equal(platformtest.Config().CosmosExporterImage)) +} + +func TestCosmosExporter_CustomImageOverride(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{Image: "custom/exporter:v9"} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) + + 
g.Expect(ce.Image).To(Equal("custom/exporter:v9")) +} + +func TestCosmosExporter_DefaultPort(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) + + g.Expect(ce.Ports).To(HaveLen(1)) + g.Expect(ce.Ports[0].ContainerPort).To(Equal(int32(9300))) + g.Expect(ce.Ports[0].Name).To(Equal("cosmos-metrics")) + g.Expect(ce.Args).To(ContainElement(":9300")) +} + +func TestCosmosExporter_PortIsFixed(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) + + // Port is intentionally not user-configurable: the platform + // PodMonitor targets the named port `cosmos-metrics`. 
+ g.Expect(ce.Ports[0].ContainerPort).To(Equal(int32(9300))) + g.Expect(ce.Ports[0].Name).To(Equal("cosmos-metrics")) +} + +func TestCosmosExporter_ErrorWhenImageUnset(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + cfg := platformtest.Config() + cfg.CosmosExporterImage = "" + + _, err := GenerateStatefulSet(node, cfg) + + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("cosmos-exporter image is required")) +} + +func TestCosmosExporter_StartupProbeOnSeidGRPC(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) + + // Startup probe gates ListenAndServe on seid's gRPC being up so the + // exporter doesn't log.Fatal() on its initial dial. 
+ g.Expect(ce.StartupProbe).NotTo(BeNil()) + g.Expect(ce.StartupProbe.TCPSocket).NotTo(BeNil()) + g.Expect(ce.StartupProbe.TCPSocket.Port.IntVal).To(Equal(int32(9090))) +} + +func TestCosmosExporter_MountsTmpEmptyDir(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) + + var hasTmp bool + for _, m := range ce.VolumeMounts { + if m.Name == sidecarTmpVolumeName && m.MountPath == "/tmp" { + hasTmp = true + break + } + } + g.Expect(hasTmp).To(BeTrue(), "cosmos-exporter must mount sidecar-tmp at /tmp (ReadOnlyRootFilesystem)") +} + +func TestCosmosExporter_SeiArgs(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) + + g.Expect(ce.Args).To(ContainElements( + "--denom", "usei", + "--denom-coefficient", "1000000", + "--bech-prefix", "sei", + )) +} + +func TestCosmosExporter_DefaultResources(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) + + // 50m/64Mi requests, 384Mi memory limit, no CPU limit (see + // defaultCosmosExporterResources — scrape pulls would throttle). 
+ cpuReq := ce.Resources.Requests[corev1.ResourceCPU] + memReq := ce.Resources.Requests[corev1.ResourceMemory] + memLim := ce.Resources.Limits[corev1.ResourceMemory] + g.Expect(cpuReq.String()).To(Equal("50m")) + g.Expect(memReq.String()).To(Equal("64Mi")) + g.Expect(memLim.String()).To(Equal("384Mi")) + _, hasCPULimit := ce.Resources.Limits[corev1.ResourceCPU] + g.Expect(hasCPULimit).To(BeFalse()) +} + +func TestCosmosExporter_CustomResourcesOverride(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{ + Resources: &corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("128Mi"), + }, + }, + } + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) + + cpuReq := ce.Resources.Requests[corev1.ResourceCPU] + g.Expect(cpuReq.String()).To(Equal("100m")) +} + +func TestResourceLabels_ChainAndRoleStampedUnconditionally(t *testing.T) { + g := NewWithT(t) + tests := []struct { + name string + mutate func(*seiv1alpha1.SeiNode) + expected string + }{ + {"validator", func(n *seiv1alpha1.SeiNode) { n.Spec.Validator = &seiv1alpha1.ValidatorSpec{} }, "validator"}, + {"archive", func(n *seiv1alpha1.SeiNode) { n.Spec.Archive = &seiv1alpha1.ArchiveSpec{} }, "archive"}, + {"fullNode", func(n *seiv1alpha1.SeiNode) { n.Spec.FullNode = &seiv1alpha1.FullNodeSpec{} }, "node"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: "n", Namespace: "ns"}, + Spec: seiv1alpha1.SeiNodeSpec{ChainID: "pacific-1", Image: "ghcr.io/sei-protocol/seid:latest"}, + } + tt.mutate(node) + + labels := ResourceLabels(node) + + g.Expect(labels).To(HaveKeyWithValue("sei.io/chain", "pacific-1")) + 
g.Expect(labels).To(HaveKeyWithValue("sei.io/role", tt.expected)) + }) + } +} + +func TestResourceLabels_ChainOmittedWhenChainIDEmpty(t *testing.T) { + g := NewWithT(t) + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{Name: "n", Namespace: "ns"}, + Spec: seiv1alpha1.SeiNodeSpec{FullNode: &seiv1alpha1.FullNodeSpec{}}, + } + + labels := ResourceLabels(node) + + g.Expect(labels).NotTo(HaveKey("sei.io/chain")) +} + +func TestResourceLabels_NotInSelector(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + + // Chain + role must NOT live in the immutable StatefulSet selector + // — otherwise renaming the chain (rare) or rolling between modes + // would require StatefulSet recreation. Only sei.io/node belongs + // in the selector. + g.Expect(sts.Spec.Selector.MatchLabels).NotTo(HaveKey("sei.io/chain")) + g.Expect(sts.Spec.Selector.MatchLabels).NotTo(HaveKey("sei.io/role")) + g.Expect(sts.Spec.Selector.MatchLabels).To(HaveLen(1)) +} + +func TestCosmosExporter_PodLabelAbsentByDefault(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + + g.Expect(sts.Spec.Template.Labels).NotTo(HaveKey("monitoring.sei.io/cosmos-exporter")) +} + +func TestCosmosExporter_PodLabelPresentWhenOptedIn(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + + g.Expect(sts.Spec.Template.Labels).To(HaveKeyWithValue("monitoring.sei.io/cosmos-exporter", "enabled")) +} + +func TestCosmosExporter_PodLabelNotOnSelector(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + + // The scrape 
label must NOT live in the immutable StatefulSet + // selector — otherwise toggling cosmos-exporter on or off would + // require StatefulSet recreation. + g.Expect(sts.Spec.Selector.MatchLabels).NotTo(HaveKey("monitoring.sei.io/cosmos-exporter")) +} + +func TestCosmosExporter_NonRootSecurityContext(t *testing.T) { + g := NewWithT(t) + node := newSnapshotNode("ce-0", "default") + node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + + sts := mustGenerateStatefulSet(t, node, platformtest.Config()) + ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) + + g.Expect(ce.SecurityContext).NotTo(BeNil()) + g.Expect(ce.SecurityContext.RunAsNonRoot).NotTo(BeNil()) + g.Expect(*ce.SecurityContext.RunAsNonRoot).To(BeTrue()) + g.Expect(*ce.SecurityContext.RunAsUser).To(Equal(int64(65532))) +} diff --git a/internal/platform/platform.go b/internal/platform/platform.go index 713d2c6..5f3e4e1 100644 --- a/internal/platform/platform.go +++ b/internal/platform/platform.go @@ -51,6 +51,14 @@ type Config struct { KubeRBACProxyImage string SidecarImage string + + // CosmosExporterImage is the default sei-cosmos-exporter image used + // for the in-pod cosmos-exporter sidecar. Optional: only required + // when at least one SeiNode opts in via .spec.cosmosExporter. Empty + // values cause buildCosmosExporterContainer to fall back to a + // per-node Spec.CosmosExporter.Image override; if neither is set + // the controller refuses to build the container. + CosmosExporterImage string } // NodepoolForMode returns the Karpenter NodePool name for the given diff --git a/internal/platform/platformtest/config.go b/internal/platform/platformtest/config.go index 37de0ba..0497c69 100644 --- a/internal/platform/platformtest/config.go +++ b/internal/platform/platformtest/config.go @@ -34,5 +34,8 @@ func Config() platform.Config { // Arbitrary fixture; not authoritative. 
Production digest is set // via SEI_SIDECAR_IMAGE in the platform repo's controller Deployment. SidecarImage: "ghcr.io/sei-protocol/seictl@sha256:a2af4e1b8ed4c12661a3c98cce050bae3f292cc7560abc2ba98fd7dfc80d9be5", + // Production digest is set via SEI_COSMOS_EXPORTER_IMAGE in the + // platform repo's controller Deployment. + CosmosExporterImage: "ghcr.io/sei-protocol/sei-cosmos-exporter@sha256:0000000000000000000000000000000000000000000000000000000000000000", } } diff --git a/manifests/sei.io_seinodedeployments.yaml b/manifests/sei.io_seinodedeployments.yaml index 329ad8d..60a8523 100644 --- a/manifests/sei.io_seinodedeployments.yaml +++ b/manifests/sei.io_seinodedeployments.yaml @@ -225,6 +225,90 @@ spec: description: ChainID of the chain this node belongs to. minLength: 1 type: string + cosmosExporter: + description: |- + CosmosExporter, if set, runs sei-cosmos-exporter as an in-pod + sidecar container scraping seid's local Cosmos gRPC + Tendermint + RPC and exposing Prometheus metrics on :9300. The companion + PodMonitor (emitted by the SeiNodeDeployment controller when its + .spec.monitoring.cosmosExporter is also set) discovers and + scrapes these endpoints. + properties: + image: + description: |- + Image overrides the cosmos-exporter container image. When unset, + the controller falls back to the SEI_COSMOS_EXPORTER_IMAGE + platform default (mirrors how Sidecar.Image works). Primarily a + debug/canary knob — production deployments rely on the platform + default so image bumps are a single env-var change on the + operator Deployment. + type: string + resources: + description: |- + Resources defines CPU/memory requests and limits for the + cosmos-exporter container. When unset, the controller applies + scrape-friendly defaults (50m CPU req, 64Mi mem req, 384Mi mem + limit, no CPU limit — CPU limits turn scrape-time pulls into + visible scrape gaps under throttling). 
+ properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This field depends on the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry + in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. 
+ More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. diff --git a/manifests/sei.io_seinodes.yaml b/manifests/sei.io_seinodes.yaml index dad2a1c..79bb1b6 100644 --- a/manifests/sei.io_seinodes.yaml +++ b/manifests/sei.io_seinodes.yaml @@ -87,6 +87,89 @@ spec: description: ChainID of the chain this node belongs to. minLength: 1 type: string + cosmosExporter: + description: |- + CosmosExporter, if set, runs sei-cosmos-exporter as an in-pod + sidecar container scraping seid's local Cosmos gRPC + Tendermint + RPC and exposing Prometheus metrics on :9300. The companion + PodMonitor (emitted by the SeiNodeDeployment controller when its + .spec.monitoring.cosmosExporter is also set) discovers and + scrapes these endpoints. + properties: + image: + description: |- + Image overrides the cosmos-exporter container image. When unset, + the controller falls back to the SEI_COSMOS_EXPORTER_IMAGE + platform default (mirrors how Sidecar.Image works). Primarily a + debug/canary knob — production deployments rely on the platform + default so image bumps are a single env-var change on the + operator Deployment. + type: string + resources: + description: |- + Resources defines CPU/memory requests and limits for the + cosmos-exporter container. When unset, the controller applies + scrape-friendly defaults (50m CPU req, 64Mi mem req, 384Mi mem + limit, no CPU limit — CPU limits turn scrape-time pulls into + visible scrape gaps under throttling). + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This field depends on the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. 
+ properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. + type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + type: object dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. 
From e3a4b554cabff59427f93819c74f85c137de2611 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 15 May 2026 15:48:50 -0700 Subject: [PATCH 2/5] refactor: collapse CosmosExporterConfig to bool + trim comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per-node Image and Resources overrides removed: fleet uniformity is the contract, and the CRD shouldn't expose knobs nobody should turn. Image is operator-controlled via SEI_COSMOS_EXPORTER_IMAGE; resources are constants in defaultCosmosExporterResources. SeiNodeSpec.CosmosExporter is now `bool` — reads as what it is, a feature toggle. Spec: `cosmosExporter: true` instead of `{}`. Also trims verbose doc comments throughout to load-bearing only: - Why no CPU limit (scrape-time gRPC pulls) - Why TCP probe on seid gRPC (cosmos-exporter Fatal()s otherwise) - Why /tmp emptyDir (RoRootFS insurance) Net diff: -498 LOC (mostly CRD YAML for the removed struct fields). --- .council/workstream.yaml | 127 ++++++++++++++++++++ api/v1alpha1/common_types.go | 34 ------ api/v1alpha1/seinode_types.go | 10 +- api/v1alpha1/zz_generated.deepcopy.go | 25 ---- config/crd/sei.io_seinodedeployments.yaml | 85 +------------- config/crd/sei.io_seinodes.yaml | 84 +------------- internal/noderesource/noderesource.go | 129 +++++---------------- internal/noderesource/noderesource_test.go | 68 ++--------- internal/platform/platform.go | 8 +- internal/platform/platformtest/config.go | 2 - manifests/sei.io_seinodedeployments.yaml | 85 +------------- manifests/sei.io_seinodes.yaml | 84 +------------- 12 files changed, 185 insertions(+), 556 deletions(-) create mode 100644 .council/workstream.yaml diff --git a/.council/workstream.yaml b/.council/workstream.yaml new file mode 100644 index 0000000..f7b70fb --- /dev/null +++ b/.council/workstream.yaml @@ -0,0 +1,127 @@ +workstream: + description: "ValidationRun CRD LLD — design pass for ephemeral-chain validation workloads on Harbor" + tier: component + 
started: "2026-04-28T20:00:00Z" + updated: "2026-04-28T22:30:00Z" + source_issue: "https://github.com/sei-protocol/sei-k8s-controller/issues/139" + companion_issue: "https://github.com/sei-protocol/platform/issues/235" + decision_posture: "design pass — implementation is separate workstreams" + +phases: + - name: "Foundation" + status: completed + completed_at: "2026-04-28T20:00:00Z" + outputs: + - ".council/workstream.yaml" + + - name: "Round 1 — Parallel specialist input gathering" + status: completed + completed_at: "2026-04-28T20:30:00Z" + outputs: + - "/tmp/round1/pm-scope-cuts.md" + - "/tmp/round1/otel-rule-semantics.md" + - "/tmp/round1/platform-integration.md" + + - name: "Round 2 — LLD draft synthesis (kubernetes-specialist)" + status: completed + completed_at: "2026-04-28T21:00:00Z" + outputs: + - "/tmp/lld-draft.md (~1300 lines, 8500 words)" + + - name: "Coral mid-council escalation — Argo Workflows pivot question" + status: completed + completed_at: "2026-04-28T22:00:00Z" + outputs: + - "Unanimous Path X (custom CRD) recommendation from all four specialists" + - "Argo un-defer triggers documented for Future Work section" + notes: | + User asked late in design pass whether to abandon custom CRD for Argo + Workflows. Engaged all four specialists in parallel. Unanimous: Path X. + Load-bearing reason: Argo can't natively own non-Pod CRDs cascade-style; + SND-as-cascading-child is central organizing primitive. Continuous-mode + polling makes Argo's per-step pod cost prohibitive (300 pods/run worst + case). Argo not currently deployed on Harbor — adoption is a multi-week + platform commitment. Re-open conversation on documented triggers. 
+ + - name: "One-way-door gate — CRD field name approvals" + status: completed + completed_at: "2026-04-28T22:30:00Z" + decisions: + - "API group: validation.sei.io (separate from sei.io)" + - "Discriminator field: spec.type (not spec.kind)" + - "chain.validators + chain.fullNodes: both required, type SeiNodeDeploymentSpec" + - "endpointPolicy field dropped; fullNodes-fleet endpoints always used" + - "Reserved env vars rejected at admission via CEL XValidation (not silent override)" + - "Failed vs Error distinct phase semantics; heartbeat alert ignores Error" + - "Single Succeeded condition (Tekton-style) + TestComplete condition (workload-completion boundary)" + - "Spec immutability via CEL self == oldSelf on substantive fields" + - "query.threshold typed as string with regex CEL" + - "Mode field dropped; continuous-only; runProperties.interval ships in v1 (default 30s)" + - "runProperties.stopOnFailure implemented in v1 (default false)" + - "S3 prefix s3://harbor-validation-results/{namespace}/{job}/{runId}/ locked" + - "No Flux labels on owned children (controller invariant)" + - "Tenant pre-provisions ServiceAccounts; controller never IAM" + - "alert.ruleRef cross-namespace allowed only into label-allowlisted namespaces" + - ".status.report.raw dropped entirely; .status.report.s3Url only" + - "Workload exit codes 1/2 aligned with Phase 1 contract" + - "validation_run_terminal_total{namespace, name, verdict} cardinality exception with downgrade path" + architectural_refinements: + - "Plan task count collapsed 10 → 7 (monitor-run combines wait-job-terminal + resolve-rules + evaluate-rules; mark-done absorbs collect-report's S3 URL stamp)" + - "monitor-run is the central polling task — single async loop, idempotent state in .status, cooperative Job cancel on stopOnFailure" + - "Conditions[TestComplete] as first-class status condition at workload-completion boundary" + - "Per-rule status gains lastEvaluatedAt + nextEvaluationAt for idempotent polling resumption" + 
+ - name: "LLD revision pass + capture" + status: in_progress + started_at: "2026-04-28T22:30:00Z" + plan: "kubernetes-specialist revises /tmp/lld-draft.md with gate decisions; output to /tmp/lld-final.md; /design captures into docs/design/validation-run-lld.md" + + - name: "Cross-review" + status: pending + blocked_by: "LLD captured in repo" + plan: "all four specialists sign off; resolve findings before close" + + - name: "Implementation handoff issues — companion sub-issues" + status: completed + completed_at: "2026-04-29T16:00:00Z" + outputs: + - "sei-protocol/sei-k8s-controller#144 — SND readiness includes catching_up" + - "sei-protocol/sei-k8s-controller#145 — SND admission validation rejects genesis on fullNode" + - "sei-protocol/sei-k8s-controller#146 — TaskPlan.TargetPhase generalization decision" + - "sei-protocol/sei-k8s-controller#147 — status-patch optimistic concurrency invariant" + - "sei-protocol/platform#243 — PodMonitor for sei-k8s-controller" + - "sei-protocol/platform#244 — heartbeat PrometheusRule on consecutive failures" + - "sei-protocol/platform#245 — label monitoring/ namespace with validation-shared-rules" + notes: | + The 7 companion sub-issues from the LLD's Open Dependencies section + are filed as independent workstreams. Implementation handoff issues + for the controllers themselves (types + deepcopy, reconcilers, + planner integration) will be filed when PR #143 review locks + direction. 
+ + - name: "Implementation handoff — controller code" + status: pending + blocked_by: "PR #143 review feedback locks design direction" + plan: "file ~6 controller implementation issues once LLD is approved: types+deepcopy, ValidationOrchestrationReconciler, ValidationLoadGenerationReconciler, planner builders for ValidationOrchestrationPlan and ValidationLoadGenerationPlan, monitor-task-completion task, RBAC + opt-in deployment plumbing" + + - name: "Close council" + status: in_progress + plan: "verify all PR #143 review-blocking acceptance criteria from #139 are met OR explicitly tracked; archive workstream" + +outstanding_findings: [] +escalations: [] + +acceptance_criteria_from_issue_139: + - "LLD merged in docs/design/ — IN PROGRESS (PR pending)" + - "LLD answers six open questions from OSS survey — DONE" + - "LLD enumerates five one-way-door warnings and how design avoids each — DONE" + - "Cross-review sign-offs from all four specialists — PENDING" + - "Design serves both Phase 1 consumers (seiload + qa-testing) — DONE" + - "Workload contract from platform#235 referenced as authoritative spec.workload envelope — DONE" + - "Implementation handoff issues filed — PENDING" + +constraints: + - "CRD field names are one-way doors per CLAUDE.md — must clear cross-review + user approval — RESOLVED" + - "Existing planner architecture is load-bearing — reuse plan-driven reconciler pattern — DONE" + - "OSS conventions take precedence over novelty — DONE" + - "Shadow-replayer stays out — not open for re-litigation in this council — DONE" diff --git a/api/v1alpha1/common_types.go b/api/v1alpha1/common_types.go index 4016cbb..cc4c9dd 100644 --- a/api/v1alpha1/common_types.go +++ b/api/v1alpha1/common_types.go @@ -199,40 +199,6 @@ type SidecarConfig struct { TLS *SidecarTLSSpec `json:"tls,omitempty"` } -// CosmosExporterConfig enables an in-pod sei-cosmos-exporter container -// that polls seid's local Cosmos gRPC + Tendermint RPC and exposes -// Prometheus metrics on a fixed 
node-local port. Presence of this -// struct (even empty) enables the container; absence leaves the pod -// alone. -// -// Designed as a sidecar in the same pod so the exporter's -// localhost-bound RPC assumption holds, per-pod identity (`pod` label) -// flows through into metrics, and Karpenter consolidation never -// collides on the listen port (which a DaemonSet would). -// -// The exporter always binds the upstream default port (9300) and the -// container exposes it as the named port `cosmos-metrics`. Platform -// PodMonitors target the named port, so per-node port overrides have -// no use — the field is intentionally absent here. -type CosmosExporterConfig struct { - // Image overrides the cosmos-exporter container image. When unset, - // the controller falls back to the SEI_COSMOS_EXPORTER_IMAGE - // platform default (mirrors how Sidecar.Image works). Primarily a - // debug/canary knob — production deployments rely on the platform - // default so image bumps are a single env-var change on the - // operator Deployment. - // +optional - Image string `json:"image,omitempty"` - - // Resources defines CPU/memory requests and limits for the - // cosmos-exporter container. When unset, the controller applies - // scrape-friendly defaults (50m CPU req, 64Mi mem req, 384Mi mem - // limit, no CPU limit — CPU limits turn scrape-time pulls into - // visible scrape gaps under throttling). - // +optional - Resources *corev1.ResourceRequirements `json:"resources,omitempty"` -} - // SidecarTLSSpec configures the cert-manager-issued serving cert for // the kube-rbac-proxy fronting. 
type SidecarTLSSpec struct { diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index 324f43f..384935b 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -45,14 +45,10 @@ type SeiNodeSpec struct { // +optional Sidecar *SidecarConfig `json:"sidecar,omitempty"` - // CosmosExporter, if set, runs sei-cosmos-exporter as an in-pod - // sidecar container scraping seid's local Cosmos gRPC + Tendermint - // RPC and exposing Prometheus metrics on :9300. The companion - // PodMonitor (emitted by the SeiNodeDeployment controller when its - // .spec.monitoring.cosmosExporter is also set) discovers and - // scrapes these endpoints. + // CosmosExporter runs sei-cosmos-exporter as an in-pod sidecar. + // Image and resources are operator-controlled; this is a toggle. // +optional - CosmosExporter *CosmosExporterConfig `json:"cosmosExporter,omitempty"` + CosmosExporter bool `json:"cosmosExporter,omitempty"` // PodLabels are additional labels merged into the StatefulSet pod template. // The controller always sets sei.io/node; these are additive and applied diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 8e30744..efa7970 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -31,26 +31,6 @@ func (in *ArchiveSpec) DeepCopy() *ArchiveSpec { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *CosmosExporterConfig) DeepCopyInto(out *CosmosExporterConfig) { - *out = *in - if in.Resources != nil { - in, out := &in.Resources, &out.Resources - *out = new(v1.ResourceRequirements) - (*in).DeepCopyInto(*out) - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CosmosExporterConfig. 
-func (in *CosmosExporterConfig) DeepCopy() *CosmosExporterConfig { - if in == nil { - return nil - } - out := new(CosmosExporterConfig) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *DataVolumeImport) DeepCopyInto(out *DataVolumeImport) { *out = *in @@ -930,11 +910,6 @@ func (in *SeiNodeSpec) DeepCopyInto(out *SeiNodeSpec) { *out = new(SidecarConfig) (*in).DeepCopyInto(*out) } - if in.CosmosExporter != nil { - in, out := &in.CosmosExporter, &out.CosmosExporter - *out = new(CosmosExporterConfig) - (*in).DeepCopyInto(*out) - } if in.PodLabels != nil { in, out := &in.PodLabels, &out.PodLabels *out = make(map[string]string, len(*in)) diff --git a/config/crd/sei.io_seinodedeployments.yaml b/config/crd/sei.io_seinodedeployments.yaml index 60a8523..963ec3b 100644 --- a/config/crd/sei.io_seinodedeployments.yaml +++ b/config/crd/sei.io_seinodedeployments.yaml @@ -227,88 +227,9 @@ spec: type: string cosmosExporter: description: |- - CosmosExporter, if set, runs sei-cosmos-exporter as an in-pod - sidecar container scraping seid's local Cosmos gRPC + Tendermint - RPC and exposing Prometheus metrics on :9300. The companion - PodMonitor (emitted by the SeiNodeDeployment controller when its - .spec.monitoring.cosmosExporter is also set) discovers and - scrapes these endpoints. - properties: - image: - description: |- - Image overrides the cosmos-exporter container image. When unset, - the controller falls back to the SEI_COSMOS_EXPORTER_IMAGE - platform default (mirrors how Sidecar.Image works). Primarily a - debug/canary knob — production deployments rely on the platform - default so image bumps are a single env-var change on the - operator Deployment. - type: string - resources: - description: |- - Resources defines CPU/memory requests and limits for the - cosmos-exporter container. 
When unset, the controller applies - scrape-friendly defaults (50m CPU req, 64Mi mem req, 384Mi mem - limit, no CPU limit — CPU limits turn scrape-time pulls into - visible scrape gaps under throttling). - properties: - claims: - description: |- - Claims lists the names of resources, defined in spec.resourceClaims, - that are used by this container. - - This field depends on the - DynamicResourceAllocation feature gate. - - This field is immutable. It can only be set for containers. - items: - description: ResourceClaim references one entry - in PodSpec.ResourceClaims. - properties: - name: - description: |- - Name must match the name of one entry in pod.spec.resourceClaims of - the Pod where this field is used. It makes that resource available - inside a container. - type: string - request: - description: |- - Request is the name chosen for a request in the referenced claim. - If empty, everything from the claim is made available, otherwise - only the result of this request. - type: string - required: - - name - type: object - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Limits describes the maximum amount of compute resources allowed. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Requests describes the minimum amount of compute resources required. 
- If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - otherwise to an implementation-defined value. Requests cannot exceed Limits. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - type: object - type: object + CosmosExporter runs sei-cosmos-exporter as an in-pod sidecar. + Image and resources are operator-controlled; this is a toggle. + type: boolean dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. diff --git a/config/crd/sei.io_seinodes.yaml b/config/crd/sei.io_seinodes.yaml index 79bb1b6..d465abe 100644 --- a/config/crd/sei.io_seinodes.yaml +++ b/config/crd/sei.io_seinodes.yaml @@ -89,87 +89,9 @@ spec: type: string cosmosExporter: description: |- - CosmosExporter, if set, runs sei-cosmos-exporter as an in-pod - sidecar container scraping seid's local Cosmos gRPC + Tendermint - RPC and exposing Prometheus metrics on :9300. The companion - PodMonitor (emitted by the SeiNodeDeployment controller when its - .spec.monitoring.cosmosExporter is also set) discovers and - scrapes these endpoints. - properties: - image: - description: |- - Image overrides the cosmos-exporter container image. When unset, - the controller falls back to the SEI_COSMOS_EXPORTER_IMAGE - platform default (mirrors how Sidecar.Image works). Primarily a - debug/canary knob — production deployments rely on the platform - default so image bumps are a single env-var change on the - operator Deployment. - type: string - resources: - description: |- - Resources defines CPU/memory requests and limits for the - cosmos-exporter container. When unset, the controller applies - scrape-friendly defaults (50m CPU req, 64Mi mem req, 384Mi mem - limit, no CPU limit — CPU limits turn scrape-time pulls into - visible scrape gaps under throttling). 
- properties: - claims: - description: |- - Claims lists the names of resources, defined in spec.resourceClaims, - that are used by this container. - - This field depends on the - DynamicResourceAllocation feature gate. - - This field is immutable. It can only be set for containers. - items: - description: ResourceClaim references one entry in PodSpec.ResourceClaims. - properties: - name: - description: |- - Name must match the name of one entry in pod.spec.resourceClaims of - the Pod where this field is used. It makes that resource available - inside a container. - type: string - request: - description: |- - Request is the name chosen for a request in the referenced claim. - If empty, everything from the claim is made available, otherwise - only the result of this request. - type: string - required: - - name - type: object - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Limits describes the maximum amount of compute resources allowed. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Requests describes the minimum amount of compute resources required. - If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - otherwise to an implementation-defined value. Requests cannot exceed Limits. 
- More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - type: object - type: object + CosmosExporter runs sei-cosmos-exporter as an in-pod sidecar. + Image and resources are operator-controlled; this is a toggle. + type: boolean dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. diff --git a/internal/noderesource/noderesource.go b/internal/noderesource/noderesource.go index 41a6b1c..00e44d0 100644 --- a/internal/noderesource/noderesource.go +++ b/internal/noderesource/noderesource.go @@ -26,18 +26,12 @@ const ( // NodeLabel is the standard label key used on all SeiNode-owned resources. NodeLabel = "sei.io/node" - // chainLabel is the per-pod label carrying the SeiNode's ChainID. - // Platform-owned ServiceMonitors and PodMonitors lift this into the - // `chain_id` metric label via __meta_kubernetes_pod_label_sei_io_chain - // relabeling. NOT in the StatefulSet selector — chain doesn't change - // on a running SeiNode and pod-template-only stamping keeps the - // selector immutable. + // chainLabel and roleLabel are observability identity labels lifted + // into `chain_id` and `component` metric labels by platform-owned + // (Pod|Service)Monitor relabelings. Pod-template only, never in the + // StatefulSet selector. chainLabel = "sei.io/chain" - - // roleLabel is the per-pod label carrying the node mode. Values: - // validator, archive, replayer, node. Same lift-via-relabel pattern - // as chainLabel; surfaces as the `component` metric label. - roleLabel = "sei.io/role" + roleLabel = "sei.io/role" dataDir = platform.DataDir @@ -94,17 +88,10 @@ const ( sidecarNonRootUID int64 = 65532 // defaultCosmosExporterPort matches sei-cosmos-exporter's upstream - // default (main.go:302) and the legacy EC2 - // install_sei_cosmos_exporter.sh:9300. 
Fixed — no per-node override: - // the platform PodMonitor selects via the named port - // `cosmos-metrics`, so the value is opaque to consumers. + // default. Platform PodMonitors target the named port `cosmos-metrics`. defaultCosmosExporterPort int32 = 9300 - // cosmosExporterScrapeLabel is the pod label stamped on every - // pod opting into cosmos-exporter scraping. A single cluster-wide - // PodMonitor in the platform repo (clusters//monitoring/) - // selects on this label and discovers all opted-in pods across - // namespaces. + // cosmosExporterScrapeLabel is the platform PodMonitor's selector. cosmosExporterScrapeLabel = "monitoring.sei.io/cosmos-exporter" cosmosExporterScrapeLabelValue = "enabled" ) @@ -130,42 +117,25 @@ func SelectorLabels(node *seiv1alpha1.SeiNode) map[string]string { } // ResourceLabels returns labels for the StatefulSet pod template. -// User-provided podLabels are applied first; system labels (sei.io/node -// and observability discovery labels) are set last so they cannot be -// overridden. +// User-provided podLabels are applied first; system labels win. func ResourceLabels(node *seiv1alpha1.SeiNode) map[string]string { labels := make(map[string]string, len(node.Spec.PodLabels)+4) maps.Copy(labels, node.Spec.PodLabels) labels[NodeLabel] = node.Name - // sei.io/chain and sei.io/role are observability identity labels: - // platform-owned PodMonitors and ServiceMonitors lift them into - // metric labels (chain_id, component) via - // __meta_kubernetes_pod_label_* relabeling instead of carrying the - // values in their own YAML. Stamped unconditionally so every seid - // pod is queryable by chain and role regardless of which exporter - // is enabled. 
if node.Spec.ChainID != "" { labels[chainLabel] = node.Spec.ChainID } if role := deriveRole(node); role != "" { labels[roleLabel] = role } - // Platform-owned discovery label: a single PodMonitor per chain - // in the observability stack selects on this and scrapes the - // cosmos-exporter sidecar's /metrics/* endpoints. Stamping the - // label here keeps the per-pod opt-in concern in the controller - // while the scrape policy stays in the platform repo (no per-SND - // PodMonitor reconcile loop). if CosmosExporterEnabled(node) { labels[cosmosExporterScrapeLabel] = cosmosExporterScrapeLabelValue } return labels } -// deriveRole returns the role label value for the node mode, matching -// the values nodedeployment.deriveComponent emits for ServiceMonitor -// relabeling. Centralized here so the pod-label and the relabel-output -// stay in lock-step. Empty string when no mode is set. +// deriveRole mirrors nodedeployment.deriveComponent so the pod label +// and the ServiceMonitor relabel-output stay in lock-step. func deriveRole(node *seiv1alpha1.SeiNode) string { switch { case node.Spec.Validator != nil: @@ -519,7 +489,7 @@ func buildNodePodSpec(node *seiv1alpha1.SeiNode, p PlatformConfig) (corev1.PodSp spec.InitContainers = initContainers containers := []corev1.Container{buildSidecarMainContainer(node, p)} if CosmosExporterEnabled(node) { - ceContainer, err := buildCosmosExporterContainer(node, p) + ceContainer, err := buildCosmosExporterContainer(p) if err != nil { return corev1.PodSpec{}, err } @@ -635,26 +605,13 @@ func buildSidecarMainContainer(node *seiv1alpha1.SeiNode, p PlatformConfig) core } // CosmosExporterEnabled reports whether the SeiNode opts into running -// the sei-cosmos-exporter sidecar via spec.cosmosExporter (any non-nil -// value, even an empty struct, enables it). +// the sei-cosmos-exporter sidecar. 
func CosmosExporterEnabled(node *seiv1alpha1.SeiNode) bool { - return node.Spec.CosmosExporter != nil -} - -func cosmosExporterImage(node *seiv1alpha1.SeiNode, p PlatformConfig) string { - if node.Spec.CosmosExporter != nil && node.Spec.CosmosExporter.Image != "" { - return node.Spec.CosmosExporter.Image - } - return p.CosmosExporterImage + return node.Spec.CosmosExporter } -// defaultCosmosExporterResources returns scrape-friendly defaults. No -// CPU limit on purpose: cosmos-exporter pulls from seid's local gRPC -// on every scrape, and CPU throttling at the limit boundary turns those -// pulls into visible scrape gaps. Memory ceiling caps blast radius. -// 384Mi target headroom for /metrics/validators paginated delegation -// pulls on large-validator-set chains (Limit defaults to 1000 in -// upstream cosmos-exporter at main.go:305). +// defaultCosmosExporterResources: no CPU limit — cosmos-exporter calls +// seid's gRPC on every scrape; throttling turns into visible scrape gaps. func defaultCosmosExporterResources() corev1.ResourceRequirements { return corev1.ResourceRequirements{ Requests: corev1.ResourceList{ @@ -667,59 +624,35 @@ func defaultCosmosExporterResources() corev1.ResourceRequirements { } } -// buildCosmosExporterContainer renders the sei-cosmos-exporter sidecar -// that polls seid's local Cosmos gRPC (localhost:9090) + Tendermint RPC -// (http://localhost:26657) and exposes Prometheus metrics on :9300. -// Discovered by a platform-owned PodMonitor selecting on the -// monitoring.sei.io/cosmos-exporter pod label. -// -// Args track sei-infra's legacy systemd unit -// (sei-cosmos-exporter/install-sei-cosmos-exporter.sh): usei bond denom, -// 1_000_000 coefficient (6-decimal display), `sei` bech32 prefix. -// -// Returns an error when no image source is configured (neither the -// per-node Image override nor the platform-level SEI_COSMOS_EXPORTER_IMAGE -// env): an empty Image field would surface as a confusing ErrImagePull -// at pod scheduling time. 
Mirrors the KubeRBACProxyImage fail-fast -// pattern in GenerateStatefulSet. -func buildCosmosExporterContainer(node *seiv1alpha1.SeiNode, p PlatformConfig) (corev1.Container, error) { - image := cosmosExporterImage(node, p) - if image == "" { - return corev1.Container{}, fmt.Errorf("cosmos-exporter image is required: set SEI_COSMOS_EXPORTER_IMAGE on the operator Deployment or override .spec.cosmosExporter.image on the SeiNode") +// buildCosmosExporterContainer renders the cosmos-exporter sidecar. +// Image, args, port, and resources are fixed — no per-node knobs. +func buildCosmosExporterContainer(p PlatformConfig) (corev1.Container, error) { + if p.CosmosExporterImage == "" { + return corev1.Container{}, fmt.Errorf("SEI_COSMOS_EXPORTER_IMAGE is required on the operator Deployment when any SeiNode sets spec.cosmosExporter: true") } - c := corev1.Container{ + return corev1.Container{ Name: containerNameCosmosExporter, - Image: image, + Image: p.CosmosExporterImage, Args: []string{ "--denom", "usei", "--denom-coefficient", "1000000", "--bech-prefix", "sei", "--listen-address", fmt.Sprintf(":%d", defaultCosmosExporterPort), - // --node defaults to localhost:9090 (Cosmos gRPC). - // --tendermint-rpc defaults to http://localhost:26657. - // Both are pod-local because the exporter runs in the same - // pod as seid; no override needed. + // --node and --tendermint-rpc default to localhost; the + // exporter shares the pod's net ns with seid. }, Ports: []corev1.ContainerPort{ {Name: "cosmos-metrics", ContainerPort: defaultCosmosExporterPort, Protocol: corev1.ProtocolTCP}, }, SecurityContext: sidecarSecurityContext(), Resources: defaultCosmosExporterResources(), - // Distroless + ReadOnlyRootFilesystem means /tmp would EROFS if - // the Go runtime or gRPC stack ever stages there. Reuse the - // sidecar-tmp emptyDir already in the pod's volume list at - // buildNodePodSpec — cheap insurance. + // /tmp: distroless + ReadOnlyRootFilesystem EROFS insurance. 
VolumeMounts: []corev1.VolumeMount{ {Name: sidecarTmpVolumeName, MountPath: "/tmp"}, }, - // cosmos-exporter calls setChainID() + setDenom() against seid's - // gRPC at startup (main.go:162-163) and log.Fatal()s on first - // dial failure (main.go:159). Without a startup gate, the - // exporter crash-loops until seid's gRPC is up — cosmetic but - // noisy in dashboards. TCP-probe seid's gRPC port and treat - // that as our readiness-to-start signal. seiconfig.PortGRPC is - // unconditionally exposed by buildNodeMainContainer via - // ContainerPorts(). + // cosmos-exporter Fatal()s on its initial gRPC dial. Gate + // startup on seid's gRPC port so we don't crash-loop until + // seid is up. StartupProbe: &corev1.Probe{ ProbeHandler: corev1.ProbeHandler{ TCPSocket: &corev1.TCPSocketAction{ @@ -730,11 +663,7 @@ func buildCosmosExporterContainer(node *seiv1alpha1.SeiNode, p PlatformConfig) ( PeriodSeconds: 5, FailureThreshold: 60, }, - } - if node.Spec.CosmosExporter != nil && node.Spec.CosmosExporter.Resources != nil { - c.Resources = *node.Spec.CosmosExporter.Resources - } - return c, nil + }, nil } func sidecarWaitCommand(node *seiv1alpha1.SeiNode) (command []string, args []string) { diff --git a/internal/noderesource/noderesource_test.go b/internal/noderesource/noderesource_test.go index f5e320a..6ae4338 100644 --- a/internal/noderesource/noderesource_test.go +++ b/internal/noderesource/noderesource_test.go @@ -1307,7 +1307,7 @@ func TestCosmosExporter_AbsentByDefault(t *testing.T) { func TestCosmosExporter_PresentWhenOptedIn(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) @@ -1319,7 +1319,7 @@ func TestCosmosExporter_PresentWhenOptedIn(t *testing.T) { func TestCosmosExporter_DefaultImage(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - 
node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) @@ -1327,35 +1327,10 @@ func TestCosmosExporter_DefaultImage(t *testing.T) { g.Expect(ce.Image).To(Equal(platformtest.Config().CosmosExporterImage)) } -func TestCosmosExporter_CustomImageOverride(t *testing.T) { - g := NewWithT(t) - node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{Image: "custom/exporter:v9"} - - sts := mustGenerateStatefulSet(t, node, platformtest.Config()) - ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) - - g.Expect(ce.Image).To(Equal("custom/exporter:v9")) -} - -func TestCosmosExporter_DefaultPort(t *testing.T) { - g := NewWithT(t) - node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} - - sts := mustGenerateStatefulSet(t, node, platformtest.Config()) - ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) - - g.Expect(ce.Ports).To(HaveLen(1)) - g.Expect(ce.Ports[0].ContainerPort).To(Equal(int32(9300))) - g.Expect(ce.Ports[0].Name).To(Equal("cosmos-metrics")) - g.Expect(ce.Args).To(ContainElement(":9300")) -} - func TestCosmosExporter_PortIsFixed(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) @@ -1369,20 +1344,20 @@ func TestCosmosExporter_PortIsFixed(t *testing.T) { func TestCosmosExporter_ErrorWhenImageUnset(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + 
node.Spec.CosmosExporter = true cfg := platformtest.Config() cfg.CosmosExporterImage = "" _, err := GenerateStatefulSet(node, cfg) g.Expect(err).To(HaveOccurred()) - g.Expect(err.Error()).To(ContainSubstring("cosmos-exporter image is required")) + g.Expect(err.Error()).To(ContainSubstring("SEI_COSMOS_EXPORTER_IMAGE is required")) } func TestCosmosExporter_StartupProbeOnSeidGRPC(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) @@ -1397,7 +1372,7 @@ func TestCosmosExporter_StartupProbeOnSeidGRPC(t *testing.T) { func TestCosmosExporter_MountsTmpEmptyDir(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) @@ -1415,7 +1390,7 @@ func TestCosmosExporter_MountsTmpEmptyDir(t *testing.T) { func TestCosmosExporter_SeiArgs(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) @@ -1430,7 +1405,7 @@ func TestCosmosExporter_SeiArgs(t *testing.T) { func TestCosmosExporter_DefaultResources(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, 
containerNameCosmosExporter) @@ -1447,25 +1422,6 @@ func TestCosmosExporter_DefaultResources(t *testing.T) { g.Expect(hasCPULimit).To(BeFalse()) } -func TestCosmosExporter_CustomResourcesOverride(t *testing.T) { - g := NewWithT(t) - node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{ - Resources: &corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("100m"), - corev1.ResourceMemory: resource.MustParse("128Mi"), - }, - }, - } - - sts := mustGenerateStatefulSet(t, node, platformtest.Config()) - ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) - - cpuReq := ce.Resources.Requests[corev1.ResourceCPU] - g.Expect(cpuReq.String()).To(Equal("100m")) -} - func TestResourceLabels_ChainAndRoleStampedUnconditionally(t *testing.T) { g := NewWithT(t) tests := []struct { @@ -1532,7 +1488,7 @@ func TestCosmosExporter_PodLabelAbsentByDefault(t *testing.T) { func TestCosmosExporter_PodLabelPresentWhenOptedIn(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) @@ -1542,7 +1498,7 @@ func TestCosmosExporter_PodLabelPresentWhenOptedIn(t *testing.T) { func TestCosmosExporter_PodLabelNotOnSelector(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) @@ -1555,7 +1511,7 @@ func TestCosmosExporter_PodLabelNotOnSelector(t *testing.T) { func TestCosmosExporter_NonRootSecurityContext(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = &seiv1alpha1.CosmosExporterConfig{} + node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, 
platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) diff --git a/internal/platform/platform.go b/internal/platform/platform.go index 5f3e4e1..a7f2976 100644 --- a/internal/platform/platform.go +++ b/internal/platform/platform.go @@ -52,12 +52,8 @@ type Config struct { KubeRBACProxyImage string SidecarImage string - // CosmosExporterImage is the default sei-cosmos-exporter image used - // for the in-pod cosmos-exporter sidecar. Optional: only required - // when at least one SeiNode opts in via .spec.cosmosExporter. Empty - // values cause buildCosmosExporterContainer to fall back to a - // per-node Spec.CosmosExporter.Image override; if neither is set - // the controller refuses to build the container. + // CosmosExporterImage is the sei-cosmos-exporter sidecar image. + // Required when any SeiNode sets spec.cosmosExporter: true. CosmosExporterImage string } diff --git a/internal/platform/platformtest/config.go b/internal/platform/platformtest/config.go index 0497c69..a43891c 100644 --- a/internal/platform/platformtest/config.go +++ b/internal/platform/platformtest/config.go @@ -34,8 +34,6 @@ func Config() platform.Config { // Arbitrary fixture; not authoritative. Production digest is set // via SEI_SIDECAR_IMAGE in the platform repo's controller Deployment. SidecarImage: "ghcr.io/sei-protocol/seictl@sha256:a2af4e1b8ed4c12661a3c98cce050bae3f292cc7560abc2ba98fd7dfc80d9be5", - // Production digest is set via SEI_COSMOS_EXPORTER_IMAGE in the - // platform repo's controller Deployment. 
CosmosExporterImage: "ghcr.io/sei-protocol/sei-cosmos-exporter@sha256:0000000000000000000000000000000000000000000000000000000000000000", } } diff --git a/manifests/sei.io_seinodedeployments.yaml b/manifests/sei.io_seinodedeployments.yaml index 60a8523..963ec3b 100644 --- a/manifests/sei.io_seinodedeployments.yaml +++ b/manifests/sei.io_seinodedeployments.yaml @@ -227,88 +227,9 @@ spec: type: string cosmosExporter: description: |- - CosmosExporter, if set, runs sei-cosmos-exporter as an in-pod - sidecar container scraping seid's local Cosmos gRPC + Tendermint - RPC and exposing Prometheus metrics on :9300. The companion - PodMonitor (emitted by the SeiNodeDeployment controller when its - .spec.monitoring.cosmosExporter is also set) discovers and - scrapes these endpoints. - properties: - image: - description: |- - Image overrides the cosmos-exporter container image. When unset, - the controller falls back to the SEI_COSMOS_EXPORTER_IMAGE - platform default (mirrors how Sidecar.Image works). Primarily a - debug/canary knob — production deployments rely on the platform - default so image bumps are a single env-var change on the - operator Deployment. - type: string - resources: - description: |- - Resources defines CPU/memory requests and limits for the - cosmos-exporter container. When unset, the controller applies - scrape-friendly defaults (50m CPU req, 64Mi mem req, 384Mi mem - limit, no CPU limit — CPU limits turn scrape-time pulls into - visible scrape gaps under throttling). - properties: - claims: - description: |- - Claims lists the names of resources, defined in spec.resourceClaims, - that are used by this container. - - This field depends on the - DynamicResourceAllocation feature gate. - - This field is immutable. It can only be set for containers. - items: - description: ResourceClaim references one entry - in PodSpec.ResourceClaims. 
- properties: - name: - description: |- - Name must match the name of one entry in pod.spec.resourceClaims of - the Pod where this field is used. It makes that resource available - inside a container. - type: string - request: - description: |- - Request is the name chosen for a request in the referenced claim. - If empty, everything from the claim is made available, otherwise - only the result of this request. - type: string - required: - - name - type: object - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Limits describes the maximum amount of compute resources allowed. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Requests describes the minimum amount of compute resources required. - If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - otherwise to an implementation-defined value. Requests cannot exceed Limits. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - type: object - type: object + CosmosExporter runs sei-cosmos-exporter as an in-pod sidecar. + Image and resources are operator-controlled; this is a toggle. + type: boolean dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. 
diff --git a/manifests/sei.io_seinodes.yaml b/manifests/sei.io_seinodes.yaml index 79bb1b6..d465abe 100644 --- a/manifests/sei.io_seinodes.yaml +++ b/manifests/sei.io_seinodes.yaml @@ -89,87 +89,9 @@ spec: type: string cosmosExporter: description: |- - CosmosExporter, if set, runs sei-cosmos-exporter as an in-pod - sidecar container scraping seid's local Cosmos gRPC + Tendermint - RPC and exposing Prometheus metrics on :9300. The companion - PodMonitor (emitted by the SeiNodeDeployment controller when its - .spec.monitoring.cosmosExporter is also set) discovers and - scrapes these endpoints. - properties: - image: - description: |- - Image overrides the cosmos-exporter container image. When unset, - the controller falls back to the SEI_COSMOS_EXPORTER_IMAGE - platform default (mirrors how Sidecar.Image works). Primarily a - debug/canary knob — production deployments rely on the platform - default so image bumps are a single env-var change on the - operator Deployment. - type: string - resources: - description: |- - Resources defines CPU/memory requests and limits for the - cosmos-exporter container. When unset, the controller applies - scrape-friendly defaults (50m CPU req, 64Mi mem req, 384Mi mem - limit, no CPU limit — CPU limits turn scrape-time pulls into - visible scrape gaps under throttling). - properties: - claims: - description: |- - Claims lists the names of resources, defined in spec.resourceClaims, - that are used by this container. - - This field depends on the - DynamicResourceAllocation feature gate. - - This field is immutable. It can only be set for containers. - items: - description: ResourceClaim references one entry in PodSpec.ResourceClaims. - properties: - name: - description: |- - Name must match the name of one entry in pod.spec.resourceClaims of - the Pod where this field is used. It makes that resource available - inside a container. 
- type: string - request: - description: |- - Request is the name chosen for a request in the referenced claim. - If empty, everything from the claim is made available, otherwise - only the result of this request. - type: string - required: - - name - type: object - type: array - x-kubernetes-list-map-keys: - - name - x-kubernetes-list-type: map - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Limits describes the maximum amount of compute resources allowed. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: |- - Requests describes the minimum amount of compute resources required. - If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, - otherwise to an implementation-defined value. Requests cannot exceed Limits. - More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - type: object - type: object - type: object + CosmosExporter runs sei-cosmos-exporter as an in-pod sidecar. + Image and resources are operator-controlled; this is a toggle. + type: boolean dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. 
From 9e65b1a1d5398a9c74bf810502daea284106b21a Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 15 May 2026 15:49:21 -0700 Subject: [PATCH 3/5] chore: gitignore .council/ session tooling (was accidentally committed in prior commit) --- .council/workstream.yaml | 127 --------------------------------------- .gitignore | 1 + 2 files changed, 1 insertion(+), 127 deletions(-) delete mode 100644 .council/workstream.yaml diff --git a/.council/workstream.yaml b/.council/workstream.yaml deleted file mode 100644 index f7b70fb..0000000 --- a/.council/workstream.yaml +++ /dev/null @@ -1,127 +0,0 @@ -workstream: - description: "ValidationRun CRD LLD — design pass for ephemeral-chain validation workloads on Harbor" - tier: component - started: "2026-04-28T20:00:00Z" - updated: "2026-04-28T22:30:00Z" - source_issue: "https://github.com/sei-protocol/sei-k8s-controller/issues/139" - companion_issue: "https://github.com/sei-protocol/platform/issues/235" - decision_posture: "design pass — implementation is separate workstreams" - -phases: - - name: "Foundation" - status: completed - completed_at: "2026-04-28T20:00:00Z" - outputs: - - ".council/workstream.yaml" - - - name: "Round 1 — Parallel specialist input gathering" - status: completed - completed_at: "2026-04-28T20:30:00Z" - outputs: - - "/tmp/round1/pm-scope-cuts.md" - - "/tmp/round1/otel-rule-semantics.md" - - "/tmp/round1/platform-integration.md" - - - name: "Round 2 — LLD draft synthesis (kubernetes-specialist)" - status: completed - completed_at: "2026-04-28T21:00:00Z" - outputs: - - "/tmp/lld-draft.md (~1300 lines, 8500 words)" - - - name: "Coral mid-council escalation — Argo Workflows pivot question" - status: completed - completed_at: "2026-04-28T22:00:00Z" - outputs: - - "Unanimous Path X (custom CRD) recommendation from all four specialists" - - "Argo un-defer triggers documented for Future Work section" - notes: | - User asked late in design pass whether to abandon custom CRD for Argo - Workflows. 
Engaged all four specialists in parallel. Unanimous: Path X. - Load-bearing reason: Argo can't natively own non-Pod CRDs cascade-style; - SND-as-cascading-child is central organizing primitive. Continuous-mode - polling makes Argo's per-step pod cost prohibitive (300 pods/run worst - case). Argo not currently deployed on Harbor — adoption is a multi-week - platform commitment. Re-open conversation on documented triggers. - - - name: "One-way-door gate — CRD field name approvals" - status: completed - completed_at: "2026-04-28T22:30:00Z" - decisions: - - "API group: validation.sei.io (separate from sei.io)" - - "Discriminator field: spec.type (not spec.kind)" - - "chain.validators + chain.fullNodes: both required, type SeiNodeDeploymentSpec" - - "endpointPolicy field dropped; fullNodes-fleet endpoints always used" - - "Reserved env vars rejected at admission via CEL XValidation (not silent override)" - - "Failed vs Error distinct phase semantics; heartbeat alert ignores Error" - - "Single Succeeded condition (Tekton-style) + TestComplete condition (workload-completion boundary)" - - "Spec immutability via CEL self == oldSelf on substantive fields" - - "query.threshold typed as string with regex CEL" - - "Mode field dropped; continuous-only; runProperties.interval ships in v1 (default 30s)" - - "runProperties.stopOnFailure implemented in v1 (default false)" - - "S3 prefix s3://harbor-validation-results/{namespace}/{job}/{runId}/ locked" - - "No Flux labels on owned children (controller invariant)" - - "Tenant pre-provisions ServiceAccounts; controller never IAM" - - "alert.ruleRef cross-namespace allowed only into label-allowlisted namespaces" - - ".status.report.raw dropped entirely; .status.report.s3Url only" - - "Workload exit codes 1/2 aligned with Phase 1 contract" - - "validation_run_terminal_total{namespace, name, verdict} cardinality exception with downgrade path" - architectural_refinements: - - "Plan task count collapsed 10 → 7 (monitor-run combines 
wait-job-terminal + resolve-rules + evaluate-rules; mark-done absorbs collect-report's S3 URL stamp)" - - "monitor-run is the central polling task — single async loop, idempotent state in .status, cooperative Job cancel on stopOnFailure" - - "Conditions[TestComplete] as first-class status condition at workload-completion boundary" - - "Per-rule status gains lastEvaluatedAt + nextEvaluationAt for idempotent polling resumption" - - - name: "LLD revision pass + capture" - status: in_progress - started_at: "2026-04-28T22:30:00Z" - plan: "kubernetes-specialist revises /tmp/lld-draft.md with gate decisions; output to /tmp/lld-final.md; /design captures into docs/design/validation-run-lld.md" - - - name: "Cross-review" - status: pending - blocked_by: "LLD captured in repo" - plan: "all four specialists sign off; resolve findings before close" - - - name: "Implementation handoff issues — companion sub-issues" - status: completed - completed_at: "2026-04-29T16:00:00Z" - outputs: - - "sei-protocol/sei-k8s-controller#144 — SND readiness includes catching_up" - - "sei-protocol/sei-k8s-controller#145 — SND admission validation rejects genesis on fullNode" - - "sei-protocol/sei-k8s-controller#146 — TaskPlan.TargetPhase generalization decision" - - "sei-protocol/sei-k8s-controller#147 — status-patch optimistic concurrency invariant" - - "sei-protocol/platform#243 — PodMonitor for sei-k8s-controller" - - "sei-protocol/platform#244 — heartbeat PrometheusRule on consecutive failures" - - "sei-protocol/platform#245 — label monitoring/ namespace with validation-shared-rules" - notes: | - The 7 companion sub-issues from the LLD's Open Dependencies section - are filed as independent workstreams. Implementation handoff issues - for the controllers themselves (types + deepcopy, reconcilers, - planner integration) will be filed when PR #143 review locks - direction. 
- - - name: "Implementation handoff — controller code" - status: pending - blocked_by: "PR #143 review feedback locks design direction" - plan: "file ~6 controller implementation issues once LLD is approved: types+deepcopy, ValidationOrchestrationReconciler, ValidationLoadGenerationReconciler, planner builders for ValidationOrchestrationPlan and ValidationLoadGenerationPlan, monitor-task-completion task, RBAC + opt-in deployment plumbing" - - - name: "Close council" - status: in_progress - plan: "verify all PR #143 review-blocking acceptance criteria from #139 are met OR explicitly tracked; archive workstream" - -outstanding_findings: [] -escalations: [] - -acceptance_criteria_from_issue_139: - - "LLD merged in docs/design/ — IN PROGRESS (PR pending)" - - "LLD answers six open questions from OSS survey — DONE" - - "LLD enumerates five one-way-door warnings and how design avoids each — DONE" - - "Cross-review sign-offs from all four specialists — PENDING" - - "Design serves both Phase 1 consumers (seiload + qa-testing) — DONE" - - "Workload contract from platform#235 referenced as authoritative spec.workload envelope — DONE" - - "Implementation handoff issues filed — PENDING" - -constraints: - - "CRD field names are one-way doors per CLAUDE.md — must clear cross-review + user approval — RESOLVED" - - "Existing planner architecture is load-bearing — reuse plan-driven reconciler pattern — DONE" - - "OSS conventions take precedence over novelty — DONE" - - "Shadow-replayer stays out — not open for re-litigation in this council — DONE" diff --git a/.gitignore b/.gitignore index 9dffc16..81c9a0b 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,4 @@ Thumbs.db # kubebuilder test assets are cached locally # but shouldn't be committed bin/k8s/ +.council/ From ba3cb9f1fd8ee603db61e92ad97a07f307f761cb Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 15 May 2026 15:58:18 -0700 Subject: [PATCH 4/5] refactor: cosmos-exporter is always-on; drop toggle + discovery label 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Legacy sei-infra ran sei-cosmos-exporter unconditionally on every seid EC2 host via install-exporters.sh. Parity: every K8s seid pod gets it too. The per-node opt-in was over-engineered for a uniform fleet concern. - Drop SeiNodeSpec.CosmosExporter bool — no user-facing toggle - Drop CosmosExporterEnabled() — inline as unconditional append - Drop monitoring.sei.io/cosmos-exporter discovery label — redundant when every seid pod has the exporter. Platform PodMonitor selects per chain via sei.io/chain (and the named container port `cosmos-metrics`) which already discriminates seid pods from other workloads - Ephemeral-chain cardinality concern is handled at the platform PodMonitor's `chain_id` allow-list, not at per-pod opt-in - SEI_COSMOS_EXPORTER_IMAGE is now an unconditional requirement on the operator Deployment — failing fast across all SeiNodes is the right blast radius when the env is unset Also: gofmt, role-value constants (roleValidator/Archive/Replayer/FullNode), and sidecarTmpMountPath constant for the /tmp mount — addresses goconst lint surface my changes touched. --- api/v1alpha1/seinode_types.go | 5 -- config/crd/sei.io_seinodedeployments.yaml | 5 -- config/crd/sei.io_seinodes.yaml | 5 -- internal/noderesource/noderesource.go | 70 ++++++++++----------- internal/noderesource/noderesource_test.go | 71 +++------------------- internal/platform/platformtest/config.go | 2 +- manifests/sei.io_seinodedeployments.yaml | 5 -- manifests/sei.io_seinodes.yaml | 5 -- 8 files changed, 42 insertions(+), 126 deletions(-) diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index 384935b..e8ad503 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -45,11 +45,6 @@ type SeiNodeSpec struct { // +optional Sidecar *SidecarConfig `json:"sidecar,omitempty"` - // CosmosExporter runs sei-cosmos-exporter as an in-pod sidecar. 
- // Image and resources are operator-controlled; this is a toggle. - // +optional - CosmosExporter bool `json:"cosmosExporter,omitempty"` - // PodLabels are additional labels merged into the StatefulSet pod template. // The controller always sets sei.io/node; these are additive and applied // first so that system labels take precedence. diff --git a/config/crd/sei.io_seinodedeployments.yaml b/config/crd/sei.io_seinodedeployments.yaml index 963ec3b..329ad8d 100644 --- a/config/crd/sei.io_seinodedeployments.yaml +++ b/config/crd/sei.io_seinodedeployments.yaml @@ -225,11 +225,6 @@ spec: description: ChainID of the chain this node belongs to. minLength: 1 type: string - cosmosExporter: - description: |- - CosmosExporter runs sei-cosmos-exporter as an in-pod sidecar. - Image and resources are operator-controlled; this is a toggle. - type: boolean dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. diff --git a/config/crd/sei.io_seinodes.yaml b/config/crd/sei.io_seinodes.yaml index d465abe..dad2a1c 100644 --- a/config/crd/sei.io_seinodes.yaml +++ b/config/crd/sei.io_seinodes.yaml @@ -87,11 +87,6 @@ spec: description: ChainID of the chain this node belongs to. minLength: 1 type: string - cosmosExporter: - description: |- - CosmosExporter runs sei-cosmos-exporter as an in-pod sidecar. - Image and resources are operator-controlled; this is a toggle. - type: boolean dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. 
diff --git a/internal/noderesource/noderesource.go b/internal/noderesource/noderesource.go index 00e44d0..0dd68e1 100644 --- a/internal/noderesource/noderesource.go +++ b/internal/noderesource/noderesource.go @@ -33,6 +33,11 @@ const ( chainLabel = "sei.io/chain" roleLabel = "sei.io/role" + roleValidator = "validator" + roleArchive = "archive" + roleReplayer = "replayer" + roleFullNode = "node" + dataDir = platform.DataDir // homeVarRef is the K8s VariableReference form of HOME, substituted from @@ -49,16 +54,16 @@ const ( // Pod-spec container names. Used as both the .Name on built containers // and the lookup key for the operator-keyring containment guard. - containerNameSeid = "seid" - containerNameSidecar = "sei-sidecar" - containerNameRBACProxy = "kube-rbac-proxy" - containerNameCosmosExporter = "cosmos-exporter" - servicePortNameAPI = "api" - rbacProxyConfigVolumeName = "rbac-proxy-config" - sidecarTLSVolumeName = "sidecar-tls" - rbacProxyConfigMountPath = "/etc/kube-rbac-proxy" - sidecarTLSMountPath = "/etc/tls" - RBACProxyPort int32 = 8443 + containerNameSeid = "seid" + containerNameSidecar = "sei-sidecar" + containerNameRBACProxy = "kube-rbac-proxy" + containerNameCosmosExporter = "cosmos-exporter" + servicePortNameAPI = "api" + rbacProxyConfigVolumeName = "rbac-proxy-config" + sidecarTLSVolumeName = "sidecar-tls" + rbacProxyConfigMountPath = "/etc/kube-rbac-proxy" + sidecarTLSMountPath = "/etc/tls" + RBACProxyPort int32 = 8443 pathHealthz = "/v0/healthz" pathLivez = "/v0/livez" @@ -81,6 +86,7 @@ const ( // sidecarTmpVolumeName backs an emptyDir at /tmp — required because the // sidecar runs with ReadOnlyRootFilesystem and Go stdlib defaults to /tmp. sidecarTmpVolumeName = "sidecar-tmp" + sidecarTmpMountPath = "/tmp" // sidecarNonRootUID is the nonroot UID/GID baked into distroless and // chainguard static-debian12 base images. 
Pod-level fsGroup matches so @@ -90,10 +96,6 @@ const ( // defaultCosmosExporterPort matches sei-cosmos-exporter's upstream // default. Platform PodMonitors target the named port `cosmos-metrics`. defaultCosmosExporterPort int32 = 9300 - - // cosmosExporterScrapeLabel is the platform PodMonitor's selector. - cosmosExporterScrapeLabel = "monitoring.sei.io/cosmos-exporter" - cosmosExporterScrapeLabelValue = "enabled" ) // PlatformConfig is an alias for platform.Config. @@ -119,7 +121,7 @@ func SelectorLabels(node *seiv1alpha1.SeiNode) map[string]string { // ResourceLabels returns labels for the StatefulSet pod template. // User-provided podLabels are applied first; system labels win. func ResourceLabels(node *seiv1alpha1.SeiNode) map[string]string { - labels := make(map[string]string, len(node.Spec.PodLabels)+4) + labels := make(map[string]string, len(node.Spec.PodLabels)+3) maps.Copy(labels, node.Spec.PodLabels) labels[NodeLabel] = node.Name if node.Spec.ChainID != "" { @@ -128,9 +130,6 @@ func ResourceLabels(node *seiv1alpha1.SeiNode) map[string]string { if role := deriveRole(node); role != "" { labels[roleLabel] = role } - if CosmosExporterEnabled(node) { - labels[cosmosExporterScrapeLabel] = cosmosExporterScrapeLabelValue - } return labels } @@ -139,13 +138,13 @@ func ResourceLabels(node *seiv1alpha1.SeiNode) map[string]string { func deriveRole(node *seiv1alpha1.SeiNode) string { switch { case node.Spec.Validator != nil: - return "validator" + return roleValidator case node.Spec.Archive != nil: - return "archive" + return roleArchive case node.Spec.Replayer != nil: - return "replayer" + return roleReplayer case node.Spec.FullNode != nil: - return "node" + return roleFullNode } return "" } @@ -487,15 +486,14 @@ func buildNodePodSpec(node *seiv1alpha1.SeiNode, p PlatformConfig) (corev1.PodSp initContainers = append(initContainers, buildRBACProxyContainer(node, p)) } spec.InitContainers = initContainers - containers := 
[]corev1.Container{buildSidecarMainContainer(node, p)} - if CosmosExporterEnabled(node) { - ceContainer, err := buildCosmosExporterContainer(p) - if err != nil { - return corev1.PodSpec{}, err - } - containers = append(containers, ceContainer) + ceContainer, err := buildCosmosExporterContainer(p) + if err != nil { + return corev1.PodSpec{}, err + } + spec.Containers = []corev1.Container{ + buildSidecarMainContainer(node, p), + ceContainer, } - spec.Containers = containers return spec, nil } @@ -540,7 +538,7 @@ func buildSidecarContainer(node *seiv1alpha1.SeiNode, p PlatformConfig) corev1.C mounts := make([]corev1.VolumeMount, 0, 2+len(keyringMounts)) mounts = append(mounts, corev1.VolumeMount{Name: "data", MountPath: dataDir}, - corev1.VolumeMount{Name: sidecarTmpVolumeName, MountPath: "/tmp"}, + corev1.VolumeMount{Name: sidecarTmpVolumeName, MountPath: sidecarTmpMountPath}, ) mounts = append(mounts, keyringMounts...) @@ -604,12 +602,6 @@ func buildSidecarMainContainer(node *seiv1alpha1.SeiNode, p PlatformConfig) core return container } -// CosmosExporterEnabled reports whether the SeiNode opts into running -// the sei-cosmos-exporter sidecar. -func CosmosExporterEnabled(node *seiv1alpha1.SeiNode) bool { - return node.Spec.CosmosExporter -} - // defaultCosmosExporterResources: no CPU limit — cosmos-exporter calls // seid's gRPC on every scrape; throttling turns into visible scrape gaps. func defaultCosmosExporterResources() corev1.ResourceRequirements { @@ -628,7 +620,7 @@ func defaultCosmosExporterResources() corev1.ResourceRequirements { // Image, args, port, and resources are fixed — no per-node knobs. 
func buildCosmosExporterContainer(p PlatformConfig) (corev1.Container, error) { if p.CosmosExporterImage == "" { - return corev1.Container{}, fmt.Errorf("SEI_COSMOS_EXPORTER_IMAGE is required on the operator Deployment when any SeiNode sets spec.cosmosExporter: true") + return corev1.Container{}, fmt.Errorf("SEI_COSMOS_EXPORTER_IMAGE is required on the operator Deployment") } return corev1.Container{ Name: containerNameCosmosExporter, @@ -648,7 +640,7 @@ func buildCosmosExporterContainer(p PlatformConfig) (corev1.Container, error) { Resources: defaultCosmosExporterResources(), // /tmp: distroless + ReadOnlyRootFilesystem EROFS insurance. VolumeMounts: []corev1.VolumeMount{ - {Name: sidecarTmpVolumeName, MountPath: "/tmp"}, + {Name: sidecarTmpVolumeName, MountPath: sidecarTmpMountPath}, }, // cosmos-exporter Fatal()s on its initial gRPC dial. Gate // startup on seid's gRPC port so we don't crash-loop until diff --git a/internal/noderesource/noderesource_test.go b/internal/noderesource/noderesource_test.go index 6ae4338..cccaa14 100644 --- a/internal/noderesource/noderesource_test.go +++ b/internal/noderesource/noderesource_test.go @@ -101,7 +101,7 @@ func TestResourceLabelsForNode_DefaultsToSystemLabels(t *testing.T) { g.Expect(labels).To(Equal(map[string]string{ NodeLabel: "snap-0", "sei.io/chain": "sei-test", - "sei.io/role": "node", + "sei.io/role": roleFullNode, })) } @@ -116,10 +116,10 @@ func TestResourceLabelsForNode_MergesPodLabels(t *testing.T) { g.Expect(labels).To(Equal(map[string]string{ NodeLabel: "snap-0", - "sei.io/chain": "sei-test", - "sei.io/role": "node", - "sei.io/nodedeployment": "my-group", - "team": "platform", + "sei.io/chain": "sei-test", + "sei.io/role": roleFullNode, + "sei.io/nodedeployment": "my-group", + "team": "platform", })) } @@ -1294,23 +1294,12 @@ func TestGenerateStatefulSet_ProductionPodSpec_PassesGuard(t *testing.T) { // --- Cosmos exporter --- -func TestCosmosExporter_AbsentByDefault(t *testing.T) { +func 
TestCosmosExporter_AlwaysPresent(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") sts := mustGenerateStatefulSet(t, node, platformtest.Config()) - g.Expect(findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter)).To(BeNil()) - g.Expect(sts.Spec.Template.Spec.Containers).To(HaveLen(1)) -} - -func TestCosmosExporter_PresentWhenOptedIn(t *testing.T) { - g := NewWithT(t) - node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = true - - sts := mustGenerateStatefulSet(t, node, platformtest.Config()) - ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) g.Expect(ce).NotTo(BeNil()) g.Expect(sts.Spec.Template.Spec.Containers).To(HaveLen(2)) @@ -1319,7 +1308,6 @@ func TestCosmosExporter_PresentWhenOptedIn(t *testing.T) { func TestCosmosExporter_DefaultImage(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) @@ -1330,7 +1318,6 @@ func TestCosmosExporter_DefaultImage(t *testing.T) { func TestCosmosExporter_PortIsFixed(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) @@ -1344,7 +1331,6 @@ func TestCosmosExporter_PortIsFixed(t *testing.T) { func TestCosmosExporter_ErrorWhenImageUnset(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = true cfg := platformtest.Config() cfg.CosmosExporterImage = "" @@ -1357,7 +1343,6 @@ func TestCosmosExporter_ErrorWhenImageUnset(t *testing.T) { func TestCosmosExporter_StartupProbeOnSeidGRPC(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter 
= true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) @@ -1372,14 +1357,13 @@ func TestCosmosExporter_StartupProbeOnSeidGRPC(t *testing.T) { func TestCosmosExporter_MountsTmpEmptyDir(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) var hasTmp bool for _, m := range ce.VolumeMounts { - if m.Name == sidecarTmpVolumeName && m.MountPath == "/tmp" { + if m.Name == sidecarTmpVolumeName && m.MountPath == sidecarTmpMountPath { hasTmp = true break } @@ -1390,7 +1374,6 @@ func TestCosmosExporter_MountsTmpEmptyDir(t *testing.T) { func TestCosmosExporter_SeiArgs(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) @@ -1405,7 +1388,6 @@ func TestCosmosExporter_SeiArgs(t *testing.T) { func TestCosmosExporter_DefaultResources(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) @@ -1429,9 +1411,9 @@ func TestResourceLabels_ChainAndRoleStampedUnconditionally(t *testing.T) { mutate func(*seiv1alpha1.SeiNode) expected string }{ - {"validator", func(n *seiv1alpha1.SeiNode) { n.Spec.Validator = &seiv1alpha1.ValidatorSpec{} }, "validator"}, - {"archive", func(n *seiv1alpha1.SeiNode) { n.Spec.Archive = &seiv1alpha1.ArchiveSpec{} }, "archive"}, - {"fullNode", func(n *seiv1alpha1.SeiNode) { n.Spec.FullNode = &seiv1alpha1.FullNodeSpec{} }, "node"}, + {"validator", func(n 
*seiv1alpha1.SeiNode) { n.Spec.Validator = &seiv1alpha1.ValidatorSpec{} }, roleValidator}, + {"archive", func(n *seiv1alpha1.SeiNode) { n.Spec.Archive = &seiv1alpha1.ArchiveSpec{} }, roleArchive}, + {"fullNode", func(n *seiv1alpha1.SeiNode) { n.Spec.FullNode = &seiv1alpha1.FullNodeSpec{} }, roleFullNode}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -1476,42 +1458,9 @@ func TestResourceLabels_NotInSelector(t *testing.T) { g.Expect(sts.Spec.Selector.MatchLabels).To(HaveLen(1)) } -func TestCosmosExporter_PodLabelAbsentByDefault(t *testing.T) { - g := NewWithT(t) - node := newSnapshotNode("ce-0", "default") - - sts := mustGenerateStatefulSet(t, node, platformtest.Config()) - - g.Expect(sts.Spec.Template.Labels).NotTo(HaveKey("monitoring.sei.io/cosmos-exporter")) -} - -func TestCosmosExporter_PodLabelPresentWhenOptedIn(t *testing.T) { - g := NewWithT(t) - node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = true - - sts := mustGenerateStatefulSet(t, node, platformtest.Config()) - - g.Expect(sts.Spec.Template.Labels).To(HaveKeyWithValue("monitoring.sei.io/cosmos-exporter", "enabled")) -} - -func TestCosmosExporter_PodLabelNotOnSelector(t *testing.T) { - g := NewWithT(t) - node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = true - - sts := mustGenerateStatefulSet(t, node, platformtest.Config()) - - // The scrape label must NOT live in the immutable StatefulSet - // selector — otherwise toggling cosmos-exporter on or off would - // require StatefulSet recreation. 
- g.Expect(sts.Spec.Selector.MatchLabels).NotTo(HaveKey("monitoring.sei.io/cosmos-exporter")) -} - func TestCosmosExporter_NonRootSecurityContext(t *testing.T) { g := NewWithT(t) node := newSnapshotNode("ce-0", "default") - node.Spec.CosmosExporter = true sts := mustGenerateStatefulSet(t, node, platformtest.Config()) ce := findContainer(sts.Spec.Template.Spec.Containers, containerNameCosmosExporter) diff --git a/internal/platform/platformtest/config.go b/internal/platform/platformtest/config.go index a43891c..fdc97a4 100644 --- a/internal/platform/platformtest/config.go +++ b/internal/platform/platformtest/config.go @@ -33,7 +33,7 @@ func Config() platform.Config { KubeRBACProxyImage: "quay.io/brancz/kube-rbac-proxy:v0.19.1", // Arbitrary fixture; not authoritative. Production digest is set // via SEI_SIDECAR_IMAGE in the platform repo's controller Deployment. - SidecarImage: "ghcr.io/sei-protocol/seictl@sha256:a2af4e1b8ed4c12661a3c98cce050bae3f292cc7560abc2ba98fd7dfc80d9be5", + SidecarImage: "ghcr.io/sei-protocol/seictl@sha256:a2af4e1b8ed4c12661a3c98cce050bae3f292cc7560abc2ba98fd7dfc80d9be5", CosmosExporterImage: "ghcr.io/sei-protocol/sei-cosmos-exporter@sha256:0000000000000000000000000000000000000000000000000000000000000000", } } diff --git a/manifests/sei.io_seinodedeployments.yaml b/manifests/sei.io_seinodedeployments.yaml index 963ec3b..329ad8d 100644 --- a/manifests/sei.io_seinodedeployments.yaml +++ b/manifests/sei.io_seinodedeployments.yaml @@ -225,11 +225,6 @@ spec: description: ChainID of the chain this node belongs to. minLength: 1 type: string - cosmosExporter: - description: |- - CosmosExporter runs sei-cosmos-exporter as an in-pod sidecar. - Image and resources are operator-controlled; this is a toggle. - type: boolean dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. 
diff --git a/manifests/sei.io_seinodes.yaml b/manifests/sei.io_seinodes.yaml index d465abe..dad2a1c 100644 --- a/manifests/sei.io_seinodes.yaml +++ b/manifests/sei.io_seinodes.yaml @@ -87,11 +87,6 @@ spec: description: ChainID of the chain this node belongs to. minLength: 1 type: string - cosmosExporter: - description: |- - CosmosExporter runs sei-cosmos-exporter as an in-pod sidecar. - Image and resources are operator-controlled; this is a toggle. - type: boolean dataVolume: description: |- DataVolume configures the data PersistentVolumeClaim for this node. From 2fdd8484c211e009455f0f39ad29b0b30e8e3667 Mon Sep 17 00:00:00 2001 From: bdchatham Date: Fri, 15 May 2026 16:01:51 -0700 Subject: [PATCH 5/5] =?UTF-8?q?chore:=20fix=20goconst=20lint=20=E2=80=94?= =?UTF-8?q?=20use=20node.Name=20in=20label-test=20assertions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI lint (only-new-issues) flagged the snap-0 literal in TestResourceLabelsForNode_* assertions as a new goconst regression. Reference node.Name from the fixture instead — cleaner anyway. --- internal/noderesource/noderesource_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/noderesource/noderesource_test.go b/internal/noderesource/noderesource_test.go index cccaa14..709c58b 100644 --- a/internal/noderesource/noderesource_test.go +++ b/internal/noderesource/noderesource_test.go @@ -99,7 +99,7 @@ func TestResourceLabelsForNode_DefaultsToSystemLabels(t *testing.T) { // newSnapshotNode sets ChainID="sei-test" + FullNode mode, so chain // + role labels are stamped alongside sei.io/node. 
g.Expect(labels).To(Equal(map[string]string{ - NodeLabel: "snap-0", + NodeLabel: node.Name, "sei.io/chain": "sei-test", "sei.io/role": roleFullNode, })) @@ -115,7 +115,7 @@ func TestResourceLabelsForNode_MergesPodLabels(t *testing.T) { labels := ResourceLabels(node) g.Expect(labels).To(Equal(map[string]string{ - NodeLabel: "snap-0", + NodeLabel: node.Name, "sei.io/chain": "sei-test", "sei.io/role": roleFullNode, "sei.io/nodedeployment": "my-group",