From 8f4fe8e7e2c80368a8d40c3a1ce8f9eedfaf4076 Mon Sep 17 00:00:00 2001 From: Heeyoung Jung Date: Mon, 8 Jun 2026 12:51:31 -0700 Subject: [PATCH] Resolve concurrency problem causing skipping taint by surfacing error and retry with SQS Co-authored-by: mcornea mcornea@redhat.com --- pkg/interruptionevent/draincordon/handler.go | 13 ++++++++++++- pkg/interruptionevent/internal/common/handler.go | 3 ++- pkg/monitor/sqsevent/spot-itn-event.go | 2 +- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pkg/interruptionevent/draincordon/handler.go b/pkg/interruptionevent/draincordon/handler.go index a36388bf8..6337a6690 100644 --- a/pkg/interruptionevent/draincordon/handler.go +++ b/pkg/interruptionevent/draincordon/handler.go @@ -81,7 +81,18 @@ func (h *Handler) HandleEvent(drainEvent *monitor.InterruptionEvent) error { } if drainEvent.PreDrainTask != nil { - h.commonHandler.RunPreDrainTask(nodeName, drainEvent) + if err := h.commonHandler.RunPreDrainTask(nodeName, drainEvent); err != nil { + log.Err(err).Str("nodeName", nodeName).Msg("Pre-drain task failed; aborting to allow SQS retry") + h.commonHandler.InterruptionEventStore.CancelInterruptionEvent(drainEvent.EventID) + + // If the node is missing and the user opted for DeleteSqsMsgIfNodeNotFound then delete the SQS message + if !nodeFound && h.commonHandler.NthConfig.DeleteSqsMsgIfNodeNotFound && drainEvent.PostDrainTask != nil { + h.commonHandler.RunPostDrainTask(nodeName, drainEvent) + return nil + } + + return err + } } podNameList, err := h.commonHandler.Node.FetchPodNameList(nodeName) diff --git a/pkg/interruptionevent/internal/common/handler.go b/pkg/interruptionevent/internal/common/handler.go index 023289da5..50a64217d 100644 --- a/pkg/interruptionevent/internal/common/handler.go +++ b/pkg/interruptionevent/internal/common/handler.go @@ -44,7 +44,7 @@ func (h *Handler) GetNodeName(drainEvent *monitor.InterruptionEvent) (string, er return nodeName, nil } -func (h *Handler) RunPreDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) { +func (h *Handler) RunPreDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) error { err := drainEvent.PreDrainTask(*drainEvent, h.Node) if err != nil { log.Err(err).Msg("There was a problem executing the pre-drain task") @@ -53,6 +53,7 @@ func (h *Handler) RunPreDrainTask(nodeName string, drainEvent *monitor.Interrupt h.Recorder.Emit(nodeName, observability.Normal, observability.PreDrainReason, observability.PreDrainMsg) } h.Metrics.NodeActionsInc("pre-drain", nodeName, drainEvent.EventID, err) + return err } func (h *Handler) RunCancelDrainTask(nodeName string, drainEvent *monitor.InterruptionEvent) { diff --git a/pkg/monitor/sqsevent/spot-itn-event.go b/pkg/monitor/sqsevent/spot-itn-event.go index a5f21e1e0..4be877947 100644 --- a/pkg/monitor/sqsevent/spot-itn-event.go +++ b/pkg/monitor/sqsevent/spot-itn-event.go @@ -96,7 +96,7 @@ func (m SQSMonitor) spotITNTerminationToInterruptionEvent(event *EventBridgeEven if err != nil { log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.SpotInterruptionTaint, interruptionEvent.EventID) } - return nil + return err } return &interruptionEvent, nil }