From 172af74d16eeda4c77a7bee51c1e12604b9be4a0 Mon Sep 17 00:00:00 2001
From: Luigi Toscano
Date: Fri, 29 May 2026 09:22:40 +0200
Subject: [PATCH] [ci_dcn_site] Add retry logic to Nova aggregate creation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add retry logic (10 attempts, 30s delay) to the Nova aggregate creation
task to handle transient MessageDeliveryFailure errors during RabbitMQ
restarts or queue rebalancing.

This aligns with the existing defensive coding pattern used throughout
the ci_dcn_site role, where similar k8s_exec and Kubernetes API
operations already include retry logic (see pre-ceph.yml, post-ceph.yml,
etc.).

Root cause: DataPlaneDeployment triggers RabbitMQ queue rebalance during
DCN deployment, causing rolling restarts. Nova aggregate creation can
fail with MessageDeliveryFailure if attempted during this window.

This patch provides reactive recovery through retries. Total retry time
is up to 5 minutes (10 × 30s), which covers typical RabbitMQ restart
windows observed in CI.

Co-Authored-By: Claude Sonnet 4.5
Related-Issue: DCN deployment failure with MessageDeliveryFailure
Signed-off-by: Luigi Toscano
---
 roles/ci_dcn_site/tasks/az.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/roles/ci_dcn_site/tasks/az.yml b/roles/ci_dcn_site/tasks/az.yml
index 0909f1dcc..ef4982819 100644
--- a/roles/ci_dcn_site/tasks/az.yml
+++ b/roles/ci_dcn_site/tasks/az.yml
@@ -32,6 +32,10 @@
          | list }}
 
 - name: Create AZ if it does not exist
+  register: aggregate_create
+  retries: 10
+  delay: 30
+  until: aggregate_create.rc == 0
   when:
     - az_hosts.rc == 1
   kubernetes.core.k8s_exec:
@@ -40,6 +44,7 @@
     pod: openstackclient
     command: >-
       openstack aggregate create {{ _az }} --zone {{ _az }}
+  failed_when: aggregate_create.rc != 0 and aggregate_create.attempts >= 10
 
 - name: Add only the missing edpm hosts to AZ
   ignore_errors: true