From 37d554e875d42ba1f2a5fa4758632975f4ec0fe3 Mon Sep 17 00:00:00 2001 From: gais-ameer-rh Date: Fri, 15 May 2026 15:42:18 +0530 Subject: [PATCH 01/22] [DZ-Storage] Add tests for cinderBackups spec.cinder.template.cinderBackup (singular) in DZ-Storage DT is replaced with cinderBackups (plural) to deploy multiple cinder backups based on AZ topology. hooks/playbooks/dz_storage_cinder_backups.yaml validates the behaviour of cinderBackups. The playbook tests different scenarios of cinder backup creation and restoring the backups across availability zones. Jira: OSPRH-28342 Signed-off-by: Gais Ameer Co-Authored-By: Claude Sonnet 4.5 --- .../playbooks/dz_storage_cinder_backups.yaml | 201 ++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 hooks/playbooks/dz_storage_cinder_backups.yaml diff --git a/hooks/playbooks/dz_storage_cinder_backups.yaml b/hooks/playbooks/dz_storage_cinder_backups.yaml new file mode 100644 index 000000000..7dbcec0eb --- /dev/null +++ b/hooks/playbooks/dz_storage_cinder_backups.yaml @@ -0,0 +1,201 @@ +--- +# Test Cinder backup and restore capabilities across availability zones in dz-storage DT +# Tests three scenarios: +# 1. AZ1 backs up to AZ1 and restores to AZ1 +# 2. AZ1 backs up to AZ2 and restores to AZ1 +# 3. 
AZ1 backs up to AZ2 and restores to AZ2 + +- name: Test Cinder backup and restore across availability zones + hosts: "{{ cifmw_target_host | default('localhost') }}" + environment: + KUBECONFIG: "{{ cifmw_openshift_kubeconfig | default('/home/' + ansible_user | default('zuul') + '/.kube/config') }}" + PATH: "{{ cifmw_path | default(ansible_env.PATH) }}" + tasks: + # ================================================================================== + # Scenario 1: AZ1 backs up to AZ1 and restores to AZ1 + # ================================================================================== + - name: "Scenario 1: Create volume in AZ1" + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume create --size 1 vol-az1 --availability-zone az1 -f value -c id + register: vol_az1_id + + - name: Wait for volume vol-az1 to become available + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume show vol-az1 -f value -c status + register: vol_az1_status + until: "'available' in vol_az1_status.stdout" + retries: 60 + delay: 5 + + - name: "Scenario 1: Create backup for volume in AZ1" + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack --os-volume-api-version 3.51 volume backup create + --availability-zone az1 + --name vol-az1-backup-az1 + vol-az1 + -f value -c id + register: vol_az1_backup_az1_id + + - name: Wait for backup vol-az1-backup-az1 to become available + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume backup show vol-az1-backup-az1 -f value -c status + register: vol_az1_backup_az1_status + until: "'available' in vol_az1_backup_az1_status.stdout" + retries: 120 + delay: 10 + + - name: Verify backup vol-az1-backup-az1 is in AZ1 + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume backup 
show vol-az1-backup-az1 -f value -c availability_zone + register: vol_az1_backup_az1_zone + failed_when: "'az1' not in vol_az1_backup_az1_zone.stdout" + + - name: "Scenario 1: Restore volume backup to AZ1" + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack --os-volume-api-version 3.47 volume create + --backup vol-az1-backup-az1 + --availability-zone az1 + vol-az1-restore-az1 + -f value -c id + register: vol_az1_restore_az1_id + + - name: Wait for restored volume vol-az1-restore-az1 to become available + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume show vol-az1-restore-az1 -f value -c status + register: vol_az1_restore_az1_status + until: "'available' in vol_az1_restore_az1_status.stdout" + retries: 120 + delay: 10 + + - name: Verify restored volume vol-az1-restore-az1 is in AZ1 + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume show vol-az1-restore-az1 -f value -c availability_zone + register: vol_az1_restore_az1_zone + failed_when: "'az1' not in vol_az1_restore_az1_zone.stdout" + + # ================================================================================== + # Scenario 2: AZ1 backs up to AZ2 and restores to AZ1 + # ================================================================================== + - name: "Scenario 2: Create backup for volume in AZ2" + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack --os-volume-api-version 3.51 volume backup create + --availability-zone az2 + --name vol-az1-backup-az2 + vol-az1 + -f value -c id + register: vol_az1_backup_az2_id + + - name: Wait for backup vol-az1-backup-az2 to become available + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume backup show vol-az1-backup-az2 -f value -c status + register: 
vol_az1_backup_az2_status + until: "'available' in vol_az1_backup_az2_status.stdout" + retries: 120 + delay: 10 + + - name: Verify backup vol-az1-backup-az2 is in AZ2 + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume backup show vol-az1-backup-az2 -f value -c availability_zone + register: vol_az1_backup_az2_zone + failed_when: "'az2' not in vol_az1_backup_az2_zone.stdout" + + - name: "Scenario 2: Restore backup from AZ2 back to AZ1" + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack --os-volume-api-version 3.47 volume create + --backup vol-az1-backup-az2 + --availability-zone az1 + vol-az1-backup-az2-restore-az1 + -f value -c id + register: vol_az1_backup_az2_restore_az1_id + + - name: Wait for restored volume vol-az1-backup-az2-restore-az1 to become available + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume show vol-az1-backup-az2-restore-az1 -f value -c status + register: vol_az1_backup_az2_restore_az1_status + until: "'available' in vol_az1_backup_az2_restore_az1_status.stdout" + retries: 120 + delay: 10 + + - name: Verify restored volume vol-az1-backup-az2-restore-az1 is in AZ1 + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume show vol-az1-backup-az2-restore-az1 -f value -c availability_zone + register: vol_az1_backup_az2_restore_az1_zone + failed_when: "'az1' not in vol_az1_backup_az2_restore_az1_zone.stdout" + + # ================================================================================== + # Scenario 3: AZ1 backs up to AZ2 and restores to AZ2 + # ================================================================================== + # Reusing the volume backup vol-az1-backup-az2 created in previous scenario + - name: "Scenario 3: Restore backup from AZ2 to AZ2" + ansible.builtin.command: >- + oc rsh + -n {{ 
cifmw_openstack_namespace }} + openstackclient + openstack --os-volume-api-version 3.47 volume create + --backup vol-az1-backup-az2 + --availability-zone az2 + vol-az1-restore-az2 + -f value -c id + register: vol_az1_restore_az2_id + + - name: Wait for restored volume vol-az1-restore-az2 to become available + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume show vol-az1-restore-az2 -f value -c status + register: vol_az1_restore_az2_status + until: "'available' in vol_az1_restore_az2_status.stdout" + retries: 120 + delay: 10 + + - name: Verify restored volume vol-az1-restore-az2 is in AZ2 + ansible.builtin.command: >- + oc rsh + -n {{ cifmw_openstack_namespace }} + openstackclient + openstack volume show vol-az1-restore-az2 -f value -c availability_zone + register: vol_az1_restore_az2_zone + failed_when: "'az2' not in vol_az1_restore_az2_zone.stdout" From 3f711d34d8f1242d26b89e644496b394a3741fdd Mon Sep 17 00:00:00 2001 From: John Fulton Date: Thu, 7 May 2026 12:17:34 -0400 Subject: [PATCH 02/22] [cephx_key] Add aes256k cipher support Add an optional `cipher` parameter (choices: aes, aes256k; default: aes) to the `cephx_key` Ansible module so CI jobs can generate AES-256k (32-byte, type=2) CephX keys. - Refactor __create_cephx_key() to accept cipher argument; use key_type=2 and os.urandom(32) for aes256k, key_type=1 and os.urandom(16) for aes (default, backward compatible). - Update DOCUMENTATION, EXAMPLES and RETURN docstrings. - Update the "Generate a cephx key" task in hooks/playbooks/ceph.yml to pass `cipher: "{{ cifmw_ceph_key_cipher | default('aes') }}"`, allowing scenarios to opt in via a single variable. - Add tests/unit/modules/test_cephx_key.py with 8 tests covering both cipher modes, invalid input, base64 validity, and key randomness. 
Jira: OSPRH-29667 Signed-off-by: John Fulton Co-Authored-By: Claude Sonnet 4.6 --- hooks/playbooks/ceph.yml | 1 + plugins/modules/cephx_key.py | 56 +++++++++-- tests/unit/modules/test_cephx_key.py | 136 +++++++++++++++++++++++++++ 3 files changed, 183 insertions(+), 10 deletions(-) create mode 100644 tests/unit/modules/test_cephx_key.py diff --git a/hooks/playbooks/ceph.yml b/hooks/playbooks/ceph.yml index 945be9f1f..13f5ce610 100644 --- a/hooks/playbooks/ceph.yml +++ b/hooks/playbooks/ceph.yml @@ -353,6 +353,7 @@ - name: Generate a cephx key cephx_key: + cipher: "{{ cifmw_ceph_key_cipher | default('aes') }}" register: cephx no_log: "{{ cifmw_nolog | default(true) | bool }}" diff --git a/plugins/modules/cephx_key.py b/plugins/modules/cephx_key.py index d84cff29e..da7f5946b 100644 --- a/plugins/modules/cephx_key.py +++ b/plugins/modules/cephx_key.py @@ -14,17 +14,38 @@ short_description: Generate a random CephX authentication key description: -- Generate a random CephX authentication key and return it +- Generate a random CephX authentication key and return it. +- Supports AES-128 (default, type=1, 16-byte key) and AES-256k (type=2, 32-byte key) ciphers. + +options: + cipher: + description: + - The cipher to use when generating the CephX key. + - Use C(aes) for AES-128 (16-byte key, 40-char base64, type=1). This is the default. + - Use C(aes256k) for AES-256k (32-byte key, 60-char base64, type=2). 
+ type: str + default: aes + choices: [aes, aes256k] author: - John Fulton (@fultonj) """ EXAMPLES = r""" -- name: Generate a cephx key +- name: Generate a cephx key (AES-128, backward compatible default) cifmw.general.cephx_key: register: cephx +- name: Generate a cephx key with explicit AES-128 cipher + cifmw.general.cephx_key: + cipher: aes + register: cephx + +- name: Generate a cephx key with AES-256k cipher + cifmw.general.cephx_key: + cipher: aes256k + register: cephx + - name: Show cephx key debug: msg: "{{ cephx.key }}" @@ -32,11 +53,14 @@ RETURN = r""" key: - description: A random cephx authentication key - type: dict + description: + - A random CephX authentication key encoded as base64. + - AES-128 keys are 40 characters long (ending with ==). + - AES-256k keys are 60 characters long (ending with =). + type: str returned: success sample: - - KEY: AQC+vYNXgDAgAhAAc8UoYt+OTz5uhV7ItLdwUw== + - AQC+vYNXgDAgAhAAc8UoYt+OTz5uhV7ItLdwUw== """ @@ -47,21 +71,33 @@ import time -def __create_cephx_key(): +def __create_cephx_key(cipher="aes"): # NOTE(fultonj): Taken from # https://github.com/ceph/ceph-deploy/blob/master/ceph_deploy/new.py#L21 - key = os.urandom(16) - header = struct.pack(" Date: Thu, 7 May 2026 14:57:10 +0200 Subject: [PATCH 03/22] [reproducer] Wait for controller-0 SSH before configuring After controller-0 reboots, SSH may become transiently unreachable while background system initialization completes. The configure_controller block delegates tasks to controller-0 without verifying connectivity first, causing intermittent UNREACHABLE failures on the install_ca task. Add wait_for_connection at the start of the delegate_to block to ensure controller-0 is fully accessible before proceeding. 
Signed-off-by: Miguel Angel Nieto Jimenez Co-Authored-By: Claude Opus 4.6 (1M context) --- roles/reproducer/tasks/configure_controller.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/roles/reproducer/tasks/configure_controller.yml b/roles/reproducer/tasks/configure_controller.yml index 29b33814c..8347dbcb4 100644 --- a/roles/reproducer/tasks/configure_controller.yml +++ b/roles/reproducer/tasks/configure_controller.yml @@ -29,6 +29,11 @@ default('zuul') }} block: + - name: Wait for controller-0 SSH to be ready + ansible.builtin.wait_for_connection: + timeout: 300 + sleep: 5 + - name: Ensure directories exist ansible.builtin.file: path: "{{ cifmw_reproducer_controller_basedir }}/{{ item }}" From 2e487e9b6f6620ab65fd90028f8006e7067400a9 Mon Sep 17 00:00:00 2001 From: Daniel Pawlik Date: Mon, 11 May 2026 14:17:59 +0200 Subject: [PATCH 04/22] [build_containers] Fix podman_image push dest for FQDN image names The containers-built.log contains a list of images as registry/namespace/repository. The containers.podman.podman_image module treats a two-part dest (host/namespace) as a prefix and appends the entire local image name, which produced nested repo names on quay.rdoproject.org. Pass a complete dest (registry, namespace, repository basename, and tag) so Podman pushes to the intended repository. This commit also includes changes that avoid using {{ item }} when a loop is used and that drop the `cat` command in front of `grep` - we can call `grep` directly on the file without `cat`. One more change in this commit renames the tasks that contained the wrong script name, which caused confusion. 
Signed-off-by: Daniel Pawlik --- roles/build_containers/tasks/main.yml | 4 ++-- roles/build_containers/tasks/tag.yml | 31 ++++++++++++++++----------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/roles/build_containers/tasks/main.yml b/roles/build_containers/tasks/main.yml index f308f68e4..d34473b16 100644 --- a/roles/build_containers/tasks/main.yml +++ b/roles/build_containers/tasks/main.yml @@ -27,14 +27,14 @@ - name: Install tcib ansible.builtin.import_tasks: install.yml -- name: Generate container_build.sh script +- name: Generate build_containers.sh script ansible.builtin.template: src: templates/build_containers.sh.j2 dest: "{{ cifmw_build_containers_basedir }}/artifacts/build_containers.sh" mode: "0777" force: true -- name: Run container_build.sh script +- name: Run build_containers.sh script when: not cifmw_build_containers_run_hotfix | bool become: true environment: diff --git a/roles/build_containers/tasks/tag.yml b/roles/build_containers/tasks/tag.yml index eacc8bab5..9aa4b924f 100644 --- a/roles/build_containers/tasks/tag.yml +++ b/roles/build_containers/tasks/tag.yml @@ -16,13 +16,15 @@ - name: Ensure directories are present ansible.builtin.file: - path: "{{ cifmw_build_containers_basedir }}/{{ item }}" + path: "{{ cifmw_build_containers_basedir }}/{{ created_dir }}" state: directory mode: "0755" loop: - tmp - artifacts - logs + loop_control: + loop_var: created_dir - name: Make sure authfile exists when: @@ -56,8 +58,7 @@ ansible.builtin.shell: cmd: >- set -o pipefail; - cat {{ cifmw_build_containers_basedir }}/logs/containers-built.log | - grep {{ cifmw_build_containers_container_name_prefix }} | + grep {{ cifmw_build_containers_container_name_prefix }} {{ cifmw_build_containers_basedir }}/logs/containers-built.log | awk '{ print $1 }' register: built_images_from_file @@ -65,8 +66,7 @@ ansible.builtin.shell: cmd: >- set -o pipefail; - cat {{ cifmw_build_containers_basedir }}/logs/containers-built.log | - grep {{ 
cifmw_build_containers_container_name_prefix }} | + grep {{ cifmw_build_containers_container_name_prefix }} {{ cifmw_build_containers_basedir }}/logs/containers-built.log | awk '{ print $2 }' | head -n 1 register: images_tag_from_file @@ -81,23 +81,28 @@ - name: Pull images returned in built_images containers.podman.podman_image: - name: "{{ item }}" + name: "{{ loop_img_name }}" tag: "{{ images_tag }}" loop: "{{ built_images }}" + loop_control: + loop_var: loop_img_name - name: Retag the images with new tag containers.podman.podman_tag: - image: "{{ item }}:{{ images_tag }}" + image: "{{ loop_img_name }}:{{ images_tag }}" target_names: - - "{{ item }}:{{ cifmw_build_containers_tag_string }}" + - "{{ loop_img_name }}:{{ cifmw_build_containers_tag_string | trim }}" loop: "{{ built_images }}" + loop_control: + loop_var: loop_img_name - name: Push images to registry with new tag containers.podman.podman_image: - name: "{{ item }}" - push_args: - dest: "{{ cifmw_build_containers_push_registry }}/{{ cifmw_build_containers_registry_namespace }}" - tag: "{{ cifmw_build_containers_tag_string }}" - pull: false + name: "{{ loop_img_name }}" + tag: "{{ cifmw_build_containers_tag_string | trim }}" push: true + push_args: + dest: "{{ cifmw_build_containers_push_registry }}/{{ cifmw_build_containers_registry_namespace }}/{{ loop_img_name | basename }}:{{ cifmw_build_containers_tag_string | trim }}" loop: "{{ built_images }}" + loop_control: + loop_var: loop_img_name From 72b71c18234bde2645c040336446be548b7ecc9d Mon Sep 17 00:00:00 2001 From: Jan Jasek Date: Tue, 28 Apr 2026 12:56:38 +0200 Subject: [PATCH 05/22] [test_operator] Add additional parameter for horizontest This commit adds an extra parameter for horizontest in order to modify the projecttext xpath based on the upstream and downstream dashboard themes. 
Signed-off-by: Jan Jasek --- roles/test_operator/README.md | 1 + roles/test_operator/defaults/main.yml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/roles/test_operator/README.md b/roles/test_operator/README.md index 9983b9e18..bddba3ca7 100644 --- a/roles/test_operator/README.md +++ b/roles/test_operator/README.md @@ -166,6 +166,7 @@ Default value: {} * `cifmw_test_operator_horizontest_password`: (String) The password for the user running the Horizon tests. Default value: `horizontest` * `cifmw_test_operator_horizontest_project_name`: (String) The name of the OpenStack project for Horizon tests. Default value: `horizontest` * `cifmw_test_operator_horizontest_project_name_xpath`: (String) The xpath to select project name based on dashboard theme. Default value: `//span[@class='rcueicon rcueicon-folder-open']/ancestor::li` +* `cifmw_test_operator_horizontest_project_text_xpath`: (String) The xpath to the element displaying the current project, based on dashboard theme. Default value: `.//span[@class='rcueicon rcueicon-folder-open']/ancestor::li` * `cifmw_test_operator_horizontest_registry`: (String) The registry where to pull horizontest container. Default value: `{{ cifmw_test_operator_default_registry }}` * `cifmw_test_operator_horizontest_repo_url`: (String) The Horizon tests repository URL. Default value: `https://review.opendev.org/openstack/horizon` * `cifmw_test_operator_horizontest_resources`: (Dict) A dictionary that specifies resources (cpu, memory) for the test pods. When kept untouched it defaults to the resource limits specified on the test-operator side. 
Default value: `{}` diff --git a/roles/test_operator/defaults/main.yml b/roles/test_operator/defaults/main.yml index c0a818ee5..878fcf9f7 100644 --- a/roles/test_operator/defaults/main.yml +++ b/roles/test_operator/defaults/main.yml @@ -300,6 +300,7 @@ cifmw_test_operator_horizontest_debug: false cifmw_test_operator_horizontest_horizon_test_dir: "/var/lib/horizontest" cifmw_test_operator_horizontest_extra_flag: "not pagination" cifmw_test_operator_horizontest_project_name_xpath: "//span[@class='rcueicon rcueicon-folder-open']/ancestor::li" +cifmw_test_operator_horizontest_project_text_xpath: ".//span[@class='rcueicon rcueicon-folder-open']/ancestor::li" cifmw_test_operator_horizontest_config: apiVersion: test.openstack.org/v1beta1 kind: HorizonTest @@ -326,6 +327,7 @@ cifmw_test_operator_horizontest_config: debug: "{{ stage_vars_dict.cifmw_test_operator_horizontest_debug }}" extraFlag: "{{ stage_vars_dict.cifmw_test_operator_horizontest_extra_flag }}" projectNameXpath: "{{ stage_vars_dict.cifmw_test_operator_horizontest_project_name_xpath }}" + projectTextXpath: "{{ stage_vars_dict.cifmw_test_operator_horizontest_project_text_xpath }}" horizonTestDir: "{{ stage_vars_dict.cifmw_test_operator_horizontest_horizon_test_dir }}" extraMounts: "{{ stage_vars_dict.cifmw_test_operator_horizontest_extra_mounts | default(omit) }}" resources: "{{ stage_vars_dict.cifmw_test_operator_horizontest_resources | default(omit) }}" From c4ad57c83f223e22bef5742705cc5e148b952267 Mon Sep 17 00:00:00 2001 From: Roberto Alfieri Date: Mon, 11 May 2026 14:58:40 +0200 Subject: [PATCH 06/22] [pcp_metrics] Make target hosts configurable via cifmw_pcp_metrics_hosts PCP hook playbooks hardcode hosts: all,!localhost, which causes ansible-playbook to exit rc=2 when compute nodes are unreachable (e.g. not yet provisioned or already torn down). This kills the CI job for a metrics side-effect. 
Introduce cifmw_pcp_metrics_hosts (default: all,!localhost) so that downstream jobs can exclude host groups that are expected to be unavailable. By not targeting those hosts at all, Ansible never encounters UNREACHABLE errors and exits cleanly. Co-Authored-By: Claude Signed-off-by: Roberto Alfieri --- hooks/playbooks/pcp-metrics-post.yml | 2 +- hooks/playbooks/pcp-metrics-pre.yml | 2 +- roles/pcp_metrics/defaults/main.yaml | 3 +++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/hooks/playbooks/pcp-metrics-post.yml b/hooks/playbooks/pcp-metrics-post.yml index 5423fa406..d188696e8 100644 --- a/hooks/playbooks/pcp-metrics-post.yml +++ b/hooks/playbooks/pcp-metrics-post.yml @@ -7,7 +7,7 @@ # The best place to call this hook is under post_tests actions. # - name: Collect performance metrics - hosts: all,!localhost + hosts: "{{ cifmw_pcp_metrics_hosts }}" gather_facts: false tasks: - name: Gather metrics diff --git a/hooks/playbooks/pcp-metrics-pre.yml b/hooks/playbooks/pcp-metrics-pre.yml index c2b007e32..15240aa71 100644 --- a/hooks/playbooks/pcp-metrics-pre.yml +++ b/hooks/playbooks/pcp-metrics-pre.yml @@ -24,7 +24,7 @@ tasks_from: repo - name: Start collecting performance metrics - hosts: all,!localhost + hosts: "{{ cifmw_pcp_metrics_hosts }}" gather_facts: false tasks: - name: Setup PCP diff --git a/roles/pcp_metrics/defaults/main.yaml b/roles/pcp_metrics/defaults/main.yaml index 1a9b61667..d8dd90545 100644 --- a/roles/pcp_metrics/defaults/main.yaml +++ b/roles/pcp_metrics/defaults/main.yaml @@ -4,6 +4,9 @@ pcp_metrics_setup: false pcp_metrics_gather: false pcp_metrics_plot: false +# Host pattern for PCP hook playbooks (setup and gather) +cifmw_pcp_metrics_hosts: "all,!localhost" + # Setup-related variables pcp_metrics_packages: - pcp # for pmlogger From 10d1c27921fc4c998bc6579482c0670ae6f3a0ce Mon Sep 17 00:00:00 2001 From: Miguel Angel Nieto Jimenez Date: Fri, 8 May 2026 12:59:29 +0200 Subject: [PATCH 07/22] [ci_gen_kustomize_values] fix: Preserve 
nodes config from architecture repo in all NFV templates Extend the fix from fffa7217 (HCI template) to all remaining NFV templates. Preserve the complete node configuration (ansibleHost, networks, fixedIP) from the architecture repository instead of overwriting it with just hostName. Affected templates: ovs-dpdk, ovs-dpdk-sriov, ovs-dpdk-sriov-ipv6, sriov, ovs-dpdk-sriov-2nodesets, ovs-dpdk-sriov-ipv6-2nodesets, and ovs-dpdk-sriov-networker. Signed-off-by: Miguel Angel Nieto Jimenez Co-Authored-By: Claude Opus 4.6 (1M context) --- .../edpm-nodeset-values/values.yaml.j2 | 3 ++- .../edpm-nodeset-values/values.yaml.j2 | 6 ++++-- .../edpm-nodeset2-values/values.yaml.j2 | 6 ++++-- .../edpm-nodeset-values/values.yaml.j2 | 6 ++++-- .../edpm-nodeset2-values/values.yaml.j2 | 6 ++++-- .../ovs-dpdk-sriov-ipv6/edpm-nodeset-values/values.yaml.j2 | 6 ++++-- .../edpm-common-nodeset-values/values.yaml.j2 | 6 ++++-- .../ovs-dpdk-sriov/edpm-nodeset-values/values.yaml.j2 | 6 ++++-- .../templates/ovs-dpdk/edpm-nodeset-values/values.yaml.j2 | 6 ++++-- .../templates/sriov/edpm-nodeset-values/values.yaml.j2 | 6 ++++-- 10 files changed, 38 insertions(+), 19 deletions(-) diff --git a/roles/ci_gen_kustomize_values/templates/nfv-ovs-dpdk-sriov-hci/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/nfv-ovs-dpdk-sriov-hci/edpm-nodeset-values/values.yaml.j2 index 9f46f5c6b..879d949ff 100644 --- a/roles/ci_gen_kustomize_values/templates/nfv-ovs-dpdk-sriov-hci/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/nfv-ovs-dpdk-sriov-hci/edpm-nodeset-values/values.yaml.j2 @@ -32,8 +32,9 @@ data: nodes: {% for instance in instances_names %} {% set node_name = 'edpm-' + instance %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} {{ node_name }}: -{{ _original_nodes[node_name] | default({'hostName': instance}) | to_nice_yaml(indent=2) | indent(8, first=true) }} +{{ node_config | 
to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} {% if ('repo-setup' not in (_original_nodeset['services'] | default([]))) and diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset-values/values.yaml.j2 index 3ddf4b7dc..9b75cc7e7 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset-values/values.yaml.j2 @@ -44,8 +44,10 @@ data: {% endif %} nodes: {% for instance in instances_names %} - edpm-{{ instance }}: - hostName: {{ instance }} +{% set node_name = 'edpm-' + instance %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} + {{ node_name }}: +{{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} {% if ('repo-setup' not in (_original_nodeset['services'] | default([]))) and diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset2-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset2-values/values.yaml.j2 index 6e2109bc5..8e6c5b7bb 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset2-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset2-values/values.yaml.j2 @@ -44,8 +44,10 @@ data: {% endif %} nodes: {% for instance in instances_names %} - edpm-{{ instance }}: - hostName: {{ instance }} +{% set node_name = 'edpm-' + instance %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} + {{ node_name }}: +{{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} {% if ('repo-setup' not in (_original_nodeset['services'] | default([]))) and diff --git 
a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset-values/values.yaml.j2 index 8fca35bd0..b166ce872 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset-values/values.yaml.j2 @@ -44,8 +44,10 @@ data: {% endif %} nodes: {% for instance in instances_names %} - edpm-{{ instance }}: - hostName: {{ instance }} +{% set node_name = 'edpm-' + instance %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} + {{ node_name }}: +{{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} {% if ('repo-setup' not in (_original_nodeset['services'] | default([]))) and diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset2-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset2-values/values.yaml.j2 index 80103686c..a3bb0e28e 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset2-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset2-values/values.yaml.j2 @@ -44,8 +44,10 @@ data: {% endif %} nodes: {% for instance in instances_names %} - edpm-{{ instance }}: - hostName: {{ instance }} +{% set node_name = 'edpm-' + instance %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} + {{ node_name }}: +{{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} {% if ('repo-setup' not in (_original_nodeset['services'] | default([]))) and diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6/edpm-nodeset-values/values.yaml.j2 
b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6/edpm-nodeset-values/values.yaml.j2 index 3c4278102..5f608e64c 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6/edpm-nodeset-values/values.yaml.j2 @@ -31,8 +31,10 @@ data: {% endif %} nodes: {% for instance in instances_names %} - edpm-{{ instance }}: - hostName: {{ instance }} +{% set node_name = 'edpm-' + instance %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} + {{ node_name }}: +{{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} {% if ('repo-setup' not in (_original_nodeset['services'] | default([]))) and diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-networker/edpm-common-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-networker/edpm-common-nodeset-values/values.yaml.j2 index 9961a6fe5..937e86f54 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-networker/edpm-common-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-networker/edpm-common-nodeset-values/values.yaml.j2 @@ -45,8 +45,10 @@ data: {% endif %} nodes: {% for instance in instance_names %} - edpm-{{ instance }}: - hostName: {{ instance }} +{% set node_name = 'edpm-' + instance %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} + {{ node_name }}: +{{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} {% if ('repo-setup' not in (_original_nodeset['services'] | default([]))) and diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov/edpm-nodeset-values/values.yaml.j2 index 110b4007e..ab1c12103 100644 --- 
a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov/edpm-nodeset-values/values.yaml.j2 @@ -31,8 +31,10 @@ data: {% endif %} nodes: {% for instance in instances_names %} - edpm-{{ instance }}: - hostName: {{ instance }} +{% set node_name = 'edpm-' + instance %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} + {{ node_name }}: +{{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} {% if ('repo-setup' not in (_original_nodeset['services'] | default([]))) and diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk/edpm-nodeset-values/values.yaml.j2 index 131b9b350..7204c800e 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk/edpm-nodeset-values/values.yaml.j2 @@ -31,8 +31,10 @@ data: {% endif %} nodes: {% for instance in instances_names %} - edpm-{{ instance }}: - hostName: {{ instance }} +{% set node_name = 'edpm-' + instance %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} + {{ node_name }}: +{{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} {% if ('repo-setup' not in (_original_nodeset['services'] | default([]))) and ('repo-setup' in ci_gen_kustomize_edpm_nodeset_predeployed_services) %} diff --git a/roles/ci_gen_kustomize_values/templates/sriov/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/sriov/edpm-nodeset-values/values.yaml.j2 index 0eadba79f..e2d1aaed0 100644 --- a/roles/ci_gen_kustomize_values/templates/sriov/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/sriov/edpm-nodeset-values/values.yaml.j2 @@ -31,8 +31,10 @@ data: {% endif %} nodes: {% for 
instance in instances_names %} - edpm-{{ instance }}: - hostName: {{ instance }} +{% set node_name = 'edpm-' + instance %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} + {{ node_name }}: +{{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} {% if ('repo-setup' not in (_original_nodeset['services'] | default([]))) and ('repo-setup' in ci_gen_kustomize_edpm_nodeset_predeployed_services) %} From c9b6df695edddc1fc80331dfefe69d785c915fde Mon Sep 17 00:00:00 2001 From: Michael Burke Date: Wed, 13 May 2026 10:08:33 -0400 Subject: [PATCH 08/22] [pcp_metrics] Fix broken hosts selection Because `cifmw_pcp_metrics_hosts` was defined in the pcp_metrics role's defaults file, the `pcp-metrics-pre.yml` playbook was trying to use this var before it was imported, leading to `'cifmw_pcp_metrics_hosts' is undefined` errors Signed-off-by: Michael Burke --- hooks/playbooks/pcp-metrics-post.yml | 2 +- hooks/playbooks/pcp-metrics-pre.yml | 2 +- roles/pcp_metrics/defaults/main.yaml | 3 --- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/hooks/playbooks/pcp-metrics-post.yml b/hooks/playbooks/pcp-metrics-post.yml index d188696e8..509085aa1 100644 --- a/hooks/playbooks/pcp-metrics-post.yml +++ b/hooks/playbooks/pcp-metrics-post.yml @@ -7,7 +7,7 @@ # The best place to call this hook is under post_tests actions.
# - name: Collect performance metrics - hosts: "{{ cifmw_pcp_metrics_hosts }}" + hosts: "{{ cifmw_pcp_metrics_hosts | default('all,!localhost') }}" gather_facts: false tasks: - name: Gather metrics diff --git a/hooks/playbooks/pcp-metrics-pre.yml b/hooks/playbooks/pcp-metrics-pre.yml index 15240aa71..7e56fb63e 100644 --- a/hooks/playbooks/pcp-metrics-pre.yml +++ b/hooks/playbooks/pcp-metrics-pre.yml @@ -24,7 +24,7 @@ tasks_from: repo - name: Start collecting performance metrics - hosts: "{{ cifmw_pcp_metrics_hosts }}" + hosts: "{{ cifmw_pcp_metrics_hosts | default('all,!localhost') }}" gather_facts: false tasks: - name: Setup PCP diff --git a/roles/pcp_metrics/defaults/main.yaml b/roles/pcp_metrics/defaults/main.yaml index d8dd90545..1a9b61667 100644 --- a/roles/pcp_metrics/defaults/main.yaml +++ b/roles/pcp_metrics/defaults/main.yaml @@ -4,9 +4,6 @@ pcp_metrics_setup: false pcp_metrics_gather: false pcp_metrics_plot: false -# Host pattern for PCP hook playbooks (setup and gather) -cifmw_pcp_metrics_hosts: "all,!localhost" - # Setup-related variables pcp_metrics_packages: - pcp # for pmlogger From 7077c5ffad944540df0ca2b2db09032aeb30a2c4 Mon Sep 17 00:00:00 2001 From: nemarjan Date: Mon, 11 May 2026 11:34:43 +0100 Subject: [PATCH 09/22] [env_op_images] Add unit tests for verify_pulled_report_crio Add focused unit coverage for CRI-O pulled report verification, including successful enrichment, cross-node evidence accounting, and failure when no logs are provided. Reuse shared test utilities with a local fallback so tests run in both collection-style and local environments. 
Co-authored-by: Cursor Signed-off-by: nemarjan --- .../modules/verify_pulled_report_crio.py | 1 + .../modules}/verify_pulled_report_crio.py | 0 .../tasks/verify_pulled_report_crio.yml | 2 +- .../modules/test_verify_pulled_report_crio.py | 316 ++++++++++++++++++ 4 files changed, 318 insertions(+), 1 deletion(-) create mode 120000 ci/playbooks/collections/ansible_collections/cifmw/general/plugins/modules/verify_pulled_report_crio.py rename {roles/env_op_images/library => plugins/modules}/verify_pulled_report_crio.py (100%) create mode 100644 tests/unit/modules/test_verify_pulled_report_crio.py diff --git a/ci/playbooks/collections/ansible_collections/cifmw/general/plugins/modules/verify_pulled_report_crio.py b/ci/playbooks/collections/ansible_collections/cifmw/general/plugins/modules/verify_pulled_report_crio.py new file mode 120000 index 000000000..042438ec8 --- /dev/null +++ b/ci/playbooks/collections/ansible_collections/cifmw/general/plugins/modules/verify_pulled_report_crio.py @@ -0,0 +1 @@ +../../../../../../../../plugins/modules/verify_pulled_report_crio.py \ No newline at end of file diff --git a/roles/env_op_images/library/verify_pulled_report_crio.py b/plugins/modules/verify_pulled_report_crio.py similarity index 100% rename from roles/env_op_images/library/verify_pulled_report_crio.py rename to plugins/modules/verify_pulled_report_crio.py diff --git a/roles/env_op_images/tasks/verify_pulled_report_crio.yml b/roles/env_op_images/tasks/verify_pulled_report_crio.yml index 5d4547b9b..673a9a699 100644 --- a/roles/env_op_images/tasks/verify_pulled_report_crio.yml +++ b/roles/env_op_images/tasks/verify_pulled_report_crio.yml @@ -106,7 +106,7 @@ - name: Enrich pulled report with CRI-O evidence when: _verify_crio_log_files.matched | int > 0 - verify_pulled_report_crio: + cifmw.general.verify_pulled_report_crio: report_path: "{{ cifmw_env_op_images_pulled_report_path }}" log_dir: "{{ cifmw_env_op_images_crio_logs_dir }}" output_path: "{{ 
cifmw_env_op_images_verified_report_path }}" diff --git a/tests/unit/modules/test_verify_pulled_report_crio.py b/tests/unit/modules/test_verify_pulled_report_crio.py new file mode 100644 index 000000000..5c738354a --- /dev/null +++ b/tests/unit/modules/test_verify_pulled_report_crio.py @@ -0,0 +1,316 @@ +# Copyright: (c) 2025, Red Hat + +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +from __future__ import absolute_import, division, print_function + +import os +import tempfile + +import yaml + +from ansible_collections.cifmw.general.tests.unit.utils import ( + AnsibleExitJson, + AnsibleFailJson, + ModuleBaseTestCase, + set_module_args, +) +from ansible_collections.cifmw.general.plugins.modules import ( + verify_pulled_report_crio, +) + + +class TestVerifyPulledReportCrio(ModuleBaseTestCase): + def test_enriches_report_and_counts_cross_node(self): + """ + GIVEN a pulled-images report with two digest entries across two nodes + and CRI-O logs showing each image pulled on its own node + WHEN the module processes the report against the logs + THEN it enriches every image with log evidence, identifies trusted + mirrors, and reports zero cross-node entries + """ + report_data = { + "summary": { + "mirror_rules": [ + {"mirror": "mirror.registry.example:5000/ns"}, + {"mirror": "other.example/ns"}, + ] + }, + "images": [ + { + "image_id": ( + "quay.io/demo/app@sha256:" + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + ), + "node": "node-a", + }, + { + "image_id": ( + "quay.io/demo/other@sha256:" + "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + ), + "node": "node-b", + }, + {"image_id": "no-digest-here", "node": "node-a"}, + ], + } + + with tempfile.TemporaryDirectory() as td: + report_path = os.path.join(td, "pulled_images_report.yaml") + output_path = 
os.path.join(td, "verified.yaml") + log_a = os.path.join(td, "node-a.crio.log") + log_b = os.path.join(td, "node-b.crio.log") + + with open(report_path, "w") as f: + yaml.safe_dump( + report_data, f, default_flow_style=False, sort_keys=False + ) + + with open(log_a, "w") as f: + f.write( + 'level=info msg="Pulled image: ' + "mirror.registry.example:5000/ns/app@sha256:" + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + '"\n' + ) + + with open(log_b, "w") as f: + f.write( + 'level=info msg="Pulled image: ' + "quay.io/demo/other@sha256:" + "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + '"\n' + ) + + set_module_args( + dict( + report_path=report_path, + output_path=output_path, + log_paths=[log_a, log_b], + ) + ) + + with self.assertRaises(AnsibleExitJson) as rst: + verify_pulled_report_crio.run_module() + + result = rst.exception.args[0] + self.assertTrue(result["changed"]) + self.assertEqual(result["log_files"], 2) + self.assertEqual(result["entries_with_digest"], 2) + self.assertEqual(result["cross_node_entries"], 0) + self.assertIn("node-a", result["nodes_with_evidence"]) + self.assertIn("node-b", result["nodes_with_evidence"]) + self.assertIn("mirror.registry.example:5000", result["trusted_mirrors"]) + + with open(output_path, "r") as f: + enriched = yaml.safe_load(f) + + img0 = enriched["images"][0] + self.assertEqual(img0["log_evidence_node"], "node-a") + self.assertEqual( + img0["log_evidence_uri"], + "mirror.registry.example:5000/ns/app", + ) + self.assertEqual(img0["node_verified_image_origin"], "mirror") + + img1 = enriched["images"][1] + self.assertEqual(img1["log_evidence_node"], "node-b") + self.assertEqual(img1["log_evidence_uri"], "quay.io/demo/other") + self.assertEqual(img1["node_verified_image_origin"], "source") + + def test_cross_node_match_increments_counter(self): + """ + GIVEN a pulled-images report listing an image on node-a + and a CRI-O log that records the same digest on node-b + WHEN the module 
processes the report against the logs + THEN the cross_node_entries counter is incremented and the + evidence node is set to the log's node (node-b) + """ + report_data = { + "summary": {"mirror_rules": [{"mirror": "mirror.example/ns"}]}, + "images": [ + { + "image_id": ( + "quay.io/demo/app@sha256:" + "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" + ), + "node": "node-a", + } + ], + } + + with tempfile.TemporaryDirectory() as td: + report_path = os.path.join(td, "pulled_images_report.yaml") + output_path = os.path.join(td, "verified.yaml") + log_b = os.path.join(td, "node-b.crio.log") + + with open(report_path, "w") as f: + yaml.safe_dump( + report_data, f, default_flow_style=False, sort_keys=False + ) + + with open(log_b, "w") as f: + f.write( + 'level=info msg="Pulled image: ' + "mirror.example/ns/app@sha256:" + "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" + '"\n' + ) + + set_module_args( + dict( + report_path=report_path, + output_path=output_path, + log_paths=[log_b], + ) + ) + + with self.assertRaises(AnsibleExitJson) as rst: + verify_pulled_report_crio.run_module() + + result = rst.exception.args[0] + self.assertEqual(result["entries_with_digest"], 1) + self.assertEqual(result["cross_node_entries"], 1) + + with open(output_path, "r") as f: + enriched = yaml.safe_load(f) + + img0 = enriched["images"][0] + self.assertEqual(img0["log_evidence_node"], "node-b") + self.assertEqual(img0["node_verified_image_origin"], "mirror") + + def test_fails_when_no_log_files(self): + """ + GIVEN an empty list of CRI-O log paths + WHEN the module is invoked + THEN it fails with an error indicating no log files were provided + """ + set_module_args( + dict( + report_path="/tmp/in.yaml", + output_path="/tmp/out.yaml", + log_paths=[], + ) + ) + + with self.assertRaises(AnsibleFailJson) as rst: + verify_pulled_report_crio.run_module() + + self.assertIn("No CRI-O log files", rst.exception.args[0]["msg"]) + + def 
test_fails_when_log_file_unreadable(self): + """ + GIVEN a log_paths entry that points to a non-existent file + WHEN the module tries to open it + THEN it fails with an error mentioning the file path + """ + with tempfile.TemporaryDirectory() as td: + report_path = os.path.join(td, "report.yaml") + with open(report_path, "w") as f: + yaml.safe_dump( + {"summary": {}, "images": []}, + f, + default_flow_style=False, + ) + + missing_log = os.path.join(td, "ghost.crio.log") + set_module_args( + dict( + report_path=report_path, + output_path=os.path.join(td, "out.yaml"), + log_paths=[missing_log], + ) + ) + + with self.assertRaises(AnsibleFailJson) as rst: + verify_pulled_report_crio.run_module() + + self.assertIn("Cannot read CRI-O log file", rst.exception.args[0]["msg"]) + self.assertIn("ghost.crio.log", rst.exception.args[0]["msg"]) + + def test_fails_when_report_unreadable(self): + """ + GIVEN a report_path that does not exist on disk + WHEN the module tries to open it + THEN it fails with an error mentioning the report path + """ + with tempfile.TemporaryDirectory() as td: + log_path = os.path.join(td, "node-a.crio.log") + with open(log_path, "w") as f: + f.write("") + + set_module_args( + dict( + report_path=os.path.join(td, "no_such_report.yaml"), + output_path=os.path.join(td, "out.yaml"), + log_paths=[log_path], + ) + ) + + with self.assertRaises(AnsibleFailJson) as rst: + verify_pulled_report_crio.run_module() + + self.assertIn("Cannot read report", rst.exception.args[0]["msg"]) + + def test_fails_when_report_has_invalid_yaml(self): + """ + GIVEN a report file whose contents are not valid YAML + WHEN the module tries to parse it + THEN it fails with an error about invalid YAML + """ + with tempfile.TemporaryDirectory() as td: + report_path = os.path.join(td, "bad.yaml") + with open(report_path, "w") as f: + f.write("{{: not: valid: yaml: [}") + + log_path = os.path.join(td, "node-a.crio.log") + with open(log_path, "w") as f: + f.write("") + + set_module_args( 
+ dict( + report_path=report_path, + output_path=os.path.join(td, "out.yaml"), + log_paths=[log_path], + ) + ) + + with self.assertRaises(AnsibleFailJson) as rst: + verify_pulled_report_crio.run_module() + + self.assertIn("Invalid YAML in report", rst.exception.args[0]["msg"]) + + def test_fails_when_report_root_is_not_a_dict(self): + """ + GIVEN a report file whose YAML root is a list instead of a mapping + WHEN the module checks the structure + THEN it fails with an error about the root type + """ + with tempfile.TemporaryDirectory() as td: + report_path = os.path.join(td, "list.yaml") + with open(report_path, "w") as f: + yaml.safe_dump( + ["item1", "item2"], + f, + default_flow_style=False, + ) + + log_path = os.path.join(td, "node-a.crio.log") + with open(log_path, "w") as f: + f.write("") + + set_module_args( + dict( + report_path=report_path, + output_path=os.path.join(td, "out.yaml"), + log_paths=[log_path], + ) + ) + + with self.assertRaises(AnsibleFailJson) as rst: + verify_pulled_report_crio.run_module() + + self.assertIn("Report root must be a mapping", rst.exception.args[0]["msg"]) From c2afd2255976c3553a487e729a920c281f38b002 Mon Sep 17 00:00:00 2001 From: Roberto Alfieri Date: Mon, 13 Apr 2026 17:04:10 +0200 Subject: [PATCH 10/22] [ci] Add AGENTS.md and CLAUDE.md for AI agent guidance Add structured documentation for AI coding agents working on this repository. AGENTS.md provides repo-wide context: variable naming rules, generated file warnings, testing commands, commit conventions, and repository layout. CLAUDE.md adds Claude-specific behavioral guidance and references AGENTS.md as the primary source of truth. 
Co-Authored-By: Claude Opus 4.6 Signed-off-by: Roberto Alfieri --- AGENTS.md | 297 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ CLAUDE.md | 3 + 2 files changed, 300 insertions(+) create mode 100644 AGENTS.md create mode 100644 CLAUDE.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..4b2b0dd4b --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,297 @@ +# AGENTS.md + +## What is this repository + +CI-Framework is an Ansible collection (`cifmw.general`) that bootstraps +development and CI environments for RHOSO (Red Hat OpenStack Services on +OpenShift). It is **not** intended for production or long-lived deployments. + +The upstream repository lives at +`https://github.com/openstack-k8s-operators/ci-framework`. + +## Tech stack + +- **Ansible** collection (requires `ansible-core >= 2.15`). +- **Python 3** modules and plugins under `plugins/`. +- **Molecule** + Podman for per-role testing. +- **ansible-test** for unit, sanity, and integration tests on plugins. +- **Sphinx** for documentation (hosted on ReadTheDocs). +- **Zuul**, **GitHub Actions**, and **Prow** for CI. + +## Repository layout + +| Path | Description | +|---|---| +| `roles/` | Ansible roles. Each has `defaults/`, `tasks/`, `molecule/`, `README.md`. | +| `playbooks/` | Domain-specific playbooks in subdirectories (`adoption/`, `ceph/`, `bgp/`, etc.). Legacy numbered-stage playbooks are deprecated -- orchestration is handled by the `cifmw_setup` role. | +| `plugins/` | Collection plugins: `action/`, `filter/`, `modules/`, `module_utils/`. Test with `ansible-test`, not Molecule. | +| `tests/` | `ansible-test` suites: `unit/` (pytest), `integration/targets/`, `sanity/ignore.txt`. | +| `ci/` | CI-only playbooks: content provider, EDPM, kuttl, architecture validation, doc build, log collection. | +| `scenarios/` | Scenario-oriented variable packs used by framework flows. 
| +| `scripts/` | Environment setup, Molecule runner, ansible-test runner, Zuul/Molecule generation, snippet checks. | +| `docs/` | Sphinx sources under `docs/source/`. | +| `hooks/` | Hook playbooks consumed by the framework. Some hooks have their own `roles/` subdirectory. | +| `custom/` | Local overrides (gitignored except `README.md`). Safe for local dev experiments, never committed. | +| `containerfiles/` | Podman images for CI (`Containerfile.ci`, `Containerfile.tests`). | +| `group_vars/` | Shared group variables (e.g., `all.yml`). Changes here affect every playbook run. | +| `zuul.d/` | Zuul job and project definitions. **Some files are generated -- see below.** | +| `_skeleton_role_/` | Template used by `ansible-galaxy role init` when creating new roles. | + +## Critical rules + +### Variable naming + +All Ansible role variables **must** match the pattern `^cifmw_[a-z_][a-z0-9_]*$` +where after the `cifmw_` prefix comes the role name, then the variable name +(e.g., `cifmw_my_role_some_setting`). +This is enforced by `ansible-lint` with `strict: true` and `profile: production`. + +### FQCN required + +All module calls must use fully-qualified collection names. +The following FQCN rules are enabled in `.ansible-lint`: +`fqcn-builtins`, `fqcn[action]`, `fqcn[action-core]`, `fqcn[canonical]`, `fqcn[deep]`. + +### Generated files -- do not hand-edit + +The following files are **generated** by `scripts/create_role_molecule.py`: + +- `zuul.d/molecule.yaml` +- `zuul.d/projects.yaml` (molecule section) + +To regenerate: `make role_molecule`. To verify consistency: `make check_zuul_files`. +If you hand-edit these files, CI will reject the change. + +### Read-only / generated paths + +Do **not** modify these paths directly: + +| Path | Reason | +|---|---| +| `zuul.d/molecule.yaml` | Generated by `scripts/create_role_molecule.py`. | +| `zuul.d/projects.yaml` | Generated (molecule section). | +| `custom/` | Gitignored. Local-only overrides, never committed. 
| +| `hooks/playbooks/roles/` | Excluded from ansible-lint. Owned by hook authors. | + +All other paths (`roles/`, `playbooks/`, `plugins/`, `group_vars/`, `scenarios/`, +`hooks/playbooks/`, `ci/`, `scripts/`, `docs/`) are safe to edit following the +conventions in this file. + +### Debugging patterns + +Use `block`/`rescue`/`always` for complex task sequences. Dump relevant +variables in the `rescue` block, then `ansible.builtin.fail` to stop +execution. This makes CI failures much easier to diagnose. + +### Do not be too verbose + +There is no need to provide explanation for each variable in each task file. +It is enough when the variable description is available in the role README +file. Add comments only to complex code. + +### Do not create too many variables + +Balance the amount of variables: there is no need to create additional +variables especially for static values that are only used once by another +module or role. In that case, fewer variables means more clarity. + +### Make tasks easier to debug + +Tasks should not do too many things in a single step -- on failure that +becomes difficult to debug. In some cases, adding more small, fast tasks +is better than one large task. Add debug messages only in very complex +places, not everywhere. + +## Playbooks + +Do not rely on the `playbooks/` directory as the primary orchestration layer. +The numbered-stage playbooks (`01-bootstrap.yml`, `02-infra.yml`, etc.) are +**deprecated** -- orchestration is now handled by the `cifmw_setup` role. + +The `playbooks/` directory still contains: +- **Subdirectories** (`adoption/`, `ceph/`, `bgp/`, `dcn/`, `multi-namespace/`) + with domain-specific flows. +- **Standalone playbooks** (`hooks.yml`, `update.yml`, `dcn.yml`, `nfs.yml`, + `switches_config.yml`, etc.) for specific operations. + +## Creating a new role + +Always use the Makefile: + +``` +make new_role ROLE_NAME=my_role +``` + +This generates the skeleton, Molecule config, and updates Zuul jobs. 
+Every new role must have: + +1. A `README.md` documenting its parameters. +2. Molecule test scenarios. +3. Documentation that builds cleanly (checked in CI). + +If the role cannot be tested via Molecule, remove the `molecule/` directory +and run `make role_molecule` to regenerate Zuul jobs. Add a note in the +role's `README.md` explaining why. + +## Testing + +### Commands + +| Command | What it does | +|---|---| +| `make pre_commit` | Runs pre-commit hooks (shellcheck, black, ansible-lint) with dependency install. | +| `make molecule` | Runs Molecule tests for all roles with dependency install. | +| `make ansible_test` | Runs ansible-test (units + sanity + integration) with dependency install. | +| `make tests` | Runs pre-commit + Molecule. | +| `make check_zuul_files` | Regenerates Zuul YAML and fails if it differs from committed files. | +| `make docs` | Builds Sphinx documentation under `docs/_build/html/`. | +| `make spelling` | Runs `pyspelling` on docs. | +| `make plugin-development-enable` | Rewrites import paths and sets `PYTHONPATH` for local plugin dev. | +| `make plugin-development-disable` | Reverts the changes made by `plugin-development-enable`. | + +### Container-based testing (requires Podman) + +| Command | What it does | +|---|---| +| `make run_ctx_pre_commit` | Pre-commit in a container. | +| `make run_ctx_molecule` | Molecule in a container. | +| `make run_ctx_ansible_test` | ansible-test in a container. | +| `make run_ctx_all_tests` | All of the above. | + +### Molecule specifics + +- Config: `.config/molecule/config_podman.yml` (host) or `config_local.yml` (container). +- Test a single role: `TEST_SINGLE_ROLE=my_role make molecule` or `make run_ctx_molecule`. +- Molecule scenarios live under `roles/<role_name>/molecule/`. + +### Validation priority + +When verifying a change, run checks in this order: + +1. `pre-commit run --all-files` — fast lint pass (ansible-lint, black, shellcheck). +2. `TEST_SINGLE_ROLE=<role_name> make molecule` — targeted Molecule test.
+3. `make ansible_test` — plugin unit/sanity/integration tests (only if plugins changed). +4. `make docs` — only if documentation was modified. + +### macOS limitations + +The Makefile test targets (`make tests`, `make molecule`, `make ansible_test`, +`make setup_tests`, `make setup_molecule`, and their `_nodeps` variants) **do not +work on macOS**. The underlying scripts use `readlink -f`, which is not available +on macOS. These targets are designed for Linux CI environments only. + +## Linting and code style + +- **ansible-lint**: `production` profile, `strict: true`. Config in `.ansible-lint`. +- **Python**: Formatted with `black`. +- **Shell**: Checked with `shellcheck` (severity=error, excludes SC2071). +- **Pre-commit**: Config in `.pre-commit-config.yaml`. Run with `make pre_commit` or `make run_ctx_pre_commit`. +- **Spelling**: `pyspelling` on docs. Run with `make spelling`. + +### Excluded from linting + +ansible-lint skips: `.github/`, `scripts/`, `docs/`, `containerfiles/`, `ci/`, +and the generated Zuul files (`zuul.d/projects.yaml`, `zuul.d/molecule.yaml`). + +## Commit conventions + +- **Title**: Must begin with the role name in brackets or parentheses: + `[my_role] Add feature X` or `(my_role) Fix bug Y`. + If changes span multiple roles, use `[multiple]` or `(multiple)`. + For cross-cutting changes use a category: `[ci]`, `[docs]`, `[Feature]`. +- **Body**: Must be longer than 10 characters and describe **why** the change + was made. +- **Sign-off**: Required (`git commit --signoff`). The sign-off certifies a + [DCO](https://developercertificate.org/). AI agents cannot sign off on behalf + of a human -- the committer must add it themselves or amend the commit. +- **AI attribution**: Use `Co-Authored-By:` for substantial AI-generated code, + `Assisted-By:` for minor AI help. Disclose the scope in the PR description. 
+- **Ticket references**: Link Jira cards in the commit message body: + `Closes: ANVIL-123` (resolves the ticket) or + `Related-Issue: #OSPRH-12345` (related but does not close). +- **Cross-repo dependencies**: When a change depends on an unmerged PR/MR in + another repository, add `Depends-On: <PR/MR URL>` in the PR/MR + description. Zuul uses this to test the changes together. + +### Commit strategy + +To keep a clean git history, prefer a single commit per feature or fix: + +1. Create the initial commit normally. +2. For subsequent changes on the same branch, amend the existing commit + (`git commit --amend`) instead of creating new ones. +3. After amending, use `git push --force` to update the remote branch. + +Never push directly to `main` — it is a protected branch. Always work on +a feature branch. Force pushing is only appropriate for **solo feature +branches**, never for `main` or shared branches. + +If a change is not directly related to the main goal of the pull request +but is required for it to work, add it as a **separate commit**. When +amending, be careful to edit only the commits that belong to the same +pull request. + +## Branch workflow + +- The default branch is `main`. +- Feature work happens on topic branches. +- PRs target `main` unless otherwise specified. +- Branch names should be descriptive (e.g., `fix-reproducer-pull-secret`, + `feature/OSPRH-12345-new-role`). + +## PR process + +- PRs are auto-set to draft on open. To undraft, push a non-`nit:` change. +- Minimum **2 approvals** required (excluding the author). +- Security-sensitive code requires additional maintainer review. +- Ownership is defined in `OWNERS` and `OWNERS_ALIASES`. + +## Relationship to ci-framework-jobs + +The `ci-framework-jobs` repository holds downstream Zuul job definitions that +consume this repository.
Jobs in that repo declare +`required-projects: openstack-k8s-operators/ci-framework` and +`roles: zuul: openstack-k8s-operators/ci-framework` so Zuul checks out this +repo and exposes its roles during job execution. Uni jobs orchestrate this +repo's `reproducer.yml` playbook as their main entry point. + +When making changes here that affect CI behavior, coordinate with the +corresponding job definitions in `ci-framework-jobs`. + +## Plugin development + +To develop collection plugins locally without installing the collection: + +``` +make plugin-development-enable +``` + +This rewrites import paths and sets `PYTHONPATH`. Revert with: + +``` +make plugin-development-disable +``` + +Plugins are tested with `ansible-test`, not Molecule. + +## Documentation first + +Before searching the web or relying on general knowledge, check local +documentation: +- `docs/source/` — Sphinx sources for the ci-framework collection. +- Role-level `README.md` files under `roles/<role_name>/`. +- The downstream CI docs repository at + `https://gitlab.cee.redhat.com/ci-framework/docs` covers job types, + pipelines, troubleshooting, and glossary. + +## Confirm before acting + +Before performing expensive or broad-impact operations, confirm with the +user first: + +- Running full test suites (`make tests`, `make molecule`) — ask whether a + targeted run (`TEST_SINGLE_ROLE=<role_name>`) is sufficient. +- Modifying `group_vars/all.yml` — changes here affect every playbook run. +- Editing roles used by multiple playbooks — flag the blast radius. +- Cross-repo changes that require coordinated updates in `ci-framework-jobs` + or `architecture`.
diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..f6aa6c026 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,3 @@ +# CLAUDE.md + +@AGENTS.md From 6807aa06c8a019ab3020c3c8ac6d3e27d5e64a2e Mon Sep 17 00:00:00 2001 From: Luca Miccini Date: Thu, 7 May 2026 07:44:26 +0200 Subject: [PATCH 11/22] Add dt-sharded-compact reproducer scenario New reproducer for the dt-sharded deployment topology which uses per-service dedicated galera, rabbitmq and memcached clusters. Based on dt-vhosts-compact with designateext network removed (not used by dt-sharded) and ceph/service-values paths adjusted. Signed-Off-By: Luca Miccini Co-Authored-By: Claude Opus 4.6 --- scenarios/reproducers/dt-sharded-compact.yml | 322 +++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 scenarios/reproducers/dt-sharded-compact.yml diff --git a/scenarios/reproducers/dt-sharded-compact.yml b/scenarios/reproducers/dt-sharded-compact.yml new file mode 100644 index 000000000..686dd3e30 --- /dev/null +++ b/scenarios/reproducers/dt-sharded-compact.yml @@ -0,0 +1,322 @@ +--- +cifmw_architecture_scenario: dt-sharded + +# Automation section. Most of those parameters will be passed to the +# controller-0 as-is and be consumed by the `deploy-va.sh` script. +# Please note, all paths are on the controller-0, meaning managed by the +# Framework. Please do not edit them!
+_arch_repo: "{{ cifmw_architecture_repo }}" +cifmw_ceph_client_vars: /tmp/ceph_client.yml +cifmw_ceph_client_values_post_ceph_path_src: >- + {{ _arch_repo }}/examples/dt/dt-sharded/values.yaml +cifmw_ceph_client_values_post_ceph_path_dst: >- + {{ cifmw_ceph_client_values_post_ceph_path_src }} +cifmw_ceph_client_service_values_post_ceph_path_src: >- + {{ _arch_repo }}/examples/dt/dt-sharded/control-plane/service-values.yaml +cifmw_ceph_client_service_values_post_ceph_path_dst: >- + {{ cifmw_ceph_client_service_values_post_ceph_path_src }} + + +# workaround https://issues.redhat.com/browse/OSPRH-6675 +cifmw_ceph_spec_public_network: "{{ cifmw_networking_definition.networks.ctlplane.network }}" + +# HERE if you want to override kustomization, you can uncomment this parameter +# and push the data structure you want to apply. +# cifmw_architecture_user_kustomize: +# stage_0: +# 'network-values': +# data: +# starwars: Obiwan + +# HERE, if you want to stop the deployment loop at any stage, you can uncomment +# the following parameter and update the value to match the stage you want to +# reach. 
Known stages are: +# pre_kustomize_stage_INDEX +# pre_apply_stage_INDEX +# post_apply_stage_INDEX +# +# cifmw_deploy_architecture_stopper: + +cifmw_allow_vms_to_reach_osp_api: true + +# Full networking definition including designate network +cifmw_networking_definition: + networks: + ctlplane: + network: "192.168.122.0/24" + gateway: "192.168.122.1" + dns: + - "192.168.122.1" + mtu: 1500 + tools: + multus: + ranges: + - start: 30 + end: 70 + netconfig: + ranges: + - start: 100 + end: 120 + - start: 150 + end: 170 + metallb: + ranges: + - start: 80 + end: 90 + internalapi: + network: "172.17.0.0/24" + vlan: 20 + mtu: 1500 + tools: + multus: + ranges: + - start: 30 + end: 70 + metallb: + ranges: + - start: 80 + end: 99 + netconfig: + ranges: + - start: 100 + end: 250 + storage: + network: "172.18.0.0/24" + vlan: 21 + mtu: 1500 + tools: + multus: + ranges: + - start: 30 + end: 70 + metallb: + ranges: + - start: 80 + end: 90 + netconfig: + ranges: + - start: 100 + end: 250 + tenant: + network: "172.19.0.0/24" + vlan: 22 + mtu: 1500 + tools: + multus: + ranges: + - start: 30 + end: 70 + metallb: + ranges: + - start: 80 + end: 90 + netconfig: + ranges: + - start: 100 + end: 250 + storagemgmt: + network: "172.20.0.0/24" + vlan: 23 + mtu: 1500 + tools: + netconfig: + ranges: + - start: 100 + end: 250 + designate: + network: "172.26.0.0/24" + vlan: 24 + mtu: 1500 + tools: + multus: + ranges: + - start: 30 + end: 70 + metallb: + ranges: + - start: 80 + end: 90 + netconfig: + ranges: + - start: 100 + end: 250 + designateext: + network: "172.34.0.0/24" + vlan: 34 + mtu: 1500 + tools: + multus: + ranges: + - start: 30 + end: 70 + metallb: + ranges: + - start: 80 + end: 90 + netconfig: + ranges: + - start: 100 + end: 250 + external: + network: "10.0.0.0/24" + vlan: 22 + mtu: 1500 + tools: + netconfig: + ranges: + - start: 100 + end: 250 + group-templates: + ocps: + network-template: + range: + start: 10 + length: 10 + networks: + ctlplane: {} + internalapi: + trunk-parent: 
ctlplane + tenant: + trunk-parent: ctlplane + storage: + trunk-parent: ctlplane + designate: + trunk-parent: ctlplane + designateext: + trunk-parent: ctlplane + computes: + network-template: + range: + start: 100 + length: 21 + networks: + ctlplane: {} + internalapi: + trunk-parent: ctlplane + tenant: + trunk-parent: ctlplane + storage: + trunk-parent: ctlplane + storagemgmt: + trunk-parent: ctlplane + cephs: + network-template: + range: + start: 150 + length: 21 + networks: + ctlplane: {} + internalapi: + trunk-parent: ctlplane + storage: + trunk-parent: ctlplane + storagemgmt: + trunk-parent: ctlplane + tenant: + trunk-parent: ctlplane + instances: + controller-0: + networks: + ctlplane: + ip: "192.168.122.9" + +# HCI requires bigger size to hold OCP on OSP disks +cifmw_block_device_size: 100G +cifmw_libvirt_manager_compute_disksize: 160 +cifmw_libvirt_manager_compute_memory: 50 +cifmw_libvirt_manager_compute_cpus: 8 + +cifmw_libvirt_manager_configuration: + networks: + osp_trunk: | + + osp_trunk + + + + + + + ocpbm: | + + ocpbm + + + + + + + ocppr: | + + ocppr + + + + vms: + ocp: + amount: 3 + admin_user: core + image_local_dir: "{{ cifmw_basedir }}/images/" + disk_file_name: "ocp_master" + disksize: "100" + cpus: 16 + memory: 64 + root_part_id: 4 + uefi: true + nets: + - ocppr + - ocpbm + - osp_trunk + compute: + uefi: "{{ cifmw_use_uefi }}" + root_part_id: "{{ cifmw_root_partition_id }}" + amount: "{{ [cifmw_libvirt_manager_compute_amount|int, 3] | max }}" + image_url: "{{ cifmw_discovered_image_url }}" + sha256_image_name: "{{ cifmw_discovered_hash }}" + image_local_dir: "{{ cifmw_basedir }}/images/" + disk_file_name: "compute-base-os.qcow2" + disksize: "{{ [cifmw_libvirt_manager_compute_disksize|int, 50] | max }}" + memory: "{{ [cifmw_libvirt_manager_compute_memory|int, 8] | max }}" + cpus: "{{ [cifmw_libvirt_manager_compute_cpus|int, 4] | max }}" + nets: + - ocpbm + - osp_trunk + controller: + uefi: "{{ cifmw_use_uefi }}" + root_part_id: "{{ 
cifmw_root_partition_id }}" + image_url: "{{ cifmw_discovered_image_url }}" + sha256_image_name: "{{ cifmw_discovered_hash }}" + image_local_dir: "{{ cifmw_basedir }}/images/" + disk_file_name: "base-os.qcow2" + disksize: 50 + memory: 8 + cpus: 4 + nets: + - ocpbm + - osp_trunk + +## devscript support for OCP deploy +cifmw_devscripts_config_overrides: + fips_mode: "{{ cifmw_fips_enabled | default(false) | bool }}" + +# Note: with that extra_network_names "osp_trunk", we instruct +# devscripts role to create a new network, and associate it to +# the OCP nodes. This one is a "private network", and will hold +# the VLANs used for network isolation. + +# Please create a custom env file to provide: +# cifmw_devscripts_ci_token: +# cifmw_devscripts_pull_secret: + +# Test Ceph file and object storage (block is enabled by default) +cifmw_ceph_daemons_layout: + rgw_enabled: true + dashboard_enabled: false + cephfs_enabled: true + ceph_nfs_enabled: true + +cifmw_deploy_obs: true From ee95beaa92c617a3ca1bb2fef72b90190698413c Mon Sep 17 00:00:00 2001 From: Sergii Golovatiuk Date: Tue, 5 May 2026 19:25:29 +0200 Subject: [PATCH 12/22] [libvirt_manager] Add predictable interface names Enable systemd predictable network interface naming inside guest VMs by removing net.ifnames=0 from kernel args via virt-customize. This gives guests consistent PCI-topology-based names (enp1s0, enp2s0, etc.) instead of legacy ethN naming. Predictable network interfaces are a requirement for testing Leapp upgrade functionality. Controlled by cifmw_libvirt_manager_predictable_nic_names (defaults to false).
Jira: OSPRH-29381 Co-Authored-By: Lukas Bezdicka Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Sergii Golovatiuk --- roles/libvirt_manager/defaults/main.yml | 1 + roles/libvirt_manager/tasks/create_vms.yml | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/roles/libvirt_manager/defaults/main.yml b/roles/libvirt_manager/defaults/main.yml index f1e9ba2e7..e71fbf9f6 100644 --- a/roles/libvirt_manager/defaults/main.yml +++ b/roles/libvirt_manager/defaults/main.yml @@ -84,6 +84,7 @@ cifmw_libvirt_manager_firewalld_zone_libvirt_forward: true cifmw_libvirt_manager_firewalld_default_zone: public cifmw_libvirt_manager_firewalld_default_zone_masquerade: true cifmw_libvirt_manager_attach_dummy_interface_on_bridges: true +cifmw_libvirt_manager_predictable_nic_names: false cifmw_libvirt_manager_extra_network_configuration: {} cifmw_libvirt_manager_vm_users: [] diff --git a/roles/libvirt_manager/tasks/create_vms.yml b/roles/libvirt_manager/tasks/create_vms.yml index a9cca3319..0bc5e8b8c 100644 --- a/roles/libvirt_manager/tasks/create_vms.yml +++ b/roles/libvirt_manager/tasks/create_vms.yml @@ -61,6 +61,14 @@ xml: "{{ lookup('template', cifmw_libvirt_manager_vm_template) }}" uri: "qemu:///system" + - name: "Disable net.ifnames=0 for {{ vm }}" + when: + - vm_data.disk_file_name != 'blank' + - cifmw_libvirt_manager_predictable_nic_names | default(false) | bool + - vm is match('^.*(compute).*$') + ansible.builtin.command: + cmd: "virt-customize -c qemu:///system --domain cifmw-{{ vm }} --run-command 'grubby --remove-args=net.ifnames=0 --update-kernel=ALL'" + - name: "Attach listed networks to the VMs {{ vm }}" vars: vm_item: "{{ vm }}" From 0c82f92a15425a9ded621498b6613a8469716ee6 Mon Sep 17 00:00:00 2001 From: Sergii Golovatiuk Date: Thu, 14 May 2026 20:03:56 +0200 Subject: [PATCH 13/22] [ansible] Add SSH keepalive to prevent connection drops during long tasks Long-running tasks like `oc adm wait-for-stable-cluster` after certificate rotation can cause SSH 
connections to be dropped by intermediate network devices (firewalls, NAT) due to inactivity. Add ServerAliveInterval and ServerAliveCountMax to keep the connection alive. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Sergii Golovatiuk --- ansible.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible.cfg b/ansible.cfg index 1ba690483..8101b03b5 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -20,4 +20,4 @@ pipelining = True any_errors_fatal = True jinja2_native = True [ssh_connection] -ssh_args = -o ControlMaster=auto -o ControlPersist=300 +ssh_args = -o ControlMaster=auto -o ControlPersist=300 -o ServerAliveInterval=30 -o ServerAliveCountMax=10 From 77aad817cf4dfcc383e04117acd2258544ade9e5 Mon Sep 17 00:00:00 2001 From: Roberto Alfieri Date: Fri, 15 May 2026 10:22:43 +0200 Subject: [PATCH 14/22] [cifmw_cephadm] Fix log path when post.yml delegates to localhost cifmw_cephadm_log_path uses ansible_user_dir which resolves to /root on compute nodes running with become: true. The post.yml and logs.yml tasks delegate_to: localhost to write log files, but on the controller the zuul user cannot create directories under /root. Use cifmw_basedir (always /home/zuul/ci-framework-data in CI) with a fallback to the original expression for non-CI contexts.
Related-Issue: ANVIL-109 Co-authored-by: Cursor Signed-off-by: Roberto Alfieri --- roles/cifmw_cephadm/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/cifmw_cephadm/defaults/main.yml b/roles/cifmw_cephadm/defaults/main.yml index f8c4aa8ae..2d516ed06 100644 --- a/roles/cifmw_cephadm/defaults/main.yml +++ b/roles/cifmw_cephadm/defaults/main.yml @@ -110,7 +110,7 @@ cifmw_cephadm_urischeme: "http" cifmw_cephadm_config_key_set_ssl_option: no_sslv2:sslv3:no_tlsv1:no_tlsv1_1 # Deployment logs variables cifmw_cephadm_log_dump: true -cifmw_cephadm_log_path: "{{ ansible_user_dir ~ '/ci-framework-data/logs/ceph' }}" +cifmw_cephadm_log_path: "{{ cifmw_basedir | default(ansible_user_dir ~ '/ci-framework-data') }}/logs/ceph" cifmw_cephadm_log_commands: # Get deployed Ceph daemons - type: "daemons" From f6f29fa0c771016cc13bf3616746b7c45b24ccba Mon Sep 17 00:00:00 2001 From: Michael Burke Date: Fri, 15 May 2026 13:58:49 -0400 Subject: [PATCH 15/22] [docs] Documentation fixes A couple of spelling and formatting errors weren't caught in their original PRs. This adds some words to the dictionary and fixes an error where an asterisk needed to be escaped.
Signed-off-by: Michael Burke --- docs/dictionary/en-custom.txt | 6 ++++++ plugins/modules/verify_pulled_report_crio.py | 2 +- tests/sanity/ignore.txt | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/dictionary/en-custom.txt b/docs/dictionary/en-custom.txt index 016199c88..62e4ef97f 100644 --- a/docs/dictionary/en-custom.txt +++ b/docs/dictionary/en-custom.txt @@ -1,3 +1,4 @@ +AES APIs Amartya AssignedTeam @@ -14,8 +15,11 @@ Idempotency LDAP LLM MachineConfig +Marjanovic +Nemanja NICs NodeHealthCheck +PyYAML RHCOS SNO Sinha @@ -24,6 +28,7 @@ ZipFile aaabbcc abcdefghij addr +aes afuscoar alertmanager amartyasinha @@ -385,6 +390,7 @@ namespace namespaces ncia ndczmditnzbhni +nemarjan netconfig netmask networkattachmentdefinition diff --git a/plugins/modules/verify_pulled_report_crio.py b/plugins/modules/verify_pulled_report_crio.py index 94751d89f..b2ccf4b7a 100644 --- a/plugins/modules/verify_pulled_report_crio.py +++ b/plugins/modules/verify_pulled_report_crio.py @@ -61,7 +61,7 @@ log_glob: description: Glob under I(log_dir). Used only when I(log_dir) is set. 
required: false - default: "*.crio.log" + default: "\\*.crio.log" type: str author: diff --git a/tests/sanity/ignore.txt b/tests/sanity/ignore.txt index f02918209..ac250167a 100644 --- a/tests/sanity/ignore.txt +++ b/tests/sanity/ignore.txt @@ -6,3 +6,4 @@ plugins/modules/cephx_key.py validate-modules:missing-gplv3-license # ignore lic plugins/modules/krb_request.py validate-modules:missing-gplv3-license # ignore license check plugins/modules/pem_read.py validate-modules:missing-gplv3-license # ignore license check plugins/modules/verify_pulled_report_crio.py validate-modules:missing-gplv3-license # ignore license check +plugins/modules/verify_pulled_report_crio.py validate-modules:doc-default-does-not-match-spec # account * escape in docs From 094f920c522e943522c2091003bf6e243b5deff9 Mon Sep 17 00:00:00 2001 From: Andrew Bays Date: Wed, 6 May 2026 09:39:13 -0400 Subject: [PATCH 16/22] [deploy_minio] Add MinIO deployment role Deploy MinIO as a lightweight S3-compatible object store for use as the Velero backup target in development and CI environments. Signed-off-by: Andrew Bays Signed-off-by: Martin Schuppert Co-Authored-By: Claude Opus 4.6 --- roles/deploy_minio/defaults/main.yml | 28 +++++ roles/deploy_minio/meta/main.yml | 8 ++ roles/deploy_minio/tasks/main.yml | 90 +++++++++++++++ roles/deploy_minio/templates/minio.yaml.j2 | 125 +++++++++++++++++++++ zuul.d/molecule.yaml | 9 ++ zuul.d/projects.yaml | 1 + 6 files changed, 261 insertions(+) create mode 100644 roles/deploy_minio/defaults/main.yml create mode 100644 roles/deploy_minio/meta/main.yml create mode 100644 roles/deploy_minio/tasks/main.yml create mode 100644 roles/deploy_minio/templates/minio.yaml.j2 diff --git a/roles/deploy_minio/defaults/main.yml b/roles/deploy_minio/defaults/main.yml new file mode 100644 index 000000000..4d2c48c63 --- /dev/null +++ b/roles/deploy_minio/defaults/main.yml @@ -0,0 +1,28 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# All variables intended for modification should be placed in this file. +# All variables within this role should have a prefix of "cifmw_deploy_minio" + +cifmw_deploy_minio_namespace: minio +cifmw_deploy_minio_storage_size: 10Gi +cifmw_deploy_minio_storage_class: "" +cifmw_deploy_minio_root_user: minio +cifmw_deploy_minio_root_password: minio123 +cifmw_deploy_minio_buckets: + - velero + - loki +cifmw_deploy_minio_image: quay.io/minio/minio:latest diff --git a/roles/deploy_minio/meta/main.yml b/roles/deploy_minio/meta/main.yml new file mode 100644 index 000000000..ea86e0532 --- /dev/null +++ b/roles/deploy_minio/meta/main.yml @@ -0,0 +1,8 @@ +--- +galaxy_info: + role_name: deploy_minio + namespace: cifmw + author: Red Hat + description: Deploy MinIO as S3-compatible storage backend + license: Apache-2.0 + min_ansible_version: "2.11" diff --git a/roles/deploy_minio/tasks/main.yml b/roles/deploy_minio/tasks/main.yml new file mode 100644 index 000000000..bb2355647 --- /dev/null +++ b/roles/deploy_minio/tasks/main.yml @@ -0,0 +1,90 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# Deploy MinIO +# +# Deploys MinIO as an S3-compatible storage backend. +# Creates namespace, PVC, Deployment, Service, Routes. +# Bucket is created via mkdir in the container command. +# +# Output facts: +# cifmw_deploy_minio_access_key: Root user (for OADP credentials) +# cifmw_deploy_minio_secret_key: Root password (for OADP credentials) + +- name: Create temp directory for rendered templates + ansible.builtin.tempfile: + state: directory + prefix: deploy-minio- + register: _deploy_minio_rendered_dir + +- name: Render MinIO manifests + ansible.builtin.template: + src: minio.yaml.j2 + dest: "{{ _deploy_minio_rendered_dir.path }}/minio.yaml" + mode: "0644" + +- name: Apply MinIO manifests + kubernetes.core.k8s: + src: "{{ _deploy_minio_rendered_dir.path }}/minio.yaml" + state: present + +- name: Wait for MinIO deployment to be ready + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: Deployment + name: minio + namespace: "{{ cifmw_deploy_minio_namespace }}" + wait: true + wait_timeout: 300 + wait_condition: + type: Available + status: "True" + +- name: Export credentials for downstream roles + ansible.builtin.set_fact: + cifmw_deploy_minio_access_key: "{{ cifmw_deploy_minio_root_user }}" + cifmw_deploy_minio_secret_key: "{{ cifmw_deploy_minio_root_password }}" + +- name: Get MinIO console route + kubernetes.core.k8s_info: + api_version: route.openshift.io/v1 + kind: Route + name: minio-console + namespace: "{{ cifmw_deploy_minio_namespace }}" + register: _minio_console_route + +- name: Get MinIO API route + kubernetes.core.k8s_info: + 
api_version: route.openshift.io/v1 + kind: Route + name: minio-api + namespace: "{{ cifmw_deploy_minio_namespace }}" + register: _minio_api_route + +- name: Print setup complete + ansible.builtin.debug: + msg: + - "========================================" + - "MinIO Setup Complete" + - "========================================" + - "Console: https://{{ _minio_console_route.resources[0].spec.host }}" + - "API: https://{{ _minio_api_route.resources[0].spec.host }}" + - "Buckets: {{ cifmw_deploy_minio_buckets | join(', ') }}" + +- name: Cleanup rendered templates + ansible.builtin.file: + path: "{{ _deploy_minio_rendered_dir.path }}" + state: absent diff --git a/roles/deploy_minio/templates/minio.yaml.j2 b/roles/deploy_minio/templates/minio.yaml.j2 new file mode 100644 index 000000000..cfbb5f55a --- /dev/null +++ b/roles/deploy_minio/templates/minio.yaml.j2 @@ -0,0 +1,125 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: {{ cifmw_deploy_minio_namespace }} + labels: + name: {{ cifmw_deploy_minio_namespace }} +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: minio-pvc + namespace: {{ cifmw_deploy_minio_namespace }} +spec: + accessModes: + - ReadWriteOnce +{% if cifmw_deploy_minio_storage_class %} + storageClassName: {{ cifmw_deploy_minio_storage_class }} +{% endif %} + resources: + requests: + storage: {{ cifmw_deploy_minio_storage_size }} +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: minio + namespace: {{ cifmw_deploy_minio_namespace }} +spec: + selector: + matchLabels: + app: minio + strategy: + type: Recreate + template: + metadata: + labels: + app: minio + spec: + containers: + - name: minio + image: {{ cifmw_deploy_minio_image }} + command: + - /bin/bash + - -c + - | +{% for bucket in cifmw_deploy_minio_buckets %} + mkdir -p /data/{{ bucket }} && \ +{% endfor %} + minio server /data --console-address :9001 + env: + - name: MINIO_ROOT_USER + value: "{{ cifmw_deploy_minio_root_user }}" + - name: MINIO_ROOT_PASSWORD + 
value: "{{ cifmw_deploy_minio_root_password }}" + ports: + - containerPort: 9000 + name: api + - containerPort: 9001 + name: console + volumeMounts: + - name: data + mountPath: /data + livenessProbe: + httpGet: + path: /minio/health/live + port: 9000 + initialDelaySeconds: 30 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /minio/health/ready + port: 9000 + initialDelaySeconds: 30 + periodSeconds: 20 + volumes: + - name: data + persistentVolumeClaim: + claimName: minio-pvc +--- +apiVersion: v1 +kind: Service +metadata: + name: minio + namespace: {{ cifmw_deploy_minio_namespace }} +spec: + selector: + app: minio + ports: + - name: api + port: 9000 + targetPort: 9000 + - name: console + port: 9001 + targetPort: 9001 +--- +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: minio-console + namespace: {{ cifmw_deploy_minio_namespace }} +spec: + to: + kind: Service + name: minio + port: + targetPort: console + tls: + termination: edge + insecureEdgeTerminationPolicy: Redirect +--- +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: minio-api + namespace: {{ cifmw_deploy_minio_namespace }} +spec: + to: + kind: Service + name: minio + port: + targetPort: api + tls: + termination: edge + insecureEdgeTerminationPolicy: Redirect diff --git a/zuul.d/molecule.yaml b/zuul.d/molecule.yaml index da963e698..d6ff17cf9 100644 --- a/zuul.d/molecule.yaml +++ b/zuul.d/molecule.yaml @@ -947,6 +947,15 @@ - ^.config/molecule/.* name: cifmw-molecule-deploy_loki parent: cifmw-molecule-noop +- job: + files: + - ^common-requirements.txt + - ^test-requirements.txt + - ^roles/deploy_minio/.* + - ^ci/playbooks/molecule.* + - ^.config/molecule/.* + name: cifmw-molecule-deploy_minio + parent: cifmw-molecule-noop - job: files: - ^common-requirements.txt diff --git a/zuul.d/projects.yaml b/zuul.d/projects.yaml index fe2a87ee5..e6e3a9ad9 100644 --- a/zuul.d/projects.yaml +++ b/zuul.d/projects.yaml @@ -44,6 +44,7 @@ - cifmw-molecule-copy_container - 
cifmw-molecule-deploy_bmh - cifmw-molecule-deploy_loki + - cifmw-molecule-deploy_minio - cifmw-molecule-devscripts - cifmw-molecule-discover_latest_image - cifmw-molecule-dlrn_promote From 062d4a64ee49fa3fdd8eda385d480865d4f2357f Mon Sep 17 00:00:00 2001 From: Andrew Bays Date: Wed, 6 May 2026 09:40:22 -0400 Subject: [PATCH 17/22] [openshift_adp] Add OADP operator installation role Install and configure the OADP (OpenShift API for Data Protection) operator with an S3-compatible storage backend, create the DataProtectionApplication CR, set up VolumeSnapshotClass for CSI snapshots, and verify the BackupStorageLocation is available. Signed-off-by: Andrew Bays Signed-off-by: Martin Schuppert Co-Authored-By: Claude Opus 4.6 --- docs/dictionary/en-custom.txt | 2 + roles/openshift_adp/defaults/main.yml | 33 +++ roles/openshift_adp/meta/main.yml | 8 + roles/openshift_adp/tasks/main.yml | 317 ++++++++++++++++++++++++++ zuul.d/molecule.yaml | 9 + zuul.d/projects.yaml | 1 + 6 files changed, 370 insertions(+) create mode 100644 roles/openshift_adp/defaults/main.yml create mode 100644 roles/openshift_adp/meta/main.yml create mode 100644 roles/openshift_adp/tasks/main.yml diff --git a/docs/dictionary/en-custom.txt b/docs/dictionary/en-custom.txt index 62e4ef97f..0e0becdd6 100644 --- a/docs/dictionary/en-custom.txt +++ b/docs/dictionary/en-custom.txt @@ -424,6 +424,7 @@ num nvme nwy nzgdh +OADP oauth observability oc @@ -641,6 +642,7 @@ vcpus vda venv vexxhost +Velero virbr virsh virt diff --git a/roles/openshift_adp/defaults/main.yml b/roles/openshift_adp/defaults/main.yml new file mode 100644 index 000000000..088c27a90 --- /dev/null +++ b/roles/openshift_adp/defaults/main.yml @@ -0,0 +1,33 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# All variables intended for modification should be placed in this file. +# All variables within this role should have a prefix of "cifmw_openshift_adp" + +# OADP operator +cifmw_openshift_adp_namespace: openshift-adp +cifmw_openshift_adp_channel: stable +cifmw_openshift_adp_enable_node_agent: true + +# S3 backend (MinIO or other S3-compatible) +cifmw_openshift_adp_s3_namespace: minio +cifmw_openshift_adp_s3_bucket: velero +cifmw_openshift_adp_s3_region: minio +cifmw_openshift_adp_s3_prefix: rhoso +cifmw_openshift_adp_s3_force_path_style: true +cifmw_openshift_adp_s3_insecure_skip_tls: true +# cifmw_openshift_adp_s3_access_key: REQUIRED +# cifmw_openshift_adp_s3_secret_key: REQUIRED diff --git a/roles/openshift_adp/meta/main.yml b/roles/openshift_adp/meta/main.yml new file mode 100644 index 000000000..551cdb974 --- /dev/null +++ b/roles/openshift_adp/meta/main.yml @@ -0,0 +1,8 @@ +--- +galaxy_info: + role_name: openshift_adp + namespace: cifmw + author: Red Hat + description: Install and configure OADP (OpenShift API for Data Protection) + license: Apache-2.0 + min_ansible_version: "2.11" diff --git a/roles/openshift_adp/tasks/main.yml b/roles/openshift_adp/tasks/main.yml new file mode 100644 index 000000000..b6c1e3d77 --- /dev/null +++ b/roles/openshift_adp/tasks/main.yml @@ -0,0 +1,317 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# OADP Setup +# +# Installs and configures OADP (OpenShift API for Data Protection) +# with an S3-compatible storage backend. + +- name: Verify S3 credentials are provided + ansible.builtin.fail: + msg: >- + cifmw_openshift_adp_s3_access_key and cifmw_openshift_adp_s3_secret_key + are required. Deploy an S3 backend first (e.g. deploy_minio role). + when: >- + cifmw_openshift_adp_s3_access_key is not defined or + cifmw_openshift_adp_s3_secret_key is not defined + +- name: Print setup header + ansible.builtin.debug: + msg: + - "========================================" + - "OADP Setup" + - "========================================" + - "OADP Namespace: {{ cifmw_openshift_adp_namespace }}" + - "OADP Channel: {{ cifmw_openshift_adp_channel }}" + - "S3 Namespace: {{ cifmw_openshift_adp_s3_namespace }}" + - "S3 Bucket: {{ cifmw_openshift_adp_s3_bucket }}" + - "Node Agent (Kopia): {{ cifmw_openshift_adp_enable_node_agent }}" + +- name: Create OADP namespace + kubernetes.core.k8s: + api_version: v1 + kind: Namespace + name: "{{ cifmw_openshift_adp_namespace }}" + state: present + +- name: Create OperatorGroup for OADP + kubernetes.core.k8s: + state: present + definition: + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + name: openshift-adp-operator-group + namespace: "{{ cifmw_openshift_adp_namespace }}" + spec: + targetNamespaces: + - "{{ cifmw_openshift_adp_namespace }}" + +- name: Create Subscription for OADP operator + kubernetes.core.k8s: + state: present + definition: + apiVersion: operators.coreos.com/v1alpha1 + kind: Subscription 
+ metadata: + name: redhat-oadp-operator + namespace: "{{ cifmw_openshift_adp_namespace }}" + spec: + channel: "{{ cifmw_openshift_adp_channel }}" + installPlanApproval: Automatic + name: redhat-oadp-operator + source: redhat-operators + sourceNamespace: openshift-marketplace + +- name: Wait for OADP operator to be ready + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + namespace: "{{ cifmw_openshift_adp_namespace }}" + label_selectors: + - control-plane=controller-manager + wait: true + wait_timeout: 300 + wait_condition: + type: Ready + status: "True" + register: _operator_wait + retries: 30 + delay: 10 + until: _operator_wait.resources | length > 0 + +- name: Create cloud credentials secret + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: Secret + metadata: + name: cloud-credentials + namespace: "{{ cifmw_openshift_adp_namespace }}" + type: Opaque + stringData: + cloud: | + [default] + aws_access_key_id={{ cifmw_openshift_adp_s3_access_key }} + aws_secret_access_key={{ cifmw_openshift_adp_s3_secret_key }} + no_log: true + +- name: Get S3 API route + kubernetes.core.k8s_info: + api_version: route.openshift.io/v1 + kind: Route + name: minio-api + namespace: "{{ cifmw_openshift_adp_s3_namespace }}" + register: _s3_api_route + +- name: Create DataProtectionApplication + kubernetes.core.k8s: + state: present + definition: + apiVersion: oadp.openshift.io/v1alpha1 + kind: DataProtectionApplication + metadata: + name: velero + namespace: "{{ cifmw_openshift_adp_namespace }}" + spec: + configuration: + velero: + defaultPlugins: + - openshift + - aws + - csi + nodeAgent: + enable: "{{ cifmw_openshift_adp_enable_node_agent | bool }}" + uploaderType: kopia + backupLocations: + - velero: + provider: aws + default: true + objectStorage: + bucket: "{{ cifmw_openshift_adp_s3_bucket }}" + prefix: "{{ cifmw_openshift_adp_s3_prefix }}" + config: + region: "{{ cifmw_openshift_adp_s3_region }}" + s3ForcePathStyle: "{{ 
cifmw_openshift_adp_s3_force_path_style | lower }}" + s3Url: "https://{{ _s3_api_route.resources[0].spec.host }}" + insecureSkipTLSVerify: "{{ cifmw_openshift_adp_s3_insecure_skip_tls | lower }}" + credential: + name: cloud-credentials + key: cloud + +- name: Wait for Velero pod to be ready + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + namespace: "{{ cifmw_openshift_adp_namespace }}" + label_selectors: + - app.kubernetes.io/name=velero + wait: true + wait_timeout: 300 + wait_condition: + type: Ready + status: "True" + register: _velero_wait + retries: 30 + delay: 10 + until: _velero_wait.resources | length > 0 + +- name: Wait for node-agent pods to be ready + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + namespace: "{{ cifmw_openshift_adp_namespace }}" + label_selectors: + - app.kubernetes.io/name=node-agent + wait: true + wait_timeout: 300 + wait_condition: + type: Ready + status: "True" + when: cifmw_openshift_adp_enable_node_agent | bool + +- name: Get OADP pods + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + namespace: "{{ cifmw_openshift_adp_namespace }}" + register: _oadp_pods + +- name: Display OADP pods + ansible.builtin.debug: + msg: "{{ item.metadata.name }} - {{ item.status.phase }}" + loop: "{{ _oadp_pods.resources }}" + loop_control: + label: "{{ item.metadata.name }}" + +# ======================================== +# VolumeSnapshotClass for CSI snapshots +# ======================================== +- name: Check for existing VolumeSnapshotClass + kubernetes.core.k8s_info: + api_version: snapshot.storage.k8s.io/v1 + kind: VolumeSnapshotClass + register: _vsc_list + +- name: Create VolumeSnapshotClass for OADP (TopoLVM/LVMS) + kubernetes.core.k8s: + state: present + definition: + apiVersion: snapshot.storage.k8s.io/v1 + kind: VolumeSnapshotClass + metadata: + name: lvms-velero + labels: + velero.io/csi-volumesnapshot-class: "true" + annotations: + snapshot.storage.kubernetes.io/is-default-class: "false" + driver: 
topolvm.io + deletionPolicy: Retain + when: >- + _vsc_list.resources | + selectattr('driver', 'equalto', 'topolvm.io') | + list | length > 0 + +- name: Print VolumeSnapshotClass status (TopoLVM found) + ansible.builtin.debug: + msg: "VolumeSnapshotClass 'lvms-velero' created for OADP CSI snapshots" + when: >- + _vsc_list.resources | + selectattr('driver', 'equalto', 'topolvm.io') | + list | length > 0 + +- name: Print VolumeSnapshotClass status (no TopoLVM) + ansible.builtin.debug: + msg: "No TopoLVM driver found. If using a different CSI driver, manually create a VolumeSnapshotClass with velero.io/csi-volumesnapshot-class=true label." + when: >- + _vsc_list.resources | + selectattr('driver', 'equalto', 'topolvm.io') | + list | length == 0 + +# ======================================== +# Verify BackupStorageLocation +# ======================================== +- name: Wait for BackupStorageLocation to be available + kubernetes.core.k8s_info: + api_version: velero.io/v1 + kind: BackupStorageLocation + namespace: "{{ cifmw_openshift_adp_namespace }}" + register: _bsl_status + retries: 30 + delay: 10 + until: + - _bsl_status.resources | length > 0 + - (_bsl_status.resources[0].status.phase | default('')) == 'Available' + ignore_errors: true + +- name: Get BackupStorageLocation details + kubernetes.core.k8s_info: + api_version: velero.io/v1 + kind: BackupStorageLocation + namespace: "{{ cifmw_openshift_adp_namespace }}" + register: _bsl_output + +- name: Display BackupStorageLocation status + ansible.builtin.debug: + msg: >- + {{ item.metadata.name }} - Phase: {{ item.status.phase | default('Unknown') }} + loop: "{{ _bsl_output.resources }}" + loop_control: + label: "{{ item.metadata.name }}" + +- name: Get troubleshooting info if BSL not available + ansible.builtin.shell: | + echo "=== BackupStorageLocation ===" + oc get backupstoragelocation -n {{ cifmw_openshift_adp_namespace }} -o yaml + echo "" + echo "=== Velero Logs (last 50 lines) ===" + oc logs -n {{ 
cifmw_openshift_adp_namespace }} deployment/velero --tail=50 + register: _bsl_debug + changed_when: false + when: _bsl_status is failed + +- name: Display troubleshooting info + ansible.builtin.debug: + msg: "{{ _bsl_debug.stdout_lines }}" + when: _bsl_status is failed + +- name: Print success summary + ansible.builtin.debug: + msg: + - "========================================" + - "OADP Setup Complete" + - "========================================" + - "" + - "OADP Namespace: {{ cifmw_openshift_adp_namespace }}" + - "S3 API: https://{{ _s3_api_route.resources[0].spec.host }}" + - "Bucket: {{ cifmw_openshift_adp_s3_bucket }}" + - "BackupStorageLocation: Available" + when: _bsl_status is not failed + +- name: Print warning summary + ansible.builtin.debug: + msg: + - "========================================" + - "OADP Setup Complete with Warnings" + - "========================================" + - "" + - "BackupStorageLocation is not yet available." + - "" + - "Troubleshoot:" + - " oc get backupstoragelocation -n {{ cifmw_openshift_adp_namespace }} -o yaml" + - " oc logs -n {{ cifmw_openshift_adp_namespace }} deployment/velero" + when: _bsl_status is failed diff --git a/zuul.d/molecule.yaml b/zuul.d/molecule.yaml index d6ff17cf9..9f44f4685 100644 --- a/zuul.d/molecule.yaml +++ b/zuul.d/molecule.yaml @@ -1028,6 +1028,15 @@ - ^.config/molecule/.* name: cifmw-molecule-openshift_adm parent: cifmw-molecule-noop +- job: + files: + - ^common-requirements.txt + - ^test-requirements.txt + - ^roles/openshift_adp/.* + - ^ci/playbooks/molecule.* + - ^.config/molecule/.* + name: cifmw-molecule-openshift_adp + parent: cifmw-molecule-noop - job: files: - ^common-requirements.txt diff --git a/zuul.d/projects.yaml b/zuul.d/projects.yaml index e6e3a9ad9..6ea6ca035 100644 --- a/zuul.d/projects.yaml +++ b/zuul.d/projects.yaml @@ -76,6 +76,7 @@ - cifmw-molecule-nat64_appliance - cifmw-molecule-networking_mapper - cifmw-molecule-openshift_adm + - cifmw-molecule-openshift_adp - 
cifmw-molecule-openshift_login - cifmw-molecule-openshift_obs - cifmw-molecule-openshift_provisioner_node From 968c8c06eed2a682c6d2e0f2acb784768bf5f36c Mon Sep 17 00:00:00 2001 From: Andrew Bays Date: Wed, 6 May 2026 09:40:40 -0400 Subject: [PATCH 18/22] [cifmw_backup_restore] Add backup/restore orchestration role Orchestrate backup, restore, and cleanup of OpenStack control plane and data plane resources, including Galera database dumps, Velero CSI volume snapshots, and ordered multi-phase restore sequences. Also adds playbooks (backup_restore.yaml) and integrates backup and restore into the post-deployment pipeline. Signed-off-by: Andrew Bays Signed-off-by: Martin Schuppert Co-Authored-By: Claude Opus 4.6 Signed-off-by: Martin Schuppert --- docs/dictionary/en-custom.txt | 5 + playbooks/backup_restore.yaml | 68 +++ post-deployment.yml | 12 + roles/cifmw_backup_restore/README.md | 104 ++++ roles/cifmw_backup_restore/defaults/main.yml | 76 +++ roles/cifmw_backup_restore/meta/main.yml | 31 + .../tasks/_delete_all_of_kind.yml | 49 ++ roles/cifmw_backup_restore/tasks/backup.yml | 320 +++++++++++ roles/cifmw_backup_restore/tasks/cleanup.yml | 330 +++++++++++ roles/cifmw_backup_restore/tasks/e2e.yml | 254 +++++++++ roles/cifmw_backup_restore/tasks/main.yml | 25 + .../tasks/ovn_db_backup.yml | 83 +++ .../tasks/ovn_db_restore.yml | 124 ++++ roles/cifmw_backup_restore/tasks/restore.yml | 535 ++++++++++++++++++ .../tasks/restore_pin_pvcs.yml | 114 ++++ .../tasks/setup_galerabackup.yml | 88 +++ .../tasks/wait_for_restore.yml | 79 +++ .../00-resource-modifiers-configmap.yaml.j2 | 39 ++ .../01-restore-order-00-pvcs.yaml.j2 | 17 + .../02-restore-order-10-foundation.yaml.j2 | 19 + ...03-restore-order-20-infrastructure.yaml.j2 | 20 + .../04-restore-order-30-controlplane.yaml.j2 | 19 + .../05-restore-order-40-backup-config.yaml.j2 | 18 + .../templates/06a-galerarestore.yaml.j2 | 14 + .../07-restore-order-60-dataplane.yaml.j2 | 18 + .../templates/08-edpm-deployment.yaml.j2 | 12 + 
.../templates/backup-pvcs.yaml.j2 | 48 ++ .../templates/backup-resources.yaml.j2 | 20 + .../templates/galerabackup.yaml.j2 | 13 + zuul.d/molecule.yaml | 9 + zuul.d/projects.yaml | 1 + 31 files changed, 2564 insertions(+) create mode 100644 playbooks/backup_restore.yaml create mode 100644 roles/cifmw_backup_restore/README.md create mode 100644 roles/cifmw_backup_restore/defaults/main.yml create mode 100644 roles/cifmw_backup_restore/meta/main.yml create mode 100644 roles/cifmw_backup_restore/tasks/_delete_all_of_kind.yml create mode 100644 roles/cifmw_backup_restore/tasks/backup.yml create mode 100644 roles/cifmw_backup_restore/tasks/cleanup.yml create mode 100644 roles/cifmw_backup_restore/tasks/e2e.yml create mode 100644 roles/cifmw_backup_restore/tasks/main.yml create mode 100644 roles/cifmw_backup_restore/tasks/ovn_db_backup.yml create mode 100644 roles/cifmw_backup_restore/tasks/ovn_db_restore.yml create mode 100644 roles/cifmw_backup_restore/tasks/restore.yml create mode 100644 roles/cifmw_backup_restore/tasks/restore_pin_pvcs.yml create mode 100644 roles/cifmw_backup_restore/tasks/setup_galerabackup.yml create mode 100644 roles/cifmw_backup_restore/tasks/wait_for_restore.yml create mode 100644 roles/cifmw_backup_restore/templates/00-resource-modifiers-configmap.yaml.j2 create mode 100644 roles/cifmw_backup_restore/templates/01-restore-order-00-pvcs.yaml.j2 create mode 100644 roles/cifmw_backup_restore/templates/02-restore-order-10-foundation.yaml.j2 create mode 100644 roles/cifmw_backup_restore/templates/03-restore-order-20-infrastructure.yaml.j2 create mode 100644 roles/cifmw_backup_restore/templates/04-restore-order-30-controlplane.yaml.j2 create mode 100644 roles/cifmw_backup_restore/templates/05-restore-order-40-backup-config.yaml.j2 create mode 100644 roles/cifmw_backup_restore/templates/06a-galerarestore.yaml.j2 create mode 100644 roles/cifmw_backup_restore/templates/07-restore-order-60-dataplane.yaml.j2 create mode 100644 
roles/cifmw_backup_restore/templates/08-edpm-deployment.yaml.j2 create mode 100644 roles/cifmw_backup_restore/templates/backup-pvcs.yaml.j2 create mode 100644 roles/cifmw_backup_restore/templates/backup-resources.yaml.j2 create mode 100644 roles/cifmw_backup_restore/templates/galerabackup.yaml.j2 diff --git a/docs/dictionary/en-custom.txt b/docs/dictionary/en-custom.txt index 0e0becdd6..2c2b0ef10 100644 --- a/docs/dictionary/en-custom.txt +++ b/docs/dictionary/en-custom.txt @@ -232,6 +232,7 @@ fsid fultonj fusco fwcybtb +Galera gapped genericcloud genindex @@ -504,6 +505,8 @@ psathyan pubkey publicdomain pullsecret +PVC +PVCs pvs pwd pxe @@ -580,6 +583,7 @@ sso stateful stderr stdout +StorageClass stp str stricthostkeychecking @@ -667,6 +671,7 @@ vvvv vxlan vynxgdagahaac vzcg +WaitForFirstConsumer websso wget whitebox diff --git a/playbooks/backup_restore.yaml b/playbooks/backup_restore.yaml new file mode 100644 index 000000000..1214ab1a4 --- /dev/null +++ b/playbooks/backup_restore.yaml @@ -0,0 +1,68 @@ +--- +# End-to-end backup/restore test playbook +# +# Aligns with the openstack-k8s-operators backup-restore user guide (Galera, +# optional OVN NB/SB on PVC, OADP, ordered restore, Neutron–OVN sync post-EDPM). +# +# Used standalone or from post-deployment.yml (gated by +# cifmw_run_backup_restore_test). Logic lives in +# roles/cifmw_backup_restore/tasks/e2e.yml; variables are in the role defaults. +# +# Each step can be enabled/disabled independently for iterative testing. 
+# +# Prerequisites: +# - OpenStack control plane deployed and healthy +# - OpenStackBackupConfig CR created (for backup labeling) +# - For manual testing on a reproducer, run post_deployment.sh first: +# ./post_deployment.sh -e zuul_log_collection=true \ +# -e cifmw_nolog=false -e cifmw_run_tests=false +# +# Manual usage (reproducer): +# COMMON_ARGS="-i ~/ci-framework-data/artifacts/zuul_inventory.yml \ +# -e @~/ci-framework-data/parameters/reproducer-variables.yml \ +# -e @~/ci-framework-data/parameters/openshift-environment.yml" +# +# # Full run (with test workload): +# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \ +# -e cifmw_backup_restore_create_workload=true +# +# # Full run (without workload): +# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml +# +# # Install deps only: +# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \ +# -e cifmw_backup_restore_run_backup=false \ +# -e cifmw_backup_restore_run_cleanup=false \ +# -e cifmw_backup_restore_run_restore=false +# +# # Backup only (deps already installed): +# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \ +# -e cifmw_backup_restore_install_deps=false \ +# -e cifmw_backup_restore_run_cleanup=false \ +# -e cifmw_backup_restore_run_restore=false +# +# # Cleanup + restore (backup already done): +# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \ +# -e cifmw_backup_restore_install_deps=false \ +# -e cifmw_backup_restore_run_backup=false \ +# -e cifmw_backup_restore_backup_timestamp=20260323-144546 +# +# # Restore only (cleanup already done): +# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \ +# -e cifmw_backup_restore_install_deps=false \ +# -e cifmw_backup_restore_run_backup=false \ +# -e cifmw_backup_restore_run_cleanup=false \ +# -e cifmw_backup_restore_backup_timestamp=20260323-144546 +# +# # With PVC pinning (WaitForFirstConsumer storage): +# ansible-playbook $COMMON_ARGS playbooks/backup_restore.yaml \ +# -e 
cifmw_backup_restore_pin_pvcs=true + +- name: Backup and Restore end-to-end test + hosts: "{{ cifmw_target_host | default('localhost') }}" + gather_facts: true + tasks: + - name: Run backup/restore end-to-end orchestration + ansible.builtin.import_role: + name: cifmw_backup_restore + tasks_from: e2e.yml diff --git a/post-deployment.yml b/post-deployment.yml index 746d3eaa7..b542532b2 100644 --- a/post-deployment.yml +++ b/post-deployment.yml @@ -73,6 +73,18 @@ tags: - compliance +- name: Run backup and restore test + hosts: "{{ cifmw_target_host | default('localhost') }}" + gather_facts: true + tasks: + - name: Run backup/restore end-to-end orchestration + ansible.builtin.import_role: + name: cifmw_backup_restore + tasks_from: e2e.yml + when: cifmw_run_backup_restore_test | default(false) | bool + tags: + - backup-restore + - name: Run hooks and inject status flag hosts: "{{ cifmw_target_host | default('localhost') }}" gather_facts: true diff --git a/roles/cifmw_backup_restore/README.md b/roles/cifmw_backup_restore/README.md new file mode 100644 index 000000000..a7ccce392 --- /dev/null +++ b/roles/cifmw_backup_restore/README.md @@ -0,0 +1,104 @@ +# cifmw_backup_restore + +Automate OpenStack on OpenShift backup and restore operations using OADP +(OpenShift API for Data Protection) and Velero. The role supports three +actions: **backup**, **restore**, and **cleanup**. + +- **backup** — creates Galera database dumps, optionally backs up OVN NB/SB + databases onto their PVCs, then creates Velero backups of labeled PVCs + (via CSI snapshots) and cluster resources. +- **restore** — performs an ordered Velero restore sequence (PVCs, + foundation, infrastructure, control plane, Galera, optional OVN file restore, + full control plane resume, dataplane, EDPM), then Neutron–OVN verification and + sync (**log** mode, then **repair**, matching the backup-restore user guide Step 12). 
+- **cleanup** — tears down dataplane and control-plane resources so the + namespace is ready for a fresh restore. + +## Privilege escalation + +None. All cluster operations are performed through `oc` and the `kubernetes.core` +modules against the target OpenShift cluster. + +## Parameters + +### Common + +* `cifmw_backup_restore_action`: (String) Action to perform. Must be one of `backup`, `restore`, or `cleanup`. Defaults to `""` (role will fail if unset). +* `cifmw_backup_restore_namespace`: (String) Target OpenStack namespace. Defaults to the value of `cifmw_openstack_namespace` when it is set, otherwise `openstack`. +* `cifmw_backup_restore_oadp_namespace`: (String) Namespace where Velero/OADP is running. Defaults to `openshift-adp`. +* `cifmw_backup_restore_auto_ack`: (Boolean) Skip interactive pause prompts when `true`. Defaults to `false`. +* `cifmw_backup_restore_ovn_db`: (Boolean) When `true` (default), the **backup** path labels OVN NB/SB PVCs and runs `ovsdb-client` backup before the OADP PVC backup, and the **restore** path runs OVN NB/SB file restore after Galera (when timestamped files exist on the PVC) before resuming the full control plane. Set to `false` to skip both; post-EDPM `neutron-ovn-db-sync` still runs when OVN files were not backed up. +* `cifmw_backup_restore_ovn_db_ready_timeout`: (String) Timeout for `oc wait` on OVN database pods during OVN backup/restore. Defaults to `5m`. + +### Backup + +* `cifmw_backup_restore_galera_backup_timeout`: (String) Timeout for `oc wait` on Galera backup jobs. Defaults to `10m`. +* `cifmw_backup_restore_galera_storage_class`: (String) StorageClass for Galera backup PVCs. Empty string uses the cluster default. Defaults to `""`. +* `cifmw_backup_restore_galera_storage_request`: (String) Size of the Galera backup PVC. Defaults to `5Gi`. +* `cifmw_backup_restore_galera_transfer_storage_request`: (String) Size of the Galera transfer storage PVC. Defaults to `5Gi`. +* `cifmw_backup_restore_oadp_backup_timeout`: (String) Timeout for OADP PVC and resource backup completion. Must be expressed in minutes (e.g. `30m`): the role keeps only the digits and treats them as minutes when computing the polling retries. Defaults to `30m`.
+* `cifmw_backup_restore_storage_location`: (String) Velero `BackupStorageLocation` name. Defaults to `velero-1`. +* `cifmw_backup_restore_backup_ttl`: (String) TTL for Velero backups. Defaults to `720h`. +* `cifmw_backup_restore_snapshot_move_data`: (Boolean) Enable Velero snapshot data mover. When `true`, cleanup also deletes labeled PVCs. Defaults to `true`. + +### Restore + +* `cifmw_backup_restore_backup_timestamp`: (String) Timestamp suffix that identifies the backup to restore (e.g. `20260311-081234`). **Required** when `cifmw_backup_restore_action` is `restore`. +* `cifmw_backup_restore_restore_timeout`: (Integer) Seconds to wait for each Velero Restore to reach a terminal phase. Defaults to `900`. +* `cifmw_backup_restore_infra_ready_timeout`: (String) Timeout for `oc wait` on `OpenStackControlPlaneInfrastructureReady`. Defaults to `20m`. +* `cifmw_backup_restore_ctlplane_ready_timeout`: (String) Timeout for `oc wait` on control plane `Ready` after removing the deployment-stage annotation. Defaults to `10m`. +* `cifmw_backup_restore_strict_restore`: (Boolean) Fail on Velero `PartiallyFailed` status when `true`; only warn when `false`. Defaults to `true`. +* `cifmw_backup_restore_restore_content`: (String) Content flag passed to `restore_galera` (`--content`). Defaults to `data`. +* `cifmw_backup_restore_edpm_deploy_timeout`: (String) Timeout for `oc wait` on the post-restore EDPM deployment. Defaults to `40m`. +* `cifmw_backup_restore_pin_pvcs`: (Boolean) Enable PVC-to-node pinning during restore for WaitForFirstConsumer storage classes. Defaults to `false`. +* Post-EDPM **Neutron–OVN** steps follow [user guide Step 12](https://github.com/openstack-k8s-operators/dev-docs/blob/main/backup-restore/user-guide.md#step-12-verify-and-sync-neutron-to-ovn): run `neutron-ovn-db-sync-util` in `log` mode first (`neutron-dist.conf`, `neutron.conf`, `neutron.conf.d`). 
**Repair** runs if `cifmw_backup_restore_ovn_db` is `false` (no OVN NB/SB file backup was taken), or if log-mode stdout/stderr contains a `WARNING` line—Neutron reports drift that way while still exiting 0. If OVN file backup/restore was enabled and log output has no `WARNING` lines, repair is skipped as redundant. + +### Cleanup + +* `cifmw_backup_restore_cleanup_ctlplane`: (Boolean) Delete control-plane resources during cleanup. Defaults to `true`. +* `cifmw_backup_restore_cleanup_dataplane`: (Boolean) Delete dataplane resources during cleanup. Defaults to `true`. + +## Examples + +### Running a backup + +```YAML +- hosts: localhost + tasks: + - name: Backup OpenStack + ansible.builtin.include_role: + name: cifmw_backup_restore + vars: + cifmw_backup_restore_action: backup + cifmw_backup_restore_namespace: openstack + cifmw_backup_restore_auto_ack: true +``` + +### Restoring from a backup + +```YAML +- hosts: localhost + tasks: + - name: Restore OpenStack + ansible.builtin.include_role: + name: cifmw_backup_restore + vars: + cifmw_backup_restore_action: restore + cifmw_backup_restore_backup_timestamp: "20260311-081234" + cifmw_backup_restore_auto_ack: true +``` + +### Cleaning up before a restore + +```YAML +- hosts: localhost + tasks: + - name: Cleanup namespace + ansible.builtin.include_role: + name: cifmw_backup_restore + vars: + cifmw_backup_restore_action: cleanup + cifmw_backup_restore_auto_ack: true + cifmw_backup_restore_cleanup_ctlplane: true + cifmw_backup_restore_cleanup_dataplane: true +``` diff --git a/roles/cifmw_backup_restore/defaults/main.yml b/roles/cifmw_backup_restore/defaults/main.yml new file mode 100644 index 000000000..91493fb82 --- /dev/null +++ b/roles/cifmw_backup_restore/defaults/main.yml @@ -0,0 +1,76 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + + +# All variables intended for modification should be placed in this file. +# All variables within this role should have a prefix of "cifmw_backup_restore" + +# Action to perform: backup, restore, or cleanup +cifmw_backup_restore_action: "" + +# Common +cifmw_backup_restore_namespace: "{{ cifmw_openstack_namespace | default('openstack') }}" +cifmw_backup_restore_oadp_namespace: openshift-adp +cifmw_backup_restore_auto_ack: false + +# End-to-end orchestration (tasks/e2e.yml; invoked from post-deployment or playbooks/backup_restore.yaml) +cifmw_backup_restore_install_deps: true +cifmw_backup_restore_create_workload: true +cifmw_backup_restore_run_backup: true +cifmw_backup_restore_run_cleanup: true +cifmw_backup_restore_run_restore: true +cifmw_backup_restore_run_post_tempest: false + +# Passthrough to update role when creating the test workload (prefix matches update role, not this role) +cifmw_update_ping_test: true +cifmw_update_control_plane_check: false +cifmw_update_artifacts_basedir_suffix: "tests/update" +cifmw_update_artifacts_basedir: "{{ ansible_user_dir }}/ci-framework-data/{{ cifmw_update_artifacts_basedir_suffix }}" +cifmw_update_workload_launch_script: "{{ cifmw_update_artifacts_basedir }}/workload_launch.sh" +cifmw_update_timestamper_cmd: >- + | awk '{ print strftime("%Y-%m-%d %H:%M:%S |"), $0; fflush(); }' +cifmw_update_ping_start_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_start_ping.sh" +cifmw_update_ping_stop_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_stop_ping.sh" +cifmw_update_namespace: "{{ 
cifmw_backup_restore_namespace }}" + +# Backup +cifmw_backup_restore_galera_backup_timeout: 10m +cifmw_backup_restore_galera_storage_class: "" +cifmw_backup_restore_galera_storage_request: 5Gi +cifmw_backup_restore_galera_transfer_storage_request: 5Gi +cifmw_backup_restore_oadp_backup_timeout: 30m +cifmw_backup_restore_storage_location: velero-1 +cifmw_backup_restore_backup_ttl: 720h +cifmw_backup_restore_snapshot_move_data: true +cifmw_backup_restore_swift_xattr_timeout: 600s + +# OVN NB/SB database files on PVCs (user-guide backup Step 3 / restore Step 8) +cifmw_backup_restore_ovn_db: true +cifmw_backup_restore_ovn_db_ready_timeout: 5m + +# Restore +# cifmw_backup_restore_backup_timestamp: REQUIRED for restore (e.g., 20260311-081234) +cifmw_backup_restore_restore_timeout: 900 +cifmw_backup_restore_edpm_deploy_timeout: 40m +cifmw_backup_restore_infra_ready_timeout: 20m +cifmw_backup_restore_ctlplane_ready_timeout: 10m +cifmw_backup_restore_strict_restore: true +cifmw_backup_restore_restore_content: data +cifmw_backup_restore_pin_pvcs: false + +# Cleanup +cifmw_backup_restore_cleanup_ctlplane: true +cifmw_backup_restore_cleanup_dataplane: true diff --git a/roles/cifmw_backup_restore/meta/main.yml b/roles/cifmw_backup_restore/meta/main.yml new file mode 100644 index 000000000..3a28894d9 --- /dev/null +++ b/roles/cifmw_backup_restore/meta/main.yml @@ -0,0 +1,31 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ + +galaxy_info: + author: CI Framework + description: CI Framework Role -- OpenStack Backup and Restore + company: Red Hat + license: Apache-2.0 + min_ansible_version: "2.14" + namespace: cifmw + galaxy_tags: + - cifmw + - openstack + - backup + - restore + +dependencies: [] diff --git a/roles/cifmw_backup_restore/tasks/_delete_all_of_kind.yml b/roles/cifmw_backup_restore/tasks/_delete_all_of_kind.yml new file mode 100644 index 000000000..27d9ba986 --- /dev/null +++ b/roles/cifmw_backup_restore/tasks/_delete_all_of_kind.yml @@ -0,0 +1,49 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# Helper: delete all resources of a given kind in the backup/restore namespace. +# +# Required variables: +# _resource_api_version - e.g. "core.openstack.org/v1beta1" +# _resource_kind - e.g. 
"OpenStackControlPlane" +# +# Optional variables: +# _resource_label_selectors - label selectors list (default: omitted) +# _resource_wait - wait for deletion (default: false) +# _resource_wait_timeout - wait timeout in seconds (default: 120) + +- name: "List resources - {{ _resource_kind }}" + kubernetes.core.k8s_info: + api_version: "{{ _resource_api_version }}" + kind: "{{ _resource_kind }}" + namespace: "{{ cifmw_backup_restore_namespace }}" + label_selectors: "{{ _resource_label_selectors | default(omit) }}" + register: _resources_to_delete + failed_when: false + +- name: "Delete resources - {{ _resource_kind }}" + kubernetes.core.k8s: + api_version: "{{ _resource_api_version }}" + kind: "{{ _resource_kind }}" + namespace: "{{ cifmw_backup_restore_namespace }}" + name: "{{ item.metadata.name }}" + state: absent + wait: "{{ _resource_wait | default(false) | bool }}" + wait_timeout: "{{ _resource_wait_timeout | default(120) }}" + loop: "{{ _resources_to_delete.resources | default([]) }}" + loop_control: + label: "{{ item.metadata.name }}" + failed_when: false diff --git a/roles/cifmw_backup_restore/tasks/backup.yml b/roles/cifmw_backup_restore/tasks/backup.yml new file mode 100644 index 000000000..333a2d66c --- /dev/null +++ b/roles/cifmw_backup_restore/tasks/backup.yml @@ -0,0 +1,320 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# OpenStack Full Backup +# +# 1. 
Trigger Galera database dumps (creates fresh DB dumps on PVCs) +# 2. Optional: label OVN PVCs and ovsdb-client backup (NB/SB) onto PVCs +# 3. OADP PVC backup (CSI snapshots of labeled PVCs) +# 4. OADP resources backup (CRs, Secrets, ConfigMaps, NADs, etc.) + +# ======================================== +# Pre-flight checks +# ======================================== +- name: Verify OADP operator is installed + ansible.builtin.shell: | + oc get crd backups.velero.io -o name + register: oadp_crd_check + changed_when: false + failed_when: false + +- name: Fail if OADP is not installed + ansible.builtin.fail: + msg: | + OADP operator is not installed (Velero Backup CRD not found). + Install OADP first. + when: oadp_crd_check.rc != 0 + +- name: Verify VolumeSnapshotClass with Velero label exists + ansible.builtin.shell: | + oc get volumesnapshotclass -l velero.io/csi-volumesnapshot-class=true -o name + register: vsc_check + changed_when: false + failed_when: false + +- name: Fail if no VolumeSnapshotClass with Velero label + ansible.builtin.fail: + msg: | + No VolumeSnapshotClass found with label velero.io/csi-volumesnapshot-class=true. + Label your VolumeSnapshotClass for Velero CSI snapshots. + when: vsc_check.rc != 0 or vsc_check.stdout == "" + +- name: Verify OpenStackBackupConfig exists + ansible.builtin.shell: | + oc get openstackbackupconfig -n {{ cifmw_backup_restore_namespace }} -o jsonpath='{.items[0].metadata.name}' + register: _backupconfig_check + changed_when: false + failed_when: false + +- name: Fail if OpenStackBackupConfig is not deployed + ansible.builtin.fail: + msg: | + No OpenStackBackupConfig CR found in namespace {{ cifmw_backup_restore_namespace }}. + The BackupConfig controller labels secrets, configmaps, and other resources + for backup/restore. Without it, user-provided resources (e.g. osp-secret) + will not be restored. + Create an OpenStackBackupConfig CR before running backup. 
+ when: _backupconfig_check.rc != 0 or _backupconfig_check.stdout == "" + +- name: Verify resources are labeled for restore + ansible.builtin.shell: | + set -o pipefail + SECRET_COUNT=$(oc get secret -n {{ cifmw_backup_restore_namespace }} -l backup.openstack.org/restore=true --no-headers 2>/dev/null | wc -l) + CM_COUNT=$(oc get configmap -n {{ cifmw_backup_restore_namespace }} -l backup.openstack.org/restore=true --no-headers 2>/dev/null | wc -l) + echo "Labeled secrets: ${SECRET_COUNT}" + echo "Labeled configmaps: ${CM_COUNT}" + if [ "${SECRET_COUNT}" -eq 0 ]; then + echo "WARNING: No secrets labeled for restore" >&2 + exit 1 + fi + register: _labeled_resources_check + changed_when: false + failed_when: false + +- name: Display labeled resource counts + ansible.builtin.debug: + msg: "{{ _labeled_resources_check.stdout_lines }}" + +- name: Fail if no resources are labeled + ansible.builtin.fail: + msg: | + No secrets are labeled with backup.openstack.org/restore=true. + The OpenStackBackupConfig controller may not have reconciled yet. + Check: oc get openstackbackupconfig -n {{ cifmw_backup_restore_namespace }} -o yaml + when: _labeled_resources_check.rc != 0 + +- name: Set backup timestamp + ansible.builtin.set_fact: + cifmw_backup_restore_backup_name_suffix: "{{ lookup('pipe', 'date +%Y%m%d-%H%M%S') }}" + +- name: Collect operator version information + ansible.builtin.shell: | + oc get csv -n openstack-operators \ + -l operators.coreos.com/openstack-operator.openstack-operators \ + -o jsonpath='{.items[0].metadata.name}' + register: _operator_csv_version + changed_when: false + failed_when: false + +- name: Fail if OpenStack operator CSV could not be determined + ansible.builtin.fail: + msg: | + Could not read OpenStack operator ClusterServiceVersion in namespace openstack-operators + (oc get csv -l operators.coreos.com/openstack-operator.openstack-operators). 
+ when: > + (_operator_csv_version.rc | default(-1)) != 0 or + ((_operator_csv_version.stdout | default('')) | trim | length) == 0 + +- name: Collect catalog source image + ansible.builtin.shell: | + oc get catalogsource -n openstack-operators \ + -o jsonpath='{.items[0].spec.image}' + register: _operator_catalog_image + changed_when: false + failed_when: false + +- name: Collect operator image + ansible.builtin.shell: | + oc get deployment openstack-operator-controller-manager \ + -n openstack-operators -o jsonpath='{.spec.template.spec.containers[0].image}' + register: _operator_image + changed_when: false + failed_when: false + +- name: Set operator version facts + ansible.builtin.set_fact: + _backup_csv_version: "{{ _operator_csv_version.stdout }}" + _backup_catalog_image: "{{ _operator_catalog_image.stdout | default('unknown', true) }}" + _backup_operator_image: "{{ _operator_image.stdout | default('unknown', true) }}" + +- name: Display operator version information + ansible.builtin.debug: + msg: + - "CSV version: {{ _backup_csv_version }}" + - "Catalog image: {{ _backup_catalog_image }}" + - "Operator image: {{ _backup_operator_image }}" + +- name: Create temp directory for rendered templates + ansible.builtin.tempfile: + state: directory + prefix: openstack-backup- + register: _cifmw_backup_restore_rendered_dir + +# ======================================== +# Step 1: Trigger Galera Database Dumps +# ======================================== +# GaleraBackup CRs and cronjobs are created by setup_galerabackup.yml +# (run as part of install_deps). This step only triggers the dump jobs.
+- name: "Step 1: Trigger Galera database dumps" + ansible.builtin.debug: + msg: + - "========================================" + - "Step 1: Trigger Galera Database Dumps" + - "========================================" + +- name: Get Galera backup cronjobs + ansible.builtin.shell: | + oc get cronjob -n {{ cifmw_backup_restore_namespace }} -l app=galera -o jsonpath='{.items[*].metadata.name}' + register: _galera_backup_cronjobs + changed_when: false + +- name: Fail if no GaleraBackup cronjobs found + ansible.builtin.fail: + msg: | + No GaleraBackup cronjobs found. Run with cifmw_backup_restore_install_deps=true + first to create GaleraBackup CRs. + when: _galera_backup_cronjobs.stdout == "" + +- name: Trigger Galera backup jobs + ansible.builtin.shell: | + set -o pipefail + BACKUP_JOB_NAME="{{ item }}-{{ cifmw_backup_restore_backup_name_suffix }}" + oc -n {{ cifmw_backup_restore_namespace }} create job --from=cronjob/{{ item }} ${BACKUP_JOB_NAME} \ + --dry-run=client -o json | \ + jq '.spec.template.spec.containers[0].env += [{"name":"BACKUP_TIMESTAMP","value":"{{ cifmw_backup_restore_backup_name_suffix }}"}]' | \ + oc -n {{ cifmw_backup_restore_namespace }} create -f - + echo ${BACKUP_JOB_NAME} + loop: "{{ _galera_backup_cronjobs.stdout.split() }}" + register: _galera_backup_jobs + changed_when: true + when: _galera_backup_cronjobs.stdout != "" + +- name: Wait for Galera backup jobs to complete + ansible.builtin.shell: | + oc -n {{ cifmw_backup_restore_namespace }} wait --for=condition=complete job/{{ item.stdout_lines[-1] }} --timeout={{ cifmw_backup_restore_galera_backup_timeout }} + loop: "{{ _galera_backup_jobs.results }}" + changed_when: false + when: _galera_backup_cronjobs.stdout != "" + +# ======================================== +# Step 2: OVN database backup (label PVCs; ovsdb-client backup) +# ======================================== +- name: Include OVN database backup tasks + ansible.builtin.include_tasks: ovn_db_backup.yml + when: 
cifmw_backup_restore_ovn_db | bool + +# ======================================== +# Step 3: OADP PVC Backup (CSI Snapshots) +# ======================================== +- name: List PVCs marked for backup + ansible.builtin.shell: | + oc get pvc -n {{ cifmw_backup_restore_namespace }} -l backup.openstack.org/backup=true \ + -o custom-columns=NAME:.metadata.name,SIZE:.spec.resources.requests.storage --no-headers + register: _labeled_pvcs + changed_when: false + failed_when: false + +- name: Print labeled PVCs + ansible.builtin.debug: + msg: "{{ _labeled_pvcs.stdout_lines }}" + when: _labeled_pvcs.stdout_lines | default([]) | length > 0 + +- name: Render PVC backup CR + ansible.builtin.template: + src: backup-pvcs.yaml.j2 + dest: "{{ _cifmw_backup_restore_rendered_dir.path }}/backup-pvcs.yaml" + mode: "0644" + +- name: Pause before PVC backup + ansible.builtin.pause: + prompt: >- + Step 1 complete: Galera DB dumps{% if cifmw_backup_restore_ovn_db | bool %} + ; Step 2 complete: OVN DB backups on PVCs{% endif %}. 
+ Press Enter to create PVC backup, or Ctrl+C then 'A' to abort + when: not (cifmw_backup_restore_auto_ack | bool) + +- name: Create OADP PVC backup + kubernetes.core.k8s: + src: "{{ _cifmw_backup_restore_rendered_dir.path }}/backup-pvcs.yaml" + state: present + +- name: Wait for PVC backup to complete + ansible.builtin.command: + cmd: >- + oc get backup openstack-backup-pvcs-{{ cifmw_backup_restore_backup_name_suffix }} + -n {{ cifmw_backup_restore_oadp_namespace }} + -o jsonpath='{.status.phase}' + register: _pvc_backup_phase + changed_when: false + until: _pvc_backup_phase.stdout in ["Completed", "Failed", "PartiallyFailed"] + retries: "{{ (cifmw_backup_restore_oadp_backup_timeout | regex_replace('[^0-9]', '') | int * 60 / 10) | int }}" + delay: 10 + +- name: Fail if PVC backup did not complete + ansible.builtin.fail: + msg: "PVC backup ended with phase: {{ _pvc_backup_phase.stdout }}" + when: _pvc_backup_phase.stdout != "Completed" + +# ======================================== +# Step 4: OADP Resources Backup +# ======================================== +- name: Render resources backup CR + ansible.builtin.template: + src: backup-resources.yaml.j2 + dest: "{{ _cifmw_backup_restore_rendered_dir.path }}/backup-resources.yaml" + mode: "0644" + +- name: Pause before resources backup + ansible.builtin.pause: + prompt: >- + Step 3 complete: PVC backup. 
Press Enter to create resources backup, or Ctrl+C then 'A' to abort + when: not (cifmw_backup_restore_auto_ack | bool) + +- name: Create OADP resources backup + kubernetes.core.k8s: + src: "{{ _cifmw_backup_restore_rendered_dir.path }}/backup-resources.yaml" + state: present + +- name: Wait for resources backup to complete + ansible.builtin.command: + cmd: >- + oc get backup openstack-backup-resources-{{ cifmw_backup_restore_backup_name_suffix }} + -n {{ cifmw_backup_restore_oadp_namespace }} + -o jsonpath='{.status.phase}' + register: _resources_backup_phase + changed_when: false + until: _resources_backup_phase.stdout in ["Completed", "Failed", "PartiallyFailed"] + retries: "{{ (cifmw_backup_restore_oadp_backup_timeout | regex_replace('[^0-9]', '') | int * 60 / 10) | int }}" + delay: 10 + +- name: Fail if resources backup did not complete + ansible.builtin.fail: + msg: "Resources backup ended with phase: {{ _resources_backup_phase.stdout }}" + when: _resources_backup_phase.stdout != "Completed" + +# ======================================== +# Summary +# ======================================== +- name: Print backup summary + ansible.builtin.debug: + msg: + - "========================================" + - "Backup Complete" + - "========================================" + - "" + - "Backup name suffix: {{ cifmw_backup_restore_backup_name_suffix }}" + - "PVC backup: openstack-backup-pvcs-{{ cifmw_backup_restore_backup_name_suffix }}" + - "Resources backup: openstack-backup-resources-{{ cifmw_backup_restore_backup_name_suffix }}" + - "" + - "Operator version recorded on Backup CRs:" + - " CSV: {{ _backup_csv_version }}" + - " Catalog: {{ _backup_catalog_image }}" + - " Image: {{ _backup_operator_image }}" + +- name: Cleanup rendered templates + ansible.builtin.file: + path: "{{ _cifmw_backup_restore_rendered_dir.path }}" + state: absent diff --git a/roles/cifmw_backup_restore/tasks/cleanup.yml b/roles/cifmw_backup_restore/tasks/cleanup.yml new file mode 100644 index 
000000000..49e161010 --- /dev/null +++ b/roles/cifmw_backup_restore/tasks/cleanup.yml @@ -0,0 +1,330 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# OpenStack Cleanup for Restore +# +# Cleans up OpenStack control plane and data plane resources to prepare +# for a restore from OADP backup. + +- name: Print cleanup header + ansible.builtin.debug: + msg: + - "========================================" + - "OpenStack Cleanup for Restore" + - "========================================" + - "Namespace: {{ cifmw_backup_restore_namespace }}" + - "Cleanup ControlPlane: {{ cifmw_backup_restore_cleanup_ctlplane }}" + - "Cleanup DataPlane: {{ cifmw_backup_restore_cleanup_dataplane }}" + - "Delete labeled PVCs (Data Mover): {{ cifmw_backup_restore_snapshot_move_data }}" + +- name: Check if namespace exists + kubernetes.core.k8s_info: + api_version: v1 + kind: Namespace + name: "{{ cifmw_backup_restore_namespace }}" + register: _namespace_check + +- name: Exit if namespace doesn't exist + ansible.builtin.debug: + msg: "Namespace {{ cifmw_backup_restore_namespace }} does not exist. Nothing to clean up." 
+ when: _namespace_check.resources | length == 0 + +- name: End cleanup if namespace doesn't exist + ansible.builtin.meta: end_host + when: _namespace_check.resources | length == 0 + +- name: Gather resource counts + ansible.builtin.shell: | + set -o pipefail + echo "OpenStackControlPlane: $(oc get openstackcontrolplane -n {{ cifmw_backup_restore_namespace }} --no-headers 2>/dev/null | wc -l)" + echo "DataPlaneNodeSets: $(oc get openstackdataplanenodeset -n {{ cifmw_backup_restore_namespace }} --no-headers 2>/dev/null | wc -l)" + echo "GaleraBackup: $(oc get galerabackup -n {{ cifmw_backup_restore_namespace }} --no-headers 2>/dev/null | wc -l)" + echo "Labeled PVCs: $(oc get pvc -n {{ cifmw_backup_restore_namespace }} -l backup.openstack.org/backup=true --no-headers 2>/dev/null | wc -l)" + register: _resource_counts + changed_when: false + +- name: Display found resources + ansible.builtin.debug: + msg: "{{ _resource_counts.stdout_lines }}" + +- name: Confirm deletion + ansible.builtin.pause: + prompt: "WARNING: This will delete OpenStack resources in {{ cifmw_backup_restore_namespace }}. Continue? 
(yes/no)" + register: _delete_confirm + when: not (cifmw_backup_restore_auto_ack | bool) + +- name: Fail if not confirmed + ansible.builtin.fail: + msg: "Cleanup cancelled by user" + when: not (cifmw_backup_restore_auto_ack | bool) and _delete_confirm.user_input != "yes" + +# ======================================== +# DataPlane Cleanup +# ======================================== +- name: Delete DataPlaneDeployment CRs + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: dataplane.openstack.org/v1beta1 + _resource_kind: OpenStackDataPlaneDeployment + when: cifmw_backup_restore_cleanup_dataplane | bool + +- name: Delete DataPlaneNodeSet CRs + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: dataplane.openstack.org/v1beta1 + _resource_kind: OpenStackDataPlaneNodeSet + when: cifmw_backup_restore_cleanup_dataplane | bool + +- name: Delete DataPlaneService CRs + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: dataplane.openstack.org/v1beta1 + _resource_kind: OpenStackDataPlaneService + when: cifmw_backup_restore_cleanup_dataplane | bool + +- name: Delete NetConfig CRs + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: network.openstack.org/v1beta1 + _resource_kind: NetConfig + when: cifmw_backup_restore_cleanup_dataplane | bool + +- name: Wait for dataplane pods to terminate + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + namespace: "{{ cifmw_backup_restore_namespace }}" + label_selectors: + - app=openstackansibleee + register: _dp_pods + until: _dp_pods.resources | length == 0 + retries: 30 + delay: 10 + when: cifmw_backup_restore_cleanup_dataplane | bool + +# ======================================== +# ControlPlane Cleanup +# ======================================== +- name: Delete PVC-pin dummy Deployments + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: apps/v1 
+ _resource_kind: Deployment + _resource_label_selectors: + - app=pvc-pin + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete openstack-restore-tmp namespace + ansible.builtin.shell: | + if oc get namespace openstack-restore-tmp &>/dev/null; then + for SECRET in $(oc get secret -n openstack-restore-tmp -o name 2>/dev/null); do + oc patch ${SECRET} -n openstack-restore-tmp --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true + done + oc delete namespace openstack-restore-tmp --wait=true --timeout=60s + fi + changed_when: true + failed_when: false + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete GaleraRestore CRs + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: mariadb.openstack.org/v1beta1 + _resource_kind: GaleraRestore + _resource_wait: true + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete GaleraBackup CRs + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: mariadb.openstack.org/v1beta1 + _resource_kind: GaleraBackup + _resource_wait: true + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete OpenStackControlPlane CR + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: core.openstack.org/v1beta1 + _resource_kind: OpenStackControlPlane + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Wait for pods to terminate + ansible.builtin.command: + cmd: >- + oc get pods -n {{ cifmw_backup_restore_namespace }} + --field-selector=status.phase!=Succeeded --no-headers + register: _remaining_pods + changed_when: false + ignore_errors: true  # exhausted retries mark the task failed even with failed_when false; ignore so the force-delete fallback below can run + until: _remaining_pods.stdout_lines | length == 0 + retries: 12 + delay: 5 + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Force delete stuck pods if still remaining + when: + - cifmw_backup_restore_cleanup_ctlplane | bool + - _remaining_pods.stdout_lines | default([]) | length > 0 + block: + - name:
Force delete remaining pods + ansible.builtin.command: + cmd: >- + oc delete pods --all -n {{ cifmw_backup_restore_namespace }} + --force --grace-period=0 + changed_when: true + failed_when: false + + - name: Wait for pods after force delete + ansible.builtin.command: + cmd: >- + oc get pods -n {{ cifmw_backup_restore_namespace }} + --field-selector=status.phase!=Succeeded --no-headers + register: _remaining_pods_final + changed_when: false + failed_when: false + until: _remaining_pods_final.stdout_lines | length == 0 + retries: 48 + delay: 5 + +- name: Delete OpenStackBackupConfig CRs + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: core.openstack.org/v1beta1 + _resource_kind: OpenStackBackupConfig + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete OpenStackVersion CRs + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: core.openstack.org/v1beta1 + _resource_kind: OpenStackVersion + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete RabbitMQUser CRs + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: rabbitmq.openstack.org/v1beta1 + _resource_kind: RabbitMQUser + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete Certificate CRs (cert-manager) + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: cert-manager.io/v1 + _resource_kind: Certificate + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: List secrets in namespace + kubernetes.core.k8s_info: + api_version: v1 + kind: Secret + namespace: "{{ cifmw_backup_restore_namespace }}" + register: _all_secrets + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete cert secrets + kubernetes.core.k8s: + api_version: v1 + kind: Secret + namespace: "{{ cifmw_backup_restore_namespace }}" + name: "{{ item.metadata.name }}" + state: absent + loop: >- + {{ _all_secrets.resources | default([]) 
+ | selectattr('metadata.name', 'search', 'cert') + | rejectattr('metadata.name', 'search', 'edpm') + | rejectattr('metadata.name', 'search', 'ceph-conf') + | list }} + loop_control: + label: "{{ item.metadata.name }}" + failed_when: false + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete CA bundle secrets + kubernetes.core.k8s: + api_version: v1 + kind: Secret + namespace: "{{ cifmw_backup_restore_namespace }}" + name: "{{ item }}" + state: absent + loop: + - rootca-internal + - rootca-libvirt + - rootca-ovn + - rootca-public + - combined-ca-bundle + failed_when: false + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete remaining user-provided secrets + kubernetes.core.k8s: + api_version: v1 + kind: Secret + namespace: "{{ cifmw_backup_restore_namespace }}" + name: "{{ item.metadata.name }}" + state: absent + loop: >- + {{ _all_secrets.resources | default([]) + | rejectattr('metadata.name', 'search', 'dockercfg') + | rejectattr('metadata.name', 'search', 'service-account-token') + | list }} + loop_control: + label: "{{ item.metadata.name }}" + failed_when: false + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete ConfigMaps + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: v1 + _resource_kind: ConfigMap + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Delete DNSData CRs + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: network.openstack.org/v1beta1 + _resource_kind: DNSData + when: cifmw_backup_restore_cleanup_ctlplane | bool + +# ======================================== +# Delete all PVCs +# ======================================== +- name: Delete all PVCs in namespace + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: v1 + _resource_kind: PersistentVolumeClaim + when: cifmw_backup_restore_cleanup_ctlplane | bool + +# ======================================== +# Summary +# 
======================================== +- name: Verify cleanup + ansible.builtin.shell: | + oc get all,pvc -n {{ cifmw_backup_restore_namespace }} + register: _remaining_resources + changed_when: false + failed_when: false + +- name: Display remaining resources + ansible.builtin.debug: + msg: "{{ _remaining_resources.stdout_lines }}" + +- name: Print cleanup complete + ansible.builtin.debug: + msg: "Namespace {{ cifmw_backup_restore_namespace }} is ready for restore." diff --git a/roles/cifmw_backup_restore/tasks/e2e.yml b/roles/cifmw_backup_restore/tasks/e2e.yml new file mode 100644 index 000000000..b7ae118ae --- /dev/null +++ b/roles/cifmw_backup_restore/tasks/e2e.yml @@ -0,0 +1,254 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# End-to-end backup/restore test orchestration (hooks, deps, workload, +# backup/cleanup/restore, validation, optional tempest). +# Variables: defaults/main.yml (cifmw_backup_restore_install_deps, …). 
+ +- name: Run pre_backup_restore hooks + vars: + step: pre_backup_restore + ansible.builtin.import_role: + name: run_hook + +# ======================================== +# Step 1: Install dependencies +# ======================================== +- name: Setup MinIO + ansible.builtin.include_role: + name: deploy_minio + when: cifmw_backup_restore_install_deps | bool + +- name: Setup OADP + ansible.builtin.include_role: + name: openshift_adp + vars: + cifmw_openshift_adp_s3_access_key: "{{ cifmw_deploy_minio_access_key }}" + cifmw_openshift_adp_s3_secret_key: "{{ cifmw_deploy_minio_secret_key }}" + when: cifmw_backup_restore_install_deps | bool + +- name: Setup GaleraBackup CRs + ansible.builtin.include_role: + name: cifmw_backup_restore + tasks_from: setup_galerabackup.yml + when: cifmw_backup_restore_install_deps | bool + +# ======================================== +# Step 2: Create test workload +# ======================================== +- name: Create test workload + when: cifmw_backup_restore_create_workload | bool + block: + - name: Ensure update artifacts directory exists + ansible.builtin.file: + path: "{{ cifmw_update_artifacts_basedir }}" + state: directory + mode: "0755" + + - name: Generate workload launch script + ansible.builtin.include_role: + name: update + tasks_from: create_test_files.yml + vars: + cifmw_update_ping_test: "{{ cifmw_backup_restore_create_workload | bool }}" + + - name: Create local openstackclient container + ansible.builtin.include_role: + name: update + tasks_from: create_local_openstackclient.yml + + - name: Create the workload instance + ansible.builtin.include_role: + name: update + tasks_from: create_instance.yml + +# ======================================== +# Step 3: Create backup +# ======================================== +- name: Create backup + ansible.builtin.include_role: + name: cifmw_backup_restore + vars: + cifmw_backup_restore_action: backup + cifmw_backup_restore_auto_ack: true + when: 
cifmw_backup_restore_run_backup | bool + +- name: Print backup timestamp + ansible.builtin.debug: + msg: "Backup completed with timestamp: {{ cifmw_backup_restore_backup_name_suffix }}" + when: cifmw_backup_restore_run_backup | bool + +# ======================================== +# Step 4: Cleanup namespace +# ======================================== +- name: Cleanup + ansible.builtin.include_role: + name: cifmw_backup_restore + vars: + cifmw_backup_restore_action: cleanup + cifmw_backup_restore_auto_ack: true + when: cifmw_backup_restore_run_cleanup | bool + +# ======================================== +# Step 5: Restore from backup +# ======================================== +- name: Resolve backup timestamp for restore (from backup run or extra-var) + ansible.builtin.set_fact: + _cifmw_backup_restore_e2e_restore_timestamp: >- + {{ cifmw_backup_restore_backup_name_suffix | default(cifmw_backup_restore_backup_timestamp, true) }} + when: cifmw_backup_restore_run_restore | bool + +- name: Restore + ansible.builtin.include_role: + name: cifmw_backup_restore + vars: + cifmw_backup_restore_action: restore + cifmw_backup_restore_backup_timestamp: "{{ _cifmw_backup_restore_e2e_restore_timestamp }}" + cifmw_backup_restore_auto_ack: true + when: cifmw_backup_restore_run_restore | bool + +# ======================================== +# Step 6: Post-restore workload validation +# ======================================== +- name: Validate test workload after restore + when: + - cifmw_backup_restore_create_workload | bool + - cifmw_backup_restore_run_restore | bool + vars: + _os_exec: >- + oc exec -t openstackclient -n {{ cifmw_backup_restore_namespace }} -- + block: + - name: Verify compute services are up + ansible.builtin.shell: | + set -o pipefail + {{ _os_exec }} openstack compute service list -f json | \ + jq -r '.[] | "\(.Binary) \(.Host) \(.State)"' + register: _compute_services + changed_when: false + + - name: Display compute services + ansible.builtin.debug: + msg: "{{ 
_compute_services.stdout_lines }}" + + - name: Verify network agents are up + ansible.builtin.shell: | + set -o pipefail + {{ _os_exec }} openstack network agent list -f json | \ + jq -r '.[] | "\(.["Agent Type"]) \(.Host) \(.Alive)"' + register: _network_agents + changed_when: false + + - name: Display network agents + ansible.builtin.debug: + msg: "{{ _network_agents.stdout_lines }}" + + - name: Get instance info + ansible.builtin.shell: | + set -o pipefail + {{ _os_exec }} openstack server list -f json | \ + jq -r '.[0] | "\(.Name) \(.Status) \(.Networks)"' + register: _instance_info + changed_when: false + + - name: Display instance info + ansible.builtin.debug: + msg: "Instance: {{ _instance_info.stdout }}" + + - name: Get floating IP of test instance + ansible.builtin.shell: | + set -o pipefail + {{ _os_exec }} openstack server list -f json | \ + jq -r '.[0].Networks' | grep -oP '[\d.]+' | tail -1 + register: _instance_fip + changed_when: false + + - name: Ping floating IP + ansible.builtin.shell: | + ping -c 3 -W 5 {{ _instance_fip.stdout }} + register: _ping_result + changed_when: false + retries: 6 + delay: 10 + until: _ping_result.rc == 0 + + - name: Display ping result + ansible.builtin.debug: + msg: "Ping to {{ _instance_fip.stdout }}: SUCCESS" + + - name: Stop test instance + ansible.builtin.shell: | + set -o pipefail + INSTANCE=$({{ _os_exec }} openstack server list -f json | jq -r '.[0].Name') + {{ _os_exec }} openstack server stop ${INSTANCE} + changed_when: true + + - name: Wait for instance to stop + ansible.builtin.shell: | + set -o pipefail + INSTANCE=$({{ _os_exec }} openstack server list -f json | jq -r '.[0].Name') + {{ _os_exec }} openstack server show ${INSTANCE} -f json | jq -r '.status' + register: _stop_status + changed_when: false + retries: 12 + delay: 5 + until: _stop_status.stdout == 'SHUTOFF' + + - name: Start test instance + ansible.builtin.shell: | + set -o pipefail + INSTANCE=$({{ _os_exec }} openstack server list -f json | jq -r 
'.[0].Name') + {{ _os_exec }} openstack server start ${INSTANCE} + changed_when: true + + - name: Wait for instance to become active + ansible.builtin.shell: | + set -o pipefail + INSTANCE=$({{ _os_exec }} openstack server list -f json | jq -r '.[0].Name') + {{ _os_exec }} openstack server show ${INSTANCE} -f json | jq -r '.status' + register: _start_status + changed_when: false + retries: 12 + delay: 5 + until: _start_status.stdout == 'ACTIVE' + + - name: Ping floating IP after stop/start + ansible.builtin.shell: | + ping -c 3 -W 5 {{ _instance_fip.stdout }} + register: _ping_after_restart + changed_when: false + retries: 6 + delay: 10 + until: _ping_after_restart.rc == 0 + + - name: Post-restore workload validation passed + ansible.builtin.debug: + msg: >- + Workload validation passed: instance reachable via FIP {{ _instance_fip.stdout }}, + stop/start successful, ping after restart OK + +# ======================================== +# Step 7: Post-restore tempest validation +# ======================================== +- name: Run post-restore tempest validation + ansible.builtin.include_role: + name: "{{ cifmw_run_test_role | default('test_operator') }}" + when: cifmw_backup_restore_run_post_tempest | bool + +- name: Run post_backup_restore hooks + vars: + step: post_backup_restore + ansible.builtin.import_role: + name: run_hook diff --git a/roles/cifmw_backup_restore/tasks/main.yml b/roles/cifmw_backup_restore/tasks/main.yml new file mode 100644 index 000000000..0bab11749 --- /dev/null +++ b/roles/cifmw_backup_restore/tasks/main.yml @@ -0,0 +1,25 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +- name: Validate action parameter + ansible.builtin.fail: + msg: >- + cifmw_backup_restore_action must be set to one of: backup, restore, cleanup. + Example: -e cifmw_backup_restore_action=backup + when: cifmw_backup_restore_action not in ['backup', 'restore', 'cleanup'] + +- name: Run {{ cifmw_backup_restore_action }} + ansible.builtin.include_tasks: "{{ cifmw_backup_restore_action }}.yml" diff --git a/roles/cifmw_backup_restore/tasks/ovn_db_backup.yml b/roles/cifmw_backup_restore/tasks/ovn_db_backup.yml new file mode 100644 index 000000000..16a75c408 --- /dev/null +++ b/roles/cifmw_backup_restore/tasks/ovn_db_backup.yml @@ -0,0 +1,83 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# OVN Northbound and Southbound DB backups (see dev-docs backup-restore user-guide). +# Backup files are written to OVN PVCs and included in the subsequent OADP PVC backup. 
+ +- name: "Step 2: OVN database backup (label PVCs and ovsdb-client backup)" + ansible.builtin.debug: + msg: + - "========================================" + - "Step 2: OVN Database Backup" + - "========================================" + +- name: Wait for OVN NB database pod + ansible.builtin.command: + cmd: >- + oc wait pod -n {{ cifmw_backup_restore_namespace }} + -l service=ovsdbserver-nb + --for=condition=Ready + --timeout={{ cifmw_backup_restore_ovn_db_ready_timeout }} + changed_when: false + +- name: Wait for OVN SB database pod + ansible.builtin.command: + cmd: >- + oc wait pod -n {{ cifmw_backup_restore_namespace }} + -l service=ovsdbserver-sb + --for=condition=Ready + --timeout={{ cifmw_backup_restore_ovn_db_ready_timeout }} + changed_when: false + +- name: Label OVN NB PVCs for backup and restore + ansible.builtin.command: + cmd: >- + oc label pvc -n {{ cifmw_backup_restore_namespace }} + -l service=ovsdbserver-nb + backup.openstack.org/backup=true + backup.openstack.org/restore=true + backup.openstack.org/restore-order=00 + backup.openstack.org/category=controlplane + --overwrite + changed_when: true + +- name: Label OVN SB PVCs for backup and restore + ansible.builtin.command: + cmd: >- + oc label pvc -n {{ cifmw_backup_restore_namespace }} + -l service=ovsdbserver-sb + backup.openstack.org/backup=true + backup.openstack.org/restore=true + backup.openstack.org/restore-order=00 + backup.openstack.org/category=controlplane + --overwrite + changed_when: true + +- name: Backup OVN Northbound database to PVC + ansible.builtin.shell: | + set -o pipefail + oc exec ovsdbserver-nb-0 -n {{ cifmw_backup_restore_namespace }} -c ovsdbserver-nb -- \ + bash -c "ovsdb-client backup unix:/etc/ovn/ovnnb_db.sock OVN_Northbound \ + > /etc/ovn/ovnnb_db.db.{{ cifmw_backup_restore_backup_name_suffix }}" + changed_when: true + +- name: Backup OVN Southbound database to PVC + ansible.builtin.shell: | + set -o pipefail + oc exec ovsdbserver-sb-0 -n {{ 
cifmw_backup_restore_namespace }} -c ovsdbserver-sb -- \ + bash -c "ovsdb-client backup unix:/etc/ovn/ovnsb_db.sock OVN_Southbound \ + > /etc/ovn/ovnsb_db.db.{{ cifmw_backup_restore_backup_name_suffix }}" + changed_when: true diff --git a/roles/cifmw_backup_restore/tasks/ovn_db_restore.yml b/roles/cifmw_backup_restore/tasks/ovn_db_restore.yml new file mode 100644 index 000000000..c6ba801f4 --- /dev/null +++ b/roles/cifmw_backup_restore/tasks/ovn_db_restore.yml @@ -0,0 +1,124 @@ +--- +# Copyright Red Hat, Inc. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# Restore OVN NB/SB databases from timestamped files on the PVCs (user-guide Step 8). +# Must run after Galera restore and before removing deployment-stage. If no backup +# files exist for this timestamp, skip (Neutron–OVN sync at the end repopulates OVN). 
+ +- name: Check for OVN NB backup file from this backup run + ansible.builtin.shell: | + set -o pipefail + oc exec ovsdbserver-nb-0 -n {{ cifmw_backup_restore_namespace }} -c ovsdbserver-nb -- \ + test -f /etc/ovn/ovnnb_db.db.{{ cifmw_backup_restore_backup_timestamp }} + register: _ovn_nb_backup_file + changed_when: false + failed_when: false + when: cifmw_backup_restore_ovn_db | bool + +- name: Set fact — OVN backup files not used (OVN DB steps disabled) + ansible.builtin.set_fact: + _cifmw_backup_restore_ovn_files_present: false + when: not (cifmw_backup_restore_ovn_db | bool) + +- name: Set fact — OVN backup files present on PVC + ansible.builtin.set_fact: + _cifmw_backup_restore_ovn_files_present: "{{ _ovn_nb_backup_file.rc == 0 }}" + when: cifmw_backup_restore_ovn_db | bool + +- name: Skip OVN database file restore (no per-timestamp backup on PVC) + ansible.builtin.debug: + msg: >- + No OVN backup file ovnnb_db.db.{{ cifmw_backup_restore_backup_timestamp }} on ovsdbserver-nb-0; + skipping file restore. OVN will be repopulated via neutron-ovn-db-sync after EDPM. 
+ when: + - cifmw_backup_restore_ovn_db | bool + - not _cifmw_backup_restore_ovn_files_present | bool + +- name: "Step 8: Restore OVN databases from PVC backup files" + when: + - cifmw_backup_restore_ovn_db | bool + - _cifmw_backup_restore_ovn_files_present | bool + block: + - name: Announce OVN database file restore + ansible.builtin.debug: + msg: + - "========================================" + - "Step 8: OVN Database Restore (NB/SB)" + - "========================================" + + - name: Replace OVN DB files and clear follower replicas + ansible.builtin.shell: | + set -euo pipefail + NS="{{ cifmw_backup_restore_namespace }}" + TS="{{ cifmw_backup_restore_backup_timestamp }}" + for db in nb sb; do + oc exec ovsdbserver-${db}-0 -n "${NS}" -c ovsdbserver-${db} -- bash -c \ + "rm -f /etc/ovn/ovn${db}_db.db && \ + cp /etc/ovn/ovn${db}_db.db.${TS} /etc/ovn/ovn${db}_db.db" + COUNT=$(oc get pods -n "${NS}" -l service=ovsdbserver-${db} --no-headers 2>/dev/null | wc -l | awk '{print $1}') + for ((i=1; i<COUNT; i++)); do  # replica 0 holds the restored file; 1..COUNT-1 are the followers to clear + if oc get pod "ovsdbserver-${db}-${i}" -n "${NS}" &>/dev/null; then + oc exec "ovsdbserver-${db}-${i}" -n "${NS}" -c ovsdbserver-${db} -- \ + rm -f "/etc/ovn/ovn${db}_db.db" + fi + done + done + changed_when: true + + - name: Force delete OVN NB database pods + ansible.builtin.command: + cmd: >- + oc delete pod -n {{ cifmw_backup_restore_namespace }} + -l service=ovsdbserver-nb + --force + --grace-period=0 + changed_when: true + + - name: Force delete OVN SB database pods + ansible.builtin.command: + cmd: >- + oc delete pod -n {{ cifmw_backup_restore_namespace }} + -l service=ovsdbserver-sb + --force + --grace-period=0 + changed_when: true + + - name: Wait for OVN NB database pods ready + ansible.builtin.command: + cmd: >- + oc wait pod -n {{ cifmw_backup_restore_namespace }} + -l service=ovsdbserver-nb + --for=condition=Ready + --timeout={{ cifmw_backup_restore_ovn_db_ready_timeout }} + changed_when: false + + - name: Wait for OVN SB database pods ready + ansible.builtin.command: + cmd: >- + oc wait pod -n {{
cifmw_backup_restore_namespace }}
+          -l service=ovsdbserver-sb
+          --for=condition=Ready
+          --timeout={{ cifmw_backup_restore_ovn_db_ready_timeout }}
+      changed_when: false
+
+    # Bounce every OVN consumer so it reconnects to the restored NB/SB DBs.
+    # `set -e` is required here: without it only the exit status of the last
+    # `oc delete` decides the task result and earlier failures are ignored.
+    - name: Restart OVN control plane pods to reconnect to restored databases
+      ansible.builtin.shell: |
+        set -euo pipefail
+        oc delete pod -n {{ cifmw_backup_restore_namespace }} -l service=ovn-northd --ignore-not-found=true
+        oc delete pod -n {{ cifmw_backup_restore_namespace }} -l service=ovn-controller --ignore-not-found=true
+        oc delete pod -n {{ cifmw_backup_restore_namespace }} -l service=ovn-controller-ovs --ignore-not-found=true
+        oc delete pod -n {{ cifmw_backup_restore_namespace }} -l service=ovn-controller-metrics --ignore-not-found=true
+      changed_when: true
diff --git a/roles/cifmw_backup_restore/tasks/restore.yml b/roles/cifmw_backup_restore/tasks/restore.yml
new file mode 100644
index 000000000..eae150397
--- /dev/null
+++ b/roles/cifmw_backup_restore/tasks/restore.yml
@@ -0,0 +1,535 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# OpenStack Full Restore
+#
+# Restores an OpenStack control plane from OADP backups using ordered
+# Velero Restore CRs, Galera restore, optional OVN NB/SB file restore,
+# then staged ControlPlane resume, dataplane, EDPM, and Neutron–OVN sync.
+
+# An empty or whitespace-only timestamp would yield backup names ending in
+# "-", so it is rejected together with the undefined case.
+- name: Validate backup_timestamp parameter
+  ansible.builtin.fail:
+    msg: "cifmw_backup_restore_backup_timestamp is required for restore."
+  when: cifmw_backup_restore_backup_timestamp | default('') | string | trim | length == 0
+
+# _restore_suffix is computed once here and reused for every Restore CR name.
+- name: Derive backup names from timestamp
+  ansible.builtin.set_fact:
+    _pvc_backup_name: "openstack-backup-pvcs-{{ cifmw_backup_restore_backup_timestamp }}"
+    _resources_backup_name: "openstack-backup-resources-{{ cifmw_backup_restore_backup_timestamp }}"
+    _restore_suffix: "{{ lookup('pipe', 'date +%Y%m%d-%H%M%S') }}"
+
+- name: Create temp directory for rendered templates
+  ansible.builtin.tempfile:
+    state: directory
+    prefix: openstack-restore-
+  register: _cifmw_backup_restore_rendered_dir
+
+# ========================================
+# Pre-flight checks
+# ========================================
+# failed_when is disabled so the next task can report a readable error.
+- name: Verify OADP operator is installed
+  ansible.builtin.shell: |
+    oc get crd backups.velero.io -o name
+  register: _oadp_crd_check
+  changed_when: false
+  failed_when: false
+
+- name: Fail if OADP is not installed
+  ansible.builtin.fail:
+    msg: "OADP operator is not installed (Velero Backup CRD not found)."
+  when: _oadp_crd_check.rc != 0
+
+- name: Ensure target namespace exists
+  kubernetes.core.k8s:
+    api_version: v1
+    kind: Namespace
+    name: "{{ cifmw_backup_restore_namespace }}"
+    state: present
+
+- name: Verify OADP backups exist
+  ansible.builtin.shell: |
+    oc get backup {{ _pvc_backup_name }} -n {{ cifmw_backup_restore_oadp_namespace }} -o jsonpath='{.status.phase}'
+  register: _pvc_backup_phase
+  changed_when: false
+
+- name: Verify resources backup exists
+  ansible.builtin.shell: |
+    oc get backup {{ _resources_backup_name }} -n {{ cifmw_backup_restore_oadp_namespace }} -o jsonpath='{.status.phase}'
+  register: _resources_backup_phase
+  changed_when: false
+
+- name: Fail if backups are not completed
+  ansible.builtin.fail:
+    msg: "Backups must be Completed. PVC: {{ _pvc_backup_phase.stdout }}, Resources: {{ _resources_backup_phase.stdout }}"
+  when: _pvc_backup_phase.stdout != "Completed" or _resources_backup_phase.stdout != "Completed"
+
+# ========================================
+# Operator version validation
+# ========================================
+# Both lookups are best-effort (failed_when: false); a missing annotation or
+# CSV leaves stdout as an empty string.
+- name: Get required operator version from backup
+  ansible.builtin.shell: |
+    oc get backup {{ _resources_backup_name }} -n {{ cifmw_backup_restore_oadp_namespace }} \
+      -o jsonpath='{.metadata.annotations.openstack\.org/csv-version}'
+  register: _backup_csv_version
+  changed_when: false
+  failed_when: false
+
+- name: Get installed operator version
+  ansible.builtin.shell: |
+    oc get csv -n openstack-operators \
+      -l operators.coreos.com/openstack-operator.openstack-operators \
+      -o jsonpath='{.items[0].metadata.name}'
+  register: _installed_csv_version
+  changed_when: false
+  failed_when: false
+
+# default(..., true) is needed because stdout is '' (defined but empty) when
+# nothing was found; a plain default() would print an empty value.
+- name: Display operator version comparison
+  ansible.builtin.debug:
+    msg:
+      - "Backup operator version: {{ _backup_csv_version.stdout | default('not recorded', true) }}"
+      - "Installed operator version: {{ _installed_csv_version.stdout | default('not installed', true) }}"
+
+# Only enforced when the backup actually recorded a version.
+- name: Fail if operator version does not match backup
+  ansible.builtin.fail:
+    msg: |
+      Operator version mismatch!
+      Backup requires: {{ _backup_csv_version.stdout }}
+      Installed version: {{ _installed_csv_version.stdout | default('not installed', true) }}
+      Install the correct operator version before restoring.
+      Catalog source image from backup:
+        oc get backup {{ _resources_backup_name }} -n {{ cifmw_backup_restore_oadp_namespace }} \
+          -o jsonpath='{.metadata.annotations.openstack\.org/catalog-source-image}'
+  when:
+    - _backup_csv_version.stdout | default('') != ''
+    - _installed_csv_version.stdout | default('') != _backup_csv_version.stdout
+
+# ========================================
+# Resource modifier ConfigMap
+# ========================================
+- name: Render resource modifier ConfigMap
+  ansible.builtin.template:
+    src: 00-resource-modifiers-configmap.yaml.j2
+    dest: "{{ _cifmw_backup_restore_rendered_dir.path }}/00-resource-modifiers-configmap.yaml"
+    mode: "0644"
+
+- name: Create resource modifier ConfigMap
+  kubernetes.core.k8s:
+    src: "{{ _cifmw_backup_restore_rendered_dir.path }}/00-resource-modifiers-configmap.yaml"
+    state: present
+
+# ========================================
+# Step 0.5: Pin PVCs to nodes (optional)
+# ========================================
+- name: Pin PVCs to original nodes
+  ansible.builtin.include_tasks: restore_pin_pvcs.yml
+  when: cifmw_backup_restore_pin_pvcs | bool
+
+# ========================================
+# Step 1: Restore PVCs (Order 00)
+# ========================================
+- name: Render PVC restore
+  ansible.builtin.template:
+    src: 01-restore-order-00-pvcs.yaml.j2
+    dest: "{{ _cifmw_backup_restore_rendered_dir.path }}/01-restore-order-00-pvcs.yaml"
+    mode: "0644"
+  vars:
+    pvc_backup_name: "{{ _pvc_backup_name }}"
+    restore_suffix: "{{ _restore_suffix }}"
+
+- name: Pause before PVC restore
+  ansible.builtin.pause:
+    prompt: "Press Enter to restore PVCs, or Ctrl+C then 'A' to abort"
+  when: not (cifmw_backup_restore_auto_ack | bool)
+
+- name: Create PVC restore
+  kubernetes.core.k8s:
+    src: "{{ _cifmw_backup_restore_rendered_dir.path }}/01-restore-order-00-pvcs.yaml"
+    state: present
+
+- name: Wait for PVC restore
+  ansible.builtin.include_tasks: wait_for_restore.yml
+  vars:
+    _restore_name: "openstack-restore-00-pvcs-{{ _restore_suffix }}"
+    _step_name: "Step 1 (PVC restore)"
+
+# The app=pvc-pin Deployments are removed once the PVC restore has finished.
+- name: Get PVC-pin dummy Deployments
+  kubernetes.core.k8s_info:
+    api_version: apps/v1
+    kind: Deployment
+    namespace: "{{ cifmw_backup_restore_namespace }}"
+    label_selectors:
+      - app=pvc-pin
+  register: _pvc_pin_deployments
+  when: cifmw_backup_restore_pin_pvcs | bool
+
+- name: Delete dummy Deployments after PVC restore
+  kubernetes.core.k8s:
+    api_version: apps/v1
+    kind: Deployment
+    namespace: "{{ cifmw_backup_restore_namespace }}"
+    name: "{{ item.metadata.name }}"
+    state: absent
+  loop: "{{ _pvc_pin_deployments.resources | default([]) }}"
+  loop_control:
+    label: "{{ item.metadata.name }}"
+  when: cifmw_backup_restore_pin_pvcs | bool
+
+# ========================================
+# Step 2: Restore Foundation (Order 10)
+# ========================================
+- name: Render foundation restore
+  ansible.builtin.template:
+    src: 02-restore-order-10-foundation.yaml.j2
+    dest: "{{ _cifmw_backup_restore_rendered_dir.path }}/02-restore-order-10-foundation.yaml"
+    mode: "0644"
+  vars:
+    resources_backup_name: "{{ _resources_backup_name }}"
+    restore_suffix: "{{ _restore_suffix }}"
+
+- name: Create foundation restore
+  kubernetes.core.k8s:
+    src: "{{ _cifmw_backup_restore_rendered_dir.path }}/02-restore-order-10-foundation.yaml"
+    state: present
+
+- name: Wait for foundation restore
+  ansible.builtin.include_tasks: wait_for_restore.yml
+  vars:
+    _restore_name: "openstack-restore-10-foundation-{{ _restore_suffix }}"
+    _step_name: "Step 2 (Foundation restore)"
+
+# ========================================
+# Step 3: Restore Infrastructure (Order 20)
+# ========================================
+- name: Render infrastructure restore
+  ansible.builtin.template:
+    src: 03-restore-order-20-infrastructure.yaml.j2
+    dest: "{{ _cifmw_backup_restore_rendered_dir.path }}/03-restore-order-20-infrastructure.yaml"
+    mode: "0644"
+  vars:
+    resources_backup_name: "{{ _resources_backup_name }}"
+    restore_suffix: "{{ _restore_suffix }}"
+
+- name: Create infrastructure restore
+  kubernetes.core.k8s:
+    src: "{{ _cifmw_backup_restore_rendered_dir.path }}/03-restore-order-20-infrastructure.yaml"
+    state: present
+
+- name: Wait for infrastructure restore
+  ansible.builtin.include_tasks: wait_for_restore.yml
+  vars:
+    _restore_name: "openstack-restore-20-infra-{{ _restore_suffix }}"
+    _step_name: "Step 3 (Infrastructure restore)"
+
+# ========================================
+# Step 4: Restore ControlPlane (Order 30)
+# ========================================
+- name: Render controlplane restore
+  ansible.builtin.template:
+    src: 04-restore-order-30-controlplane.yaml.j2
+    dest: "{{ _cifmw_backup_restore_rendered_dir.path }}/04-restore-order-30-controlplane.yaml"
+    mode: "0644"
+  vars:
+    resources_backup_name: "{{ _resources_backup_name }}"
+    restore_suffix: "{{ _restore_suffix }}"
+
+- name: Create controlplane restore
+  kubernetes.core.k8s:
+    src: "{{ _cifmw_backup_restore_rendered_dir.path }}/04-restore-order-30-controlplane.yaml"
+    state: present
+
+- name: Wait for controlplane restore
+  ansible.builtin.include_tasks: wait_for_restore.yml
+  vars:
+    _restore_name: "openstack-restore-30-ctlplane-{{ _restore_suffix }}"
+    _step_name: "Step 4 (ControlPlane restore)"
+
+# ========================================
+# Step 5: Wait for Infrastructure Ready
+# ========================================
+- name: Wait for OpenStackControlPlane infrastructure ready
+  ansible.builtin.shell: |
+    oc wait openstackcontrolplane -n {{ cifmw_backup_restore_namespace }} --all \
+      --for=condition=OpenStackControlPlaneInfrastructureReady \
+      --timeout={{ cifmw_backup_restore_infra_ready_timeout }}
+  changed_when: false
+
+# The first control plane in the namespace is the one resumed in Step 9.
+- name: Get OpenStackControlPlane name
+  ansible.builtin.shell: |
+    oc get openstackcontrolplane -n {{ cifmw_backup_restore_namespace }} -o jsonpath='{.items[0].metadata.name}'
+  register: _ctlplane_name
+  changed_when: false
+
+# ========================================
+# Step 6: Restore GaleraBackup CRs (Order 40)
+# ========================================
+- name: Render backup config restore
+  ansible.builtin.template:
+    src: 05-restore-order-40-backup-config.yaml.j2
+    dest: "{{ _cifmw_backup_restore_rendered_dir.path }}/05-restore-order-40-backup-config.yaml"
+    mode: "0644"
+  vars:
+    resources_backup_name: "{{ _resources_backup_name }}"
+    restore_suffix: "{{ _restore_suffix }}"
+
+- name: Create backup config restore
+  kubernetes.core.k8s:
+    src: "{{ _cifmw_backup_restore_rendered_dir.path }}/05-restore-order-40-backup-config.yaml"
+    state: present
+
+- name: Wait for backup config restore
+  ansible.builtin.include_tasks: wait_for_restore.yml
+  vars:
+    _restore_name: "openstack-restore-40-backup-{{ _restore_suffix }}"
+    _step_name: "Step 6 (GaleraBackup restore)"
+
+# ========================================
+# Step 7: Database Restore
+# ========================================
+- name: Get GaleraBackup names
+  ansible.builtin.shell: |
+    oc get galerabackup -n {{ cifmw_backup_restore_namespace }} -o jsonpath='{.items[*].metadata.name}'
+  register: _galerabackup_names
+  changed_when: false
+
+# Always defined (possibly empty) so every later task can guard on its length.
+- name: Set GaleraBackup list
+  ansible.builtin.set_fact:
+    _galerabackup_list: "{{ _galerabackup_names.stdout.split() if _galerabackup_names.stdout != '' else [] }}"
+
+- name: Render GaleraRestore CRs
+  ansible.builtin.template:
+    src: 06a-galerarestore.yaml.j2
+    dest: "{{ _cifmw_backup_restore_rendered_dir.path }}/06a-galerarestore.yaml"
+    mode: "0644"
+  vars:
+    galerabackup_list: "{{ _galerabackup_list }}"
+  when: _galerabackup_list | length > 0
+
+- name: Apply GaleraRestore CRs
+  kubernetes.core.k8s:
+    src: "{{ _cifmw_backup_restore_rendered_dir.path }}/06a-galerarestore.yaml"
+    state: present
+  when: _galerabackup_list | length > 0
+
+# Pod name is built as <backup>-restore-<backup>restore, matching the
+# GaleraRestore name "<backup>restore" rendered by 06a-galerarestore.yaml.j2.
+- name: Wait for GaleraRestore pods to be ready
+  ansible.builtin.shell: |
+    RESTORE_NAME="{{ item }}restore"
+    BACKUP_SOURCE="{{ item }}"
+    POD_NAME="${BACKUP_SOURCE}-restore-${RESTORE_NAME}"
+    oc wait --for=condition=Ready pod/${POD_NAME} -n {{ cifmw_backup_restore_namespace }} --timeout=120s
+  loop: "{{ _galerabackup_list }}"
+  changed_when: false
+  when: _galerabackup_list | length > 0
+
+# RESTORE_PATTERN is left unquoted on purpose of the original script: the glob
+# is passed to restore_galera inside the pod.
+- name: Execute database restore for each GaleraRestore
+  ansible.builtin.shell: |
+    RESTORE_NAME="{{ item }}restore"
+    BACKUP_SOURCE="{{ item }}"
+    POD_NAME="${BACKUP_SOURCE}-restore-${RESTORE_NAME}"
+    TIMESTAMP="{{ cifmw_backup_restore_backup_timestamp }}"
+    RESTORE_PATTERN="/backup/data/*_${TIMESTAMP}.sql.gz"
+    oc exec -n {{ cifmw_backup_restore_namespace }} ${POD_NAME} -- \
+      /var/lib/backup-scripts/restore_galera --yes --content {{ cifmw_backup_restore_restore_content }} ${RESTORE_PATTERN}
+  loop: "{{ _galerabackup_list }}"
+  changed_when: true
+  when: _galerabackup_list | length > 0
+
+- name: List GaleraRestore CRs kept for validation
+  ansible.builtin.debug:
+    msg: "GaleraRestore CR '{{ item }}restore' kept for post-restore validation (cleaned up by cleanup step)"
+  loop: "{{ _galerabackup_list }}"
+  when: _galerabackup_list | length > 0
+
+# ========================================
+# Step 8: OVN database restore (optional; before full control plane resume)
+# ========================================
+- name: Include OVN database restore tasks
+  ansible.builtin.include_tasks: ovn_db_restore.yml
+
+# RabbitMQ credentials are restored automatically:
+# The infra-operator's RabbitMQ controller labels the default-user secret
+# for restore. On restore, the secret is restored in order 10, and the
+# controller reuses the existing credentials when creating the new cluster.
+
+# ========================================
+# Step 9: Resume Full Deployment
+# ========================================
+- name: Pause before resuming deployment
+  ansible.builtin.pause:
+    prompt: "Press Enter to resume full deployment, or Ctrl+C then 'A' to abort"
+  when: not (cifmw_backup_restore_auto_ack | bool)
+
+# A trailing "-" on the annotation key removes it from the control plane.
+- name: Remove deployment-stage annotation
+  ansible.builtin.shell: |
+    oc annotate openstackcontrolplane {{ _ctlplane_name.stdout }} \
+      -n {{ cifmw_backup_restore_namespace }} core.openstack.org/deployment-stage-
+  changed_when: true
+
+- name: Wait for OpenStackControlPlane to be ready
+  ansible.builtin.shell: |
+    oc wait openstackcontrolplane {{ _ctlplane_name.stdout }} \
+      -n {{ cifmw_backup_restore_namespace }} \
+      --for=condition=Ready \
+      --timeout={{ cifmw_backup_restore_ctlplane_ready_timeout }}
+  changed_when: false
+
+# ========================================
+# Step 10: Restore DataPlane (Order 60)
+# ========================================
+- name: Render dataplane restore
+  ansible.builtin.template:
+    src: 07-restore-order-60-dataplane.yaml.j2
+    dest: "{{ _cifmw_backup_restore_rendered_dir.path }}/07-restore-order-60-dataplane.yaml"
+    mode: "0644"
+  vars:
+    resources_backup_name: "{{ _resources_backup_name }}"
+    restore_suffix: "{{ _restore_suffix }}"
+
+- name: Create dataplane restore
+  kubernetes.core.k8s:
+    src: "{{ _cifmw_backup_restore_rendered_dir.path }}/07-restore-order-60-dataplane.yaml"
+    state: present
+
+- name: Wait for dataplane restore
+  ansible.builtin.include_tasks: wait_for_restore.yml
+  vars:
+    _restore_name: "openstack-restore-60-dataplane-{{ _restore_suffix }}"
+    _step_name: "Step 10 (DataPlane restore)"
+
+# ========================================
+# Step 11: EDPM Deployment
+# ========================================
+- name: Get DataPlaneNodeSet names
+  ansible.builtin.shell: |
+    oc get openstackdataplanenodeset -n {{ cifmw_backup_restore_namespace }} -o jsonpath='{.items[*].metadata.name}'
+  register: _nodeset_names
+  changed_when: false
+
+# Always define the list: split() on empty or whitespace-only output yields [],
+# so the tasks below guard on its length and never see an undefined fact or
+# render a deployment with an empty nodeSets list.
+- name: Set nodeset list
+  ansible.builtin.set_fact:
+    _nodeset_names_list: "{{ _nodeset_names.stdout.split() }}"
+
+- name: Render EDPM deployment CR
+  ansible.builtin.template:
+    src: 08-edpm-deployment.yaml.j2
+    dest: "{{ _cifmw_backup_restore_rendered_dir.path }}/08-edpm-deployment.yaml"
+    mode: "0644"
+  vars:
+    nodeset_names_list: "{{ _nodeset_names_list }}"
+    restore_suffix: "{{ _restore_suffix }}"
+  when: _nodeset_names_list | length > 0
+
+- name: Create EDPM deployment
+  kubernetes.core.k8s:
+    src: "{{ _cifmw_backup_restore_rendered_dir.path }}/08-edpm-deployment.yaml"
+    state: present
+  when: _nodeset_names_list | length > 0
+
+- name: Print EDPM deployment status
+  ansible.builtin.debug:
+    msg: "Created EDPM deployment: edpm-deployment-post-restore-{{ _restore_suffix }}"
+  when: _nodeset_names_list | length > 0
+
+# `oc wait` only observes the resource, so it never reports a change.
+- name: Wait for EDPM deployment to complete
+  ansible.builtin.command:
+    cmd: >-
+      oc wait OpenStackDataPlaneDeployment
+      edpm-deployment-post-restore-{{ _restore_suffix }}
+      --namespace={{ cifmw_backup_restore_namespace }}
+      --for=condition=Ready
+      --timeout={{ cifmw_backup_restore_edpm_deploy_timeout }}
+  changed_when: false
+  when: _nodeset_names_list | length > 0
+
+# ========================================
+# Step 12: Verify and sync Neutron to OVN (user-guide backup-restore Step 12)
+# ========================================
+# Run after EDPM so compute ovn-controller agents reconnect to the SB DB first.
+# Log mode does not change exit code for drift (Neutron logs drift as WARNING lines).
+# Repair runs if OVN file backup was skipped, or if log output contains WARNING drift.
+# Ref: https://github.com/openstack-k8s-operators/dev-docs/blob/main/backup-restore/user-guide.md#step-12-verify-and-sync-neutron-to-ovn
+- name: Verify Neutron vs OVN (neutron-ovn-db-sync-util log mode)
+  ansible.builtin.shell: |
+    set -o pipefail
+    oc exec -n {{ cifmw_backup_restore_namespace }} -c neutron-api deploy/neutron -- \
+      neutron-ovn-db-sync-util \
+        --config-file /usr/share/neutron/neutron-dist.conf \
+        --config-file /etc/neutron/neutron.conf \
+        --config-dir /etc/neutron/neutron.conf.d \
+        --ovn-neutron_sync_mode=log \
+        --debug
+  register: _neutron_ovn_sync_log
+  changed_when: false
+
+# Repair is needed when no OVN file restore was done, or when the log-mode
+# run printed a WARNING on stdout/stderr. "(not A) or (A and B)" is written
+# in its equivalent short form "(not A) or B".
+- name: Decide whether Neutron–OVN repair sync is required
+  ansible.builtin.set_fact:
+    _neutron_ovn_needs_repair: >-
+      {{
+        (not (cifmw_backup_restore_ovn_db | bool))
+        or (
+          ((_neutron_ovn_sync_log.stdout | default('')) ~ (_neutron_ovn_sync_log.stderr | default('')))
+          | regex_search('(?i)\bWARNING\b') is not none
+        )
+      }}
+
+- name: Report Neutron–OVN repair decision
+  ansible.builtin.debug:
+    msg: >-
+      neutron-ovn-db-sync repair:
+      {{ 'running' if _neutron_ovn_needs_repair | bool else 'skipped' }}.
+      {% if not (cifmw_backup_restore_ovn_db | bool) %}
+      Reason: cifmw_backup_restore_ovn_db is false (no OVN NB/SB file backup; OVN must be repopulated from Neutron).
+      {% elif _neutron_ovn_needs_repair | bool %}
+      Reason: log-mode output contained WARNING lines (Neutron-reported drift vs OVN).
+      {% else %}
+      Reason: OVN file backup/restore was used and log-mode output had no WARNING lines.
+      {% endif %}
+
+- name: Sync Neutron state to OVN database (repair mode)
+  ansible.builtin.shell: |
+    set -o pipefail
+    oc exec -n {{ cifmw_backup_restore_namespace }} -c neutron-api deploy/neutron -- \
+      neutron-ovn-db-sync-util \
+        --config-file /usr/share/neutron/neutron-dist.conf \
+        --config-file /etc/neutron/neutron.conf \
+        --config-dir /etc/neutron/neutron.conf.d \
+        --ovn-neutron_sync_mode=repair \
+        --debug
+  when: _neutron_ovn_needs_repair | bool
+  changed_when: true
+
+# ========================================
+# Cleanup and Summary
+# ========================================
+- name: Clean up rendered templates
+  ansible.builtin.file:
+    path: "{{ _cifmw_backup_restore_rendered_dir.path }}"
+    state: absent
+
+# `oc patch` needs an explicit resource name, so the hint first lists the
+# InstanceHa objects and then shows the patch with a <name> placeholder.
+- name: Print restore summary
+  ansible.builtin.debug:
+    msg:
+      - "========================================"
+      - "Restore Complete"
+      - "========================================"
+      - ""
+      - "ControlPlane: {{ _ctlplane_name.stdout }}"
+      - "Restore suffix: {{ _restore_suffix }}"
+      - ""
+      - "IMPORTANT: Re-enable InstanceHa after verifying the cloud:"
+      - "  oc get instanceha -n {{ cifmw_backup_restore_namespace }}"
+      - "  oc patch instanceha <name> -n {{ cifmw_backup_restore_namespace }} --type merge -p '{\"spec\":{\"disabled\":\"False\"}}'"
diff --git a/roles/cifmw_backup_restore/tasks/restore_pin_pvcs.yml b/roles/cifmw_backup_restore/tasks/restore_pin_pvcs.yml
new file mode 100644
index 000000000..d277c4ee0
--- /dev/null
+++ b/roles/cifmw_backup_restore/tasks/restore_pin_pvcs.yml
@@ -0,0 +1,114 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Pin PVCs to Original Nodes
+#
+# For Data Mover restores with WaitForFirstConsumer storage (e.g., LVM/TopoLVM),
+# creates dummy Deployments with nodeSelector to bind restored PVCs to their
+# original nodes. Downloads backup metadata to extract PVC-to-node mapping
+# from PV nodeAffinity.
+
+# Diagnostics go to stderr (>&2) so that stdout carries only the local temp
+# directory path, which the next task reads from _backup_download.stdout.
+- name: Download backup metadata
+  ansible.builtin.shell: |
+    set -e
+    TMPDIR=$(mktemp -d)
+    REMOTE_FILE="/tmp/backup-$$.tar.gz"
+    VELERO_POD=$(oc get pods -n {{ cifmw_backup_restore_oadp_namespace }} -l deploy=velero --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}')
+    oc exec -n {{ cifmw_backup_restore_oadp_namespace }} ${VELERO_POD} -- rm -f ${REMOTE_FILE} >&2 || true
+    oc exec -n {{ cifmw_backup_restore_oadp_namespace }} ${VELERO_POD} -- \
+      /velero backup download {{ _pvc_backup_name }} --insecure-skip-tls-verify -o ${REMOTE_FILE} >&2
+    oc exec -n {{ cifmw_backup_restore_oadp_namespace }} ${VELERO_POD} -- cat ${REMOTE_FILE} > ${TMPDIR}/backup.tar.gz
+    oc exec -n {{ cifmw_backup_restore_oadp_namespace }} ${VELERO_POD} -- rm -f ${REMOTE_FILE} >&2
+    mkdir -p ${TMPDIR}/backup
+    tar xzf ${TMPDIR}/backup.tar.gz -C ${TMPDIR}/backup
+    echo ${TMPDIR}
+  register: _backup_download
+  changed_when: false
+
+# Emits one "<pvc>:<node>" line per PV of the target namespace that carries a
+# topolvm node-affinity expression. The match is tested with if/fi rather than
+# an `&&` chain: the script's exit status is that of the last loop command, so
+# a non-matching final PV used to make the whole task fail.
+- name: Extract PVC-to-node mapping from PV nodeAffinity
+  ansible.builtin.shell: |
+    set -o pipefail
+    BACKUP_DIR="{{ _backup_download.stdout }}/backup"
+    PV_DIR="${BACKUP_DIR}/resources/persistentvolumes/cluster"
+    if [ ! -d "${PV_DIR}" ]; then
+      echo "WARNING: No PV resources found in backup" >&2
+      exit 0
+    fi
+    for f in ${PV_DIR}/*.json; do
+      # An unmatched glob stays literal; skip it instead of feeding it to jq.
+      [ -e "$f" ] || continue
+      pvc=$(jq -r '.spec.claimRef.name // empty' "$f")
+      ns=$(jq -r '.spec.claimRef.namespace // empty' "$f")
+      node=$(jq -r '
+        .spec.nodeAffinity.required.nodeSelectorTerms[0].matchExpressions[]
+        | select(.key | contains("topolvm")) | .values[0]
+      ' "$f" 2>/dev/null)
+      if [ -n "$pvc" ] && [ -n "$node" ] && [ "$ns" = "{{ cifmw_backup_restore_namespace }}" ]; then
+        echo "${pvc}:${node}"
+      fi
+    done
+  register: _pvc_node_mapping
+  changed_when: false
+
+- name: Print PVC-to-node mapping
+  ansible.builtin.debug:
+    msg: "{{ _pvc_node_mapping.stdout_lines }}"
+  when: _pvc_node_mapping.stdout != ""
+
+- name: Create dummy Deployments to pin PVCs to nodes
+  ansible.builtin.shell: |
+    set -o pipefail
+    PVC_NAME="{{ item.split(':')[0] }}"
+    NODE_NAME="{{ item.split(':')[1] }}"
+    cat <- + oc wait galerabackup --all -n {{ cifmw_backup_restore_namespace }} + --for=jsonpath='{.status.conditions[0].status}'=True --timeout=120s + changed_when: false
+
+- name: Wait for GaleraBackup cronjobs to be created
+  ansible.builtin.shell: |
+    oc get cronjob -n {{ cifmw_backup_restore_namespace }} -l app=galera -o jsonpath='{.items[*].metadata.name}'
+  register: _galera_backup_cronjobs
+  changed_when: false
+  until: _galera_backup_cronjobs.stdout.split() | length >= _galera_instances | length
+  retries: 30
+  delay: 10
+
+- name: Cleanup rendered templates
+  ansible.builtin.file:
+    path: "{{ _galerabackup_rendered_dir.path }}"
+    state: absent
+
+- name: Display GaleraBackup setup complete
+  ansible.builtin.debug:
+    msg: "GaleraBackup CRs created for: {{ _galera_instances | join(', ') }}"
diff --git a/roles/cifmw_backup_restore/tasks/wait_for_restore.yml b/roles/cifmw_backup_restore/tasks/wait_for_restore.yml
new file mode 100644
index 000000000..956ccec7b
--- /dev/null
+++ b/roles/cifmw_backup_restore/tasks/wait_for_restore.yml
@@ -0,0 +1,79 @@
+---
+# Copyright Red Hat, Inc.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+# Reusable task to wait for a Velero Restore to complete.
+#
+# Required variables:
+#   _restore_name: Name of the Velero Restore CR
+#   _step_name: Human-readable step name for log messages
+#
+# Also reads cifmw_backup_restore_oadp_namespace,
+# cifmw_backup_restore_restore_timeout and cifmw_backup_restore_strict_restore.
+# Outcome: continues on "Completed"; fails on "Failed"/"FailedValidation";
+# on "PartiallyFailed" fails only when strict mode is enabled.
+
+# Polls the Restore phase every 10 s until it reaches one of the four listed
+# phases. The retry count is the timeout divided by the 10 s delay
+# (presumably the timeout is a number of seconds — TODO confirm in defaults).
+- name: "Wait for restore to complete - {{ _step_name }}"
+  ansible.builtin.command:
+    cmd: >-
+      oc get restore {{ _restore_name }}
+      -n {{ cifmw_backup_restore_oadp_namespace }}
+      -o jsonpath='{.status.phase}'
+  register: _restore_phase_raw
+  changed_when: false
+  until: _restore_phase_raw.stdout in ["Completed", "PartiallyFailed", "Failed", "FailedValidation"]
+  retries: "{{ (cifmw_backup_restore_restore_timeout | int / 10) | int }}"
+  delay: 10
+
+# Copies the observed phase into the _restore_phase fact used by every check below.
+- name: "Set restore phase fact - {{ _step_name }}"
+  ansible.builtin.set_fact:
+    _restore_phase:
+      stdout: "{{ _restore_phase_raw.stdout }}"
+
+# Diagnostics only: dumps .status (pretty-printed JSON, falling back to the
+# full YAML) and the last 50 lines of the Velero restore log. The log fetch
+# is best-effort (`|| true`).
+- name: "Get restore details on non-Completed phase - {{ _step_name }}"
+  ansible.builtin.shell: |
+    set -o pipefail
+    echo "=== Restore Status ==="
+    oc get restore {{ _restore_name }} -n {{ cifmw_backup_restore_oadp_namespace }} \
+      -o jsonpath='{.status}' | python3 -m json.tool 2>/dev/null || \
+    oc get restore {{ _restore_name }} -n {{ cifmw_backup_restore_oadp_namespace }} \
+      -o yaml
+    echo ""
+    echo "=== Velero Restore Logs ==="
+    oc -n {{ cifmw_backup_restore_oadp_namespace }} exec deployment/velero -- \
+      ./velero restore logs {{ _restore_name }} 2>/dev/null | tail -50 || true
+  register: _restore_details
+  changed_when: false
+  when: _restore_phase.stdout != "Completed"
+
+- name: "Print restore diagnostics - {{ _step_name }}"
+  ansible.builtin.debug:
+    msg: "{{ _restore_details.stdout_lines }}"
+  when: _restore_phase.stdout != "Completed"
+
+# Hard failures abort regardless of strict mode.
+- name: "Fail on Failed restore - {{ _step_name }}"
+  ansible.builtin.fail:
+    msg: "Restore {{ _restore_name }} {{ _restore_phase.stdout }}."
+  when: _restore_phase.stdout in ["Failed", "FailedValidation"]
+
+# PartiallyFailed is fatal only when cifmw_backup_restore_strict_restore is true.
+- name: "Fail on PartiallyFailed restore (strict mode) - {{ _step_name }}"
+  ansible.builtin.fail:
+    msg: "Restore {{ _restore_name }} {{ _restore_phase.stdout }}. Re-run with -e cifmw_backup_restore_strict_restore=false to continue."
+  when: _restore_phase.stdout == "PartiallyFailed" and (cifmw_backup_restore_strict_restore | bool)
+
+- name: "Warn on PartiallyFailed restore (non-strict mode) - {{ _step_name }}"
+  ansible.builtin.debug:
+    msg: "WARNING: Restore {{ _restore_name }} completed with status: {{ _restore_phase.stdout }}"
+  when: _restore_phase.stdout == "PartiallyFailed" and not (cifmw_backup_restore_strict_restore | bool)
+
+- name: "Print success - {{ _step_name }}"
+  ansible.builtin.debug:
+    msg: "Restore {{ _restore_name }} completed successfully"
+  when: _restore_phase.stdout == "Completed"
diff --git a/roles/cifmw_backup_restore/templates/00-resource-modifiers-configmap.yaml.j2 b/roles/cifmw_backup_restore/templates/00-resource-modifiers-configmap.yaml.j2
new file mode 100644
index 000000000..8df9e3a25
--- /dev/null
+++ b/roles/cifmw_backup_restore/templates/00-resource-modifiers-configmap.yaml.j2
@@ -0,0 +1,39 @@
+---
+# Resource Modifier ConfigMap for Velero Restores
+# Referenced by name from the `resourceModifier` field of the Restore CRs
+# rendered by this role. The rules below clear ownerReferences and the
+# last-applied-configuration annotation, stage the control plane as
+# "infrastructure-only", and set InstanceHa spec.disabled to "True".
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: openstack-restore-resource-modifiers
+  namespace: {{ cifmw_backup_restore_oadp_namespace }}
+data:
+  resource-modifiers.yaml: |
+    version: v1
+    resourceModifierRules:
+      - conditions:
+          groupResource: "*"
+          namespaces:
+            - {{ cifmw_backup_restore_namespace }}
+        mergePatches:
+          - patchData: |
+              metadata:
+                ownerReferences: null
+                annotations:
+                  kubectl.kubernetes.io/last-applied-configuration: null
+      - conditions:
+          groupResource: openstackcontrolplanes.core.openstack.org
+          namespaces:
+            - {{ cifmw_backup_restore_namespace }}
+        mergePatches:
+          - patchData: |
+              metadata:
+                annotations:
+                  kubectl.kubernetes.io/last-applied-configuration: null
+                  core.openstack.org/deployment-stage: "infrastructure-only"
+      - conditions:
+          groupResource: instancehas.instanceha.openstack.org
+          namespaces:
+            - {{ cifmw_backup_restore_namespace }}
+        mergePatches:
+          - patchData: |
+              spec:
+                disabled: "True"
diff --git a/roles/cifmw_backup_restore/templates/01-restore-order-00-pvcs.yaml.j2 b/roles/cifmw_backup_restore/templates/01-restore-order-00-pvcs.yaml.j2
new file mode 100644
index 000000000..175eecfc2
--- /dev/null
+++ b/roles/cifmw_backup_restore/templates/01-restore-order-00-pvcs.yaml.j2
@@ -0,0 +1,17 @@
+---
+# Restore Order 00: Storage Foundation - PVCs
+# Restores from the PVC backup (pvc_backup_name) rather than the resources
+# backup used by the later orders. Expects pvc_backup_name and restore_suffix
+# as template vars.
+apiVersion: velero.io/v1
+kind: Restore
+metadata:
+  name: openstack-restore-00-pvcs-{{ restore_suffix }}
+  namespace: {{ cifmw_backup_restore_oadp_namespace }}
+spec:
+  backupName: {{ pvc_backup_name }}
+  includedNamespaces:
+    - {{ cifmw_backup_restore_namespace }}
+  # Pods captured in the PVC backup are not restored.
+  excludedResources:
+    - pods
+  resourceModifier:
+    kind: ConfigMap
+    name: openstack-restore-resource-modifiers
+  restorePVs: true
diff --git a/roles/cifmw_backup_restore/templates/02-restore-order-10-foundation.yaml.j2 b/roles/cifmw_backup_restore/templates/02-restore-order-10-foundation.yaml.j2
new file mode 100644
index 000000000..353b8db49
--- /dev/null
+++ b/roles/cifmw_backup_restore/templates/02-restore-order-10-foundation.yaml.j2
@@ -0,0 +1,19 @@
+---
+# Restore Order 10: Foundation Resources
+# Restores NADs, Secrets, ConfigMaps (user-provided resources without ownerRefs)
+# Expects resources_backup_name and restore_suffix as template vars.
+apiVersion: velero.io/v1
+kind: Restore
+metadata:
+  name: openstack-restore-10-foundation-{{ restore_suffix }}
+  namespace: {{ cifmw_backup_restore_oadp_namespace }}
+spec:
+  backupName: {{ resources_backup_name }}
+  includedNamespaces:
+    - {{ cifmw_backup_restore_namespace }}
+  # Only objects labelled restore=true with restore-order "10" are selected.
+  labelSelector:
+    matchLabels:
+      backup.openstack.org/restore: "true"
+      backup.openstack.org/restore-order: "10"
+  resourceModifier:
+    kind: ConfigMap
+    name: openstack-restore-resource-modifiers
diff --git a/roles/cifmw_backup_restore/templates/03-restore-order-20-infrastructure.yaml.j2 b/roles/cifmw_backup_restore/templates/03-restore-order-20-infrastructure.yaml.j2
new file mode 100644
index 000000000..e3d9b97c6
--- /dev/null
+++ b/roles/cifmw_backup_restore/templates/03-restore-order-20-infrastructure.yaml.j2
@@ -0,0 +1,20 @@
+---
+# Restore Order 20: Infrastructure CRs
+# Restores OpenStackVersion, OpenStackBackupConfig, Issuers,
+# NetConfig, Topology, BGPConfiguration, DNSData, InstanceHa
+# Expects resources_backup_name and restore_suffix as template vars.
+apiVersion: velero.io/v1
+kind: Restore
+metadata:
+  name: openstack-restore-20-infra-{{ restore_suffix }}
+  namespace: {{ cifmw_backup_restore_oadp_namespace }}
+spec:
+  backupName: {{ resources_backup_name }}
+  includedNamespaces:
+    - {{ cifmw_backup_restore_namespace }}
+  # Only objects labelled restore=true with restore-order "20" are selected.
+  labelSelector:
+    matchLabels:
+      backup.openstack.org/restore: "true"
+      backup.openstack.org/restore-order: "20"
+  resourceModifier:
+    kind: ConfigMap
+    name: openstack-restore-resource-modifiers
diff --git a/roles/cifmw_backup_restore/templates/04-restore-order-30-controlplane.yaml.j2 b/roles/cifmw_backup_restore/templates/04-restore-order-30-controlplane.yaml.j2
new file mode 100644
index 000000000..4faabfcb0
--- /dev/null
+++ b/roles/cifmw_backup_restore/templates/04-restore-order-30-controlplane.yaml.j2
@@ -0,0 +1,19 @@
+---
+# Restore Order 30: OpenStackControlPlane (Staged)
+# Restores with deployment-stage annotation for infrastructure-only start
+# Expects resources_backup_name and restore_suffix as template vars.
+apiVersion: velero.io/v1
+kind: Restore
+metadata:
+  name: openstack-restore-30-ctlplane-{{ restore_suffix }}
+  namespace: {{ cifmw_backup_restore_oadp_namespace }}
+spec:
+  backupName: {{ resources_backup_name }}
+  includedNamespaces:
+    - {{ cifmw_backup_restore_namespace }}
+  # Only objects labelled restore=true with restore-order "30" are selected.
+  labelSelector:
+    matchLabels:
+      backup.openstack.org/restore: "true"
+      backup.openstack.org/restore-order: "30"
+  resourceModifier:
+    kind: ConfigMap
+    name: openstack-restore-resource-modifiers
diff --git a/roles/cifmw_backup_restore/templates/05-restore-order-40-backup-config.yaml.j2 b/roles/cifmw_backup_restore/templates/05-restore-order-40-backup-config.yaml.j2
new file mode 100644
index 000000000..0e9dc6bd7
--- /dev/null
+++ b/roles/cifmw_backup_restore/templates/05-restore-order-40-backup-config.yaml.j2
@@ -0,0 +1,18 @@
+---
+# Restore Order 40: Backup Configuration, IP Sets & DataPlane Services
+# Expects resources_backup_name and restore_suffix as template vars.
+apiVersion: velero.io/v1
+kind: Restore
+metadata:
+  name: openstack-restore-40-backup-{{ restore_suffix }}
+  namespace: {{ cifmw_backup_restore_oadp_namespace }}
+spec:
+  backupName: {{ resources_backup_name }}
+  includedNamespaces:
+    - {{ cifmw_backup_restore_namespace }}
+  # Only objects labelled restore=true with restore-order "40" are selected.
+  labelSelector:
+    matchLabels:
+      backup.openstack.org/restore: "true"
+      backup.openstack.org/restore-order: "40"
+  resourceModifier:
+    kind: ConfigMap
+    name: openstack-restore-resource-modifiers
diff --git a/roles/cifmw_backup_restore/templates/06a-galerarestore.yaml.j2 b/roles/cifmw_backup_restore/templates/06a-galerarestore.yaml.j2
new file mode 100644
index 000000000..58ee92a6c
--- /dev/null
+++ b/roles/cifmw_backup_restore/templates/06a-galerarestore.yaml.j2
@@ -0,0 +1,14 @@
+---
+# GaleraRestore CRs for database restore
+# Renders one GaleraRestore named "<backup>restore" per entry of
+# galerabackup_list, separated by "---" document markers.
+{% for backup_name in galerabackup_list %}
+apiVersion: mariadb.openstack.org/v1beta1
+kind: GaleraRestore
+metadata:
+  name: {{ backup_name }}restore
+  namespace: {{ cifmw_backup_restore_namespace }}
+spec:
+  backupSource: {{ backup_name }}
+{% if not loop.last %}
+---
+{% endif %}
+{% endfor %}
diff --git a/roles/cifmw_backup_restore/templates/07-restore-order-60-dataplane.yaml.j2 b/roles/cifmw_backup_restore/templates/07-restore-order-60-dataplane.yaml.j2
new file mode 100644
index 000000000..9ae0644ac
--- /dev/null
+++ b/roles/cifmw_backup_restore/templates/07-restore-order-60-dataplane.yaml.j2
@@ -0,0 +1,18 @@
+---
+# Restore Order 60: DataPlane Resources
+# Expects resources_backup_name and restore_suffix as template vars.
+apiVersion: velero.io/v1
+kind: Restore
+metadata:
+  name: openstack-restore-60-dataplane-{{ restore_suffix }}
+  namespace: {{ cifmw_backup_restore_oadp_namespace }}
+spec:
+  backupName: {{ resources_backup_name }}
+  includedNamespaces:
+    - {{ cifmw_backup_restore_namespace }}
+  # Only objects labelled restore=true with restore-order "60" are selected.
+  labelSelector:
+    matchLabels:
+      backup.openstack.org/restore: "true"
+      backup.openstack.org/restore-order: "60"
+  resourceModifier:
+    kind: ConfigMap
+    name: openstack-restore-resource-modifiers
diff --git a/roles/cifmw_backup_restore/templates/08-edpm-deployment.yaml.j2 b/roles/cifmw_backup_restore/templates/08-edpm-deployment.yaml.j2
new file mode 100644
index 000000000..9b727e8d8
--- /dev/null
+++ b/roles/cifmw_backup_restore/templates/08-edpm-deployment.yaml.j2
@@ -0,0 +1,12 @@
+---
+# Post-Restore EDPM Deployment
+# Lists every entry of nodeset_names_list under spec.nodeSets; the name
+# carries restore_suffix.
+apiVersion: dataplane.openstack.org/v1beta1
+kind: OpenStackDataPlaneDeployment
+metadata:
+  name: edpm-deployment-post-restore-{{ restore_suffix }}
+  namespace: {{ cifmw_backup_restore_namespace }}
+spec:
+  nodeSets:
+{% for nodeset in nodeset_names_list %}
+    - {{ nodeset }}
+{% endfor %}
diff --git a/roles/cifmw_backup_restore/templates/backup-pvcs.yaml.j2 b/roles/cifmw_backup_restore/templates/backup-pvcs.yaml.j2
new file mode 100644
index 000000000..0f0058119
--- /dev/null
+++ b/roles/cifmw_backup_restore/templates/backup-pvcs.yaml.j2
@@ -0,0 +1,48 @@
+---
+# OpenStack PVC Backup
+# Backs up PVCs labeled with backup.openstack.org/backup=true using CSI snapshots.
+apiVersion: velero.io/v1 +kind: Backup +metadata: + name: openstack-backup-pvcs-{{ cifmw_backup_restore_backup_name_suffix }} + namespace: {{ cifmw_backup_restore_oadp_namespace }} + annotations: + openstack.org/csv-version: "{{ _backup_csv_version }}" + openstack.org/catalog-source-image: "{{ _backup_catalog_image }}" + openstack.org/operator-image: "{{ _backup_operator_image }}" +spec: + includedNamespaces: + - {{ cifmw_backup_restore_namespace }} + labelSelector: + matchLabels: + backup.openstack.org/backup: "true" + snapshotVolumes: true + defaultVolumesToFsBackup: false +{% if cifmw_backup_restore_snapshot_move_data | bool %} + snapshotMoveData: true +{% endif %} + volumeSnapshotLocations: [] + storageLocation: {{ cifmw_backup_restore_storage_location }} + ttl: {{ cifmw_backup_restore_backup_ttl }} + hooks: + resources: + - name: swift-xattr-backup + includedNamespaces: + - {{ cifmw_backup_restore_namespace }} + labelSelector: + matchLabels: + component: swift-storage + pre: + - exec: + container: object-server + command: + - /bin/bash + - -c + - | + set -e + DUMP="/srv/node/pv/.swift-xattrs.dump" + rm -f "$DUMP" "${DUMP}.applied" "${DUMP}.missing" + getfattr -R -d -m user.swift /srv/node/pv/ 1> "$DUMP" + echo "xattr backup complete: $(grep -c '^# file:' "$DUMP") files" + onError: Fail + timeout: {{ cifmw_backup_restore_swift_xattr_timeout | default('300s') }} diff --git a/roles/cifmw_backup_restore/templates/backup-resources.yaml.j2 b/roles/cifmw_backup_restore/templates/backup-resources.yaml.j2 new file mode 100644 index 000000000..05f4dc471 --- /dev/null +++ b/roles/cifmw_backup_restore/templates/backup-resources.yaml.j2 @@ -0,0 +1,20 @@ +--- +# OpenStack Resources Backup (excluding PVCs) +# Backs up all resources in the OpenStack namespace except PVCs and PVs. 
+apiVersion: velero.io/v1 +kind: Backup +metadata: + name: openstack-backup-resources-{{ cifmw_backup_restore_backup_name_suffix }} + namespace: {{ cifmw_backup_restore_oadp_namespace }} + annotations: + openstack.org/csv-version: "{{ _backup_csv_version }}" + openstack.org/catalog-source-image: "{{ _backup_catalog_image }}" + openstack.org/operator-image: "{{ _backup_operator_image }}" +spec: + includedNamespaces: + - {{ cifmw_backup_restore_namespace }} + excludedResources: + - persistentvolumeclaims + - persistentvolumes + storageLocation: {{ cifmw_backup_restore_storage_location }} + ttl: {{ cifmw_backup_restore_backup_ttl }} diff --git a/roles/cifmw_backup_restore/templates/galerabackup.yaml.j2 b/roles/cifmw_backup_restore/templates/galerabackup.yaml.j2 new file mode 100644 index 000000000..88f20c622 --- /dev/null +++ b/roles/cifmw_backup_restore/templates/galerabackup.yaml.j2 @@ -0,0 +1,13 @@ +apiVersion: mariadb.openstack.org/v1beta1 +kind: GaleraBackup +metadata: + name: {{ galera_instance_name }} + namespace: {{ cifmw_backup_restore_namespace }} +spec: + databaseInstance: {{ galera_instance_name }} +{% if _galera_storage_class %} + storageClass: {{ _galera_storage_class }} +{% endif %} + storageRequest: {{ cifmw_backup_restore_galera_storage_request }} + transferStorage: + storageRequest: {{ cifmw_backup_restore_galera_transfer_storage_request }} diff --git a/zuul.d/molecule.yaml b/zuul.d/molecule.yaml index 9f44f4685..903d2e114 100644 --- a/zuul.d/molecule.yaml +++ b/zuul.d/molecule.yaml @@ -902,6 +902,15 @@ - ^.config/molecule/.* name: cifmw-molecule-ci_lvms_storage parent: cifmw-molecule-noop +- job: + files: + - ^common-requirements.txt + - ^test-requirements.txt + - ^roles/cifmw_backup_restore/.* + - ^ci/playbooks/molecule.* + - ^.config/molecule/.* + name: cifmw-molecule-cifmw_backup_restore + parent: cifmw-molecule-noop - job: files: - ^common-requirements.txt diff --git a/zuul.d/projects.yaml b/zuul.d/projects.yaml index 6ea6ca035..9b4b8e84b 100644 
--- a/zuul.d/projects.yaml +++ b/zuul.d/projects.yaml @@ -26,6 +26,7 @@ - cifmw-molecule-ci_network - cifmw-molecule-ci_nmstate - cifmw-molecule-ci_setup + - cifmw-molecule-cifmw_backup_restore - cifmw-molecule-cifmw_block_device - cifmw-molecule-cifmw_ceph_client - cifmw-molecule-cifmw_ceph_spec From 80aec0d811dfcd84d86e2b77d2f5bdd3b71fb1bb Mon Sep 17 00:00:00 2001 From: bshewale Date: Thu, 30 Apr 2026 11:56:36 +0530 Subject: [PATCH 19/22] Switch baremetal and end-to-end jobs to 4.20 This PR switched baremetal and end-to-end jobs to OCP- 4.20 Jobs: baremetal and end-to-end Signed-off-by: Bhagyashri Shewale bshewale@redhat.com --- zuul.d/edpm.yaml | 6 +++--- zuul.d/end-to-end.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/zuul.d/edpm.yaml b/zuul.d/edpm.yaml index 80506c82e..9debdedef 100644 --- a/zuul.d/edpm.yaml +++ b/zuul.d/edpm.yaml @@ -13,7 +13,7 @@ # Virtual Baremetal job with CRC and single compute node. - job: name: cifmw-crc-podified-edpm-baremetal - nodeset: centos-9-crc-2-48-0-6xlarge + nodeset: centos-9-crc-2-56-0-6xlarge parent: cifmw-base-crc-openstack run: ci/playbooks/edpm_baremetal_deployment/run.yml irrelevant-files: @@ -32,7 +32,7 @@ # Virtual Baremetal job with CRC and single bootc compute node. - job: name: cifmw-crc-podified-edpm-baremetal-bootc - nodeset: centos-9-crc-2-48-0-6xlarge + nodeset: centos-9-crc-2-56-0-6xlarge parent: cifmw-base-crc-openstack run: ci/playbooks/edpm_baremetal_deployment/run.yml vars: @@ -51,7 +51,7 @@ # First deploys with pre-update index image (Phase 1), then updates with PR index image (Phase 2). 
- job: name: cifmw-crc-podified-edpm-baremetal-minor-update - nodeset: centos-9-crc-2-48-0-6xlarge + nodeset: centos-9-crc-2-56-0-6xlarge parent: cifmw-base-crc-openstack run: - ci/playbooks/edpm_baremetal_update/run.yml diff --git a/zuul.d/end-to-end.yaml b/zuul.d/end-to-end.yaml index 5c0f1babd..c94ad9d1c 100644 --- a/zuul.d/end-to-end.yaml +++ b/zuul.d/end-to-end.yaml @@ -2,7 +2,7 @@ # cifmw base job - job: name: cifmw-end-to-end-base - nodeset: centos-9-crc-2-48-0-3xl + nodeset: centos-9-crc-2-56-0-3xl parent: base-simple-crc vars: crc_parameters: "--memory 24000 --disk-size 120 --cpus 8" From b852719f8bfb23c6622979034a2722b73389adbd Mon Sep 17 00:00:00 2001 From: Martin Schuppert Date: Wed, 20 May 2026 07:58:12 +0200 Subject: [PATCH 20/22] [cifmw_backup_restore] Fix post-restore validation and cleanup - Wait for compute services and network agents to be ready with retry loops before proceeding to workload validation, preventing tempest from running against a partially recovered control plane - Delete test-operator CRs (Tempest, Tobiko, AnsibleTest, HorizonTest) at the beginning of cleanup while controllers and dependencies are still running, so finalizers get processed properly - Wait for test-operator pods to terminate after CR deletion - Adapt GaleraRestore pod discovery to the shortened resource names from mariadb-operator which drops the galera instance name prefix from generated resources (restore- instead of -restore-). 
Uses the galerarestore/name label selector when available, with fallback to the old naming convention so this change can land independently of the mariadb-operator PR - Increase control plane ready timeout from 10m to 30m - Fix loop_var collision with _delete_all_of_kind.yml Related-To: https://github.com/openstack-k8s-operators/mariadb-operator/pull/463 Co-Authored-By: Claude Opus 4.6 Signed-off-by: Martin Schuppert --- roles/cifmw_backup_restore/defaults/main.yml | 6 +++- roles/cifmw_backup_restore/tasks/cleanup.yml | 32 +++++++++++++++++ roles/cifmw_backup_restore/tasks/e2e.yml | 34 +++++++++++++++---- roles/cifmw_backup_restore/tasks/restore.yml | 25 +++++++++----- .../templates/06a-galerarestore.yaml.j2 | 2 +- 5 files changed, 83 insertions(+), 16 deletions(-) diff --git a/roles/cifmw_backup_restore/defaults/main.yml b/roles/cifmw_backup_restore/defaults/main.yml index 91493fb82..991819fcc 100644 --- a/roles/cifmw_backup_restore/defaults/main.yml +++ b/roles/cifmw_backup_restore/defaults/main.yml @@ -66,11 +66,15 @@ cifmw_backup_restore_ovn_db_ready_timeout: 5m cifmw_backup_restore_restore_timeout: 900 cifmw_backup_restore_edpm_deploy_timeout: 40m cifmw_backup_restore_infra_ready_timeout: 20m -cifmw_backup_restore_ctlplane_ready_timeout: 10m +cifmw_backup_restore_ctlplane_ready_timeout: 30m cifmw_backup_restore_strict_restore: true cifmw_backup_restore_restore_content: data cifmw_backup_restore_pin_pvcs: false +# Post-restore service readiness +cifmw_backup_restore_service_retry_count: 30 +cifmw_backup_restore_service_retry_delay: 10 + # Cleanup cifmw_backup_restore_cleanup_ctlplane: true cifmw_backup_restore_cleanup_dataplane: true diff --git a/roles/cifmw_backup_restore/tasks/cleanup.yml b/roles/cifmw_backup_restore/tasks/cleanup.yml index 49e161010..d66864156 100644 --- a/roles/cifmw_backup_restore/tasks/cleanup.yml +++ b/roles/cifmw_backup_restore/tasks/cleanup.yml @@ -71,6 +71,38 @@ msg: "Cleanup cancelled by user" when: not (cifmw_backup_restore_auto_ack 
| bool) and _delete_confirm.user_input != "yes" +# ======================================== +# Test-operator Cleanup (must happen first while controllers and +# their dependencies are still running, so finalizers get processed) +# ======================================== +- name: Delete test-operator CRs + ansible.builtin.include_tasks: _delete_all_of_kind.yml + vars: + _resource_api_version: test.openstack.org/v1beta1 + _resource_kind: "{{ _test_cr_kind }}" + _resource_wait: true + loop: + - Tempest + - Tobiko + - AnsibleTest + - HorizonTest + loop_control: + loop_var: _test_cr_kind + when: cifmw_backup_restore_cleanup_ctlplane | bool + +- name: Wait for test-operator pods to terminate + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + namespace: "{{ cifmw_backup_restore_namespace }}" + label_selectors: + - operator=test-operator + register: _test_pods + until: _test_pods.resources | length == 0 + retries: 12 + delay: 5 + when: cifmw_backup_restore_cleanup_ctlplane | bool + # ======================================== # DataPlane Cleanup # ======================================== diff --git a/roles/cifmw_backup_restore/tasks/e2e.yml b/roles/cifmw_backup_restore/tasks/e2e.yml index b7ae118ae..aff099e58 100644 --- a/roles/cifmw_backup_restore/tasks/e2e.yml +++ b/roles/cifmw_backup_restore/tasks/e2e.yml @@ -131,29 +131,51 @@ _os_exec: >- oc exec -t openstackclient -n {{ cifmw_backup_restore_namespace }} -- block: - - name: Verify compute services are up + - name: Wait for compute services to be up ansible.builtin.shell: | set -o pipefail {{ _os_exec }} openstack compute service list -f json | \ - jq -r '.[] | "\(.Binary) \(.Host) \(.State)"' + jq -e 'length > 0 and all(.[]; .State == "up")' register: _compute_services changed_when: false + retries: "{{ cifmw_backup_restore_service_retry_count }}" + delay: "{{ cifmw_backup_restore_service_retry_delay }}" + until: _compute_services.rc == 0 + + - name: Display compute services + ansible.builtin.shell: | + 
set -o pipefail + {{ _os_exec }} openstack compute service list -f json | \ + jq -r '.[] | "\(.Binary) \(.Host) \(.State)"' + register: _compute_services_display + changed_when: false - name: Display compute services ansible.builtin.debug: - msg: "{{ _compute_services.stdout_lines }}" + msg: "{{ _compute_services_display.stdout_lines }}" - - name: Verify network agents are up + - name: Wait for network agents to be alive ansible.builtin.shell: | set -o pipefail {{ _os_exec }} openstack network agent list -f json | \ - jq -r '.[] | "\(.["Agent Type"]) \(.Host) \(.Alive)"' + jq -e 'length > 0 and all(.[]; .Alive == true)' register: _network_agents changed_when: false + retries: "{{ cifmw_backup_restore_service_retry_count }}" + delay: "{{ cifmw_backup_restore_service_retry_delay }}" + until: _network_agents.rc == 0 + + - name: Display network agents + ansible.builtin.shell: | + set -o pipefail + {{ _os_exec }} openstack network agent list -f json | \ + jq -r '.[] | "\(.["Agent Type"]) \(.Host) \(.Alive)"' + register: _network_agents_display + changed_when: false - name: Display network agents ansible.builtin.debug: - msg: "{{ _network_agents.stdout_lines }}" + msg: "{{ _network_agents_display.stdout_lines }}" - name: Get instance info ansible.builtin.shell: | diff --git a/roles/cifmw_backup_restore/tasks/restore.yml b/roles/cifmw_backup_restore/tasks/restore.yml index eae150397..2eed66b72 100644 --- a/roles/cifmw_backup_restore/tasks/restore.yml +++ b/roles/cifmw_backup_restore/tasks/restore.yml @@ -325,19 +325,28 @@ - name: Wait for GaleraRestore pods to be ready ansible.builtin.shell: | - RESTORE_NAME="{{ item }}restore" - BACKUP_SOURCE="{{ item }}" - POD_NAME="${BACKUP_SOURCE}-restore-${RESTORE_NAME}" - oc wait --for=condition=Ready pod/${POD_NAME} -n {{ cifmw_backup_restore_namespace }} --timeout=120s + POD_NAME=$(oc get pod \ + -l galerarestore/name={{ item }} \ + -n {{ cifmw_backup_restore_namespace }} \ + -o jsonpath='{.items[0].metadata.name}' 
2>/dev/null) + if [ -z "${POD_NAME}" ]; then + POD_NAME="{{ item }}-restore-{{ item }}" + fi + oc wait --for=condition=Ready pod/${POD_NAME} \ + -n {{ cifmw_backup_restore_namespace }} --timeout=120s loop: "{{ _galerabackup_list }}" changed_when: false when: _galerabackup_list | length > 0 - name: Execute database restore for each GaleraRestore ansible.builtin.shell: | - RESTORE_NAME="{{ item }}restore" - BACKUP_SOURCE="{{ item }}" - POD_NAME="${BACKUP_SOURCE}-restore-${RESTORE_NAME}" + POD_NAME=$(oc get pod \ + -l galerarestore/name={{ item }} \ + -n {{ cifmw_backup_restore_namespace }} \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [ -z "${POD_NAME}" ]; then + POD_NAME="{{ item }}-restore-{{ item }}" + fi TIMESTAMP="{{ cifmw_backup_restore_backup_timestamp }}" RESTORE_PATTERN="/backup/data/*_${TIMESTAMP}.sql.gz" oc exec -n {{ cifmw_backup_restore_namespace }} ${POD_NAME} -- \ @@ -348,7 +357,7 @@ - name: List GaleraRestore CRs kept for validation ansible.builtin.debug: - msg: "GaleraRestore CR '{{ item }}restore' kept for post-restore validation (cleaned up by cleanup step)" + msg: "GaleraRestore CR '{{ item }}' kept for post-restore validation (cleaned up by cleanup step)" loop: "{{ _galerabackup_list }}" when: _galerabackup_list | length > 0 diff --git a/roles/cifmw_backup_restore/templates/06a-galerarestore.yaml.j2 b/roles/cifmw_backup_restore/templates/06a-galerarestore.yaml.j2 index 58ee92a6c..e05630f47 100644 --- a/roles/cifmw_backup_restore/templates/06a-galerarestore.yaml.j2 +++ b/roles/cifmw_backup_restore/templates/06a-galerarestore.yaml.j2 @@ -4,7 +4,7 @@ apiVersion: mariadb.openstack.org/v1beta1 kind: GaleraRestore metadata: - name: {{ backup_name }}restore + name: {{ backup_name }} namespace: {{ cifmw_backup_restore_namespace }} spec: backupSource: {{ backup_name }} From fea06348eb0ae62be1c3270e7a537b4200fd0145 Mon Sep 17 00:00:00 2001 From: Miguel Angel Nieto Jimenez Date: Wed, 20 May 2026 15:03:01 +0200 Subject: [PATCH 21/22] 
[ci_gen_kustomize_values] Add per-node bmhLabelSelector to prevent BMH shuffling The OpenStackBaremetalSet operator assigns BMHs to nodeset hostnames non-deterministically (OSPRH-10282), causing compute-0 to get the network config of compute-1 and vice versa. Add bmhLabelSelector with nodeName per node so each edpm-compute-X is deterministically bound to the BMH named compute-X. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Miguel Angel Nieto Jimenez --- .../nfv-ovs-dpdk-sriov-hci/edpm-nodeset-values/values.yaml.j2 | 2 +- .../ovs-dpdk-sriov-2nodesets/edpm-nodeset-values/values.yaml.j2 | 2 +- .../edpm-nodeset2-values/values.yaml.j2 | 2 +- .../edpm-nodeset-values/values.yaml.j2 | 2 +- .../edpm-nodeset2-values/values.yaml.j2 | 2 +- .../ovs-dpdk-sriov-ipv6/edpm-nodeset-values/values.yaml.j2 | 2 +- .../edpm-common-nodeset-values/values.yaml.j2 | 2 +- .../templates/ovs-dpdk-sriov/edpm-nodeset-values/values.yaml.j2 | 2 +- .../templates/ovs-dpdk/edpm-nodeset-values/values.yaml.j2 | 2 +- .../templates/sriov/edpm-nodeset-values/values.yaml.j2 | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/roles/ci_gen_kustomize_values/templates/nfv-ovs-dpdk-sriov-hci/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/nfv-ovs-dpdk-sriov-hci/edpm-nodeset-values/values.yaml.j2 index 879d949ff..feb781a5b 100644 --- a/roles/ci_gen_kustomize_values/templates/nfv-ovs-dpdk-sriov-hci/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/nfv-ovs-dpdk-sriov-hci/edpm-nodeset-values/values.yaml.j2 @@ -32,7 +32,7 @@ data: nodes: {% for instance in instances_names %} {% set node_name = 'edpm-' + instance %} -{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance, 'bmhLabelSelector': {'nodeName': instance}}) %} {{ node_name }}: {{ node_config | to_nice_yaml(indent=2) | indent(8, 
first=true) }} {% endfor %} diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset-values/values.yaml.j2 index 9b75cc7e7..15cf7bec2 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset-values/values.yaml.j2 @@ -45,7 +45,7 @@ data: nodes: {% for instance in instances_names %} {% set node_name = 'edpm-' + instance %} -{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance, 'bmhLabelSelector': {'nodeName': instance}}) %} {{ node_name }}: {{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset2-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset2-values/values.yaml.j2 index 8e6c5b7bb..2d5bb5c8e 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset2-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-2nodesets/edpm-nodeset2-values/values.yaml.j2 @@ -45,7 +45,7 @@ data: nodes: {% for instance in instances_names %} {% set node_name = 'edpm-' + instance %} -{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance, 'bmhLabelSelector': {'nodeName': instance}}) %} {{ node_name }}: {{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset-values/values.yaml.j2 
b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset-values/values.yaml.j2 index b166ce872..6ed3fcda5 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset-values/values.yaml.j2 @@ -45,7 +45,7 @@ data: nodes: {% for instance in instances_names %} {% set node_name = 'edpm-' + instance %} -{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance, 'bmhLabelSelector': {'nodeName': instance}}) %} {{ node_name }}: {{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset2-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset2-values/values.yaml.j2 index a3bb0e28e..82f4a0cc0 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset2-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6-2nodesets/edpm-nodeset2-values/values.yaml.j2 @@ -45,7 +45,7 @@ data: nodes: {% for instance in instances_names %} {% set node_name = 'edpm-' + instance %} -{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance, 'bmhLabelSelector': {'nodeName': instance}}) %} {{ node_name }}: {{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6/edpm-nodeset-values/values.yaml.j2 index 5f608e64c..8b3e3d292 100644 --- 
a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-ipv6/edpm-nodeset-values/values.yaml.j2 @@ -32,7 +32,7 @@ data: nodes: {% for instance in instances_names %} {% set node_name = 'edpm-' + instance %} -{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance, 'bmhLabelSelector': {'nodeName': instance}}) %} {{ node_name }}: {{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-networker/edpm-common-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-networker/edpm-common-nodeset-values/values.yaml.j2 index 937e86f54..cb981df34 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-networker/edpm-common-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov-networker/edpm-common-nodeset-values/values.yaml.j2 @@ -46,7 +46,7 @@ data: nodes: {% for instance in instance_names %} {% set node_name = 'edpm-' + instance %} -{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance, 'bmhLabelSelector': {'nodeName': instance}}) %} {{ node_name }}: {{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov/edpm-nodeset-values/values.yaml.j2 index ab1c12103..f66af1414 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov/edpm-nodeset-values/values.yaml.j2 +++ 
b/roles/ci_gen_kustomize_values/templates/ovs-dpdk-sriov/edpm-nodeset-values/values.yaml.j2 @@ -32,7 +32,7 @@ data: nodes: {% for instance in instances_names %} {% set node_name = 'edpm-' + instance %} -{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance, 'bmhLabelSelector': {'nodeName': instance}}) %} {{ node_name }}: {{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} diff --git a/roles/ci_gen_kustomize_values/templates/ovs-dpdk/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/ovs-dpdk/edpm-nodeset-values/values.yaml.j2 index 7204c800e..f71030591 100644 --- a/roles/ci_gen_kustomize_values/templates/ovs-dpdk/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/ovs-dpdk/edpm-nodeset-values/values.yaml.j2 @@ -32,7 +32,7 @@ data: nodes: {% for instance in instances_names %} {% set node_name = 'edpm-' + instance %} -{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} +{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance, 'bmhLabelSelector': {'nodeName': instance}}) %} {{ node_name }}: {{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} diff --git a/roles/ci_gen_kustomize_values/templates/sriov/edpm-nodeset-values/values.yaml.j2 b/roles/ci_gen_kustomize_values/templates/sriov/edpm-nodeset-values/values.yaml.j2 index e2d1aaed0..a0b2c6acd 100644 --- a/roles/ci_gen_kustomize_values/templates/sriov/edpm-nodeset-values/values.yaml.j2 +++ b/roles/ci_gen_kustomize_values/templates/sriov/edpm-nodeset-values/values.yaml.j2 @@ -32,7 +32,7 @@ data: nodes: {% for instance in instances_names %} {% set node_name = 'edpm-' + instance %} -{% set node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance}) %} +{% set 
node_config = _original_nodes[node_name] | default({}) | combine({'hostName': instance, 'bmhLabelSelector': {'nodeName': instance}}) %} {{ node_name }}: {{ node_config | to_nice_yaml(indent=2) | indent(8, first=true) }} {% endfor %} From cb79f67edd668c2a62ff0a6a9092c62712246c32 Mon Sep 17 00:00:00 2001 From: gais-ameer-rh Date: Tue, 26 May 2026 18:35:33 +0530 Subject: [PATCH 22/22] [DCN] Add tests for cinderBackups spec.cinder.template.cinderBackup (singular) in DCN DT is replaced with cinderBackups (plural) to deploy multiple cinder backups per edge site. Invoke the hooks/playbooks/dz_storage_cinder_backups.yaml playbook to validate the behaviour of cinderBackups in the DCN scenario. The playbook tests different scenarios of cinder backup creation and restoring the backups across availability zones. --- playbooks/dcn.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/playbooks/dcn.yml b/playbooks/dcn.yml index 853d08836..3b5706233 100644 --- a/playbooks/dcn.yml +++ b/playbooks/dcn.yml @@ -64,6 +64,15 @@ ansible.builtin.include_role: name: ci_dcn_site + - name: Configure storage for Cinder backups + ansible.builtin.command: + cmd: >- + ansible-playbook + {{ playbook_dir }}/../hooks/playbooks/dz_storage_cinder_backups.yaml + -i {{ inventory_file }} + -e cifmw_openstack_namespace={{ cifmw_openstack_namespace | default('openstack') }} + register: cinder_backups_result + - name: The map for az0 contains all AZ backends ansible.builtin.set_fact: az_to_group_map: