From c55de89ec4f5b2a69831d5b0c0cb85f6a7c85078 Mon Sep 17 00:00:00 2001
From: Sandy Chen
Date: Sun, 7 Jun 2026 22:32:17 +0900
Subject: [PATCH] fix(ci): retry docker pulls in integration Preload Images step

The integration matrix jobs intermittently failed in the "Preload Images"
step when a `docker pull` from Docker Hub timed out:
`Get "https://registry-1.docker.io/v2/": context deadline exceeded`. The
step ran bare `docker pull`s with no retry, so a single transient registry
hiccup failed the whole job (issue #7598).

Wrap each pull in a small `retry()` helper (3 attempts, 5s then 10s
exponential backoff). Transient failures are retried; a genuine,
persistent failure still fails the step because the final `return 1`
propagates under `bash -e`.

`docker login` is not an option here: the integration job runs on fork
PRs, which have no repository secrets.

Co-Authored-By: Claude Opus 4.8 (1M context)
Signed-off-by: Sandy Chen
---
 .github/workflows/test-build-deploy.yml | 46 +++++++++++++++++--------
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/test-build-deploy.yml b/.github/workflows/test-build-deploy.yml
index 1dd1b4fa5a..9f6d938943 100644
--- a/.github/workflows/test-build-deploy.yml
+++ b/.github/workflows/test-build-deploy.yml
@@ -258,24 +258,42 @@ jobs:
       - name: Preload Images
         # We download docker images used by integration tests so that all images are available
         # locally and the download time doesn't account in the test execution time, which is subject
-        # to a timeout
+        # to a timeout. Each pull is wrapped in a small retry helper so a transient Docker Hub /
+        # quay.io registry hiccup (e.g. "context deadline exceeded") doesn't fail the whole job.
         run: |
-          docker pull minio/minio:RELEASE.2024-05-28T17-19-04Z
-          docker pull consul:1.8.4
-          docker pull quay.io/coreos/etcd:v3.5.29
+          # Run a command up to 3 times (at most 2 retries) with exponential backoff (5s, then 10s).
+          # A transient docker pull failure is retried; a genuine, persistent failure still fails the
+          # step because the final `return 1` propagates under `bash -e`.
+          retry() {
+            local max_attempts=3 attempt=1 delay=5
+            until "$@"; do
+              if [ "$attempt" -ge "$max_attempts" ]; then
+                echo "ERROR: '$*' failed after ${max_attempts} attempts." >&2
+                return 1
+              fi
+              echo "WARNING: '$*' failed (attempt ${attempt}/${max_attempts}); retrying in ${delay}s..." >&2
+              sleep "$delay"
+              attempt=$((attempt + 1))
+              delay=$((delay * 2))
+            done
+          }
+
+          retry docker pull minio/minio:RELEASE.2024-05-28T17-19-04Z
+          retry docker pull consul:1.8.4
+          retry docker pull quay.io/coreos/etcd:v3.5.29
           if [ "$TEST_TAGS" = "integration_backward_compatibility" ]; then
-            docker pull quay.io/cortexproject/cortex:v1.16.1
-            docker pull quay.io/cortexproject/cortex:v1.17.2
-            docker pull quay.io/cortexproject/cortex:v1.18.1
-            docker pull quay.io/cortexproject/cortex:v1.19.1
-            docker pull quay.io/cortexproject/cortex:v1.20.1
-            docker pull quay.io/cortexproject/cortex:v1.21.0
+            retry docker pull quay.io/cortexproject/cortex:v1.16.1
+            retry docker pull quay.io/cortexproject/cortex:v1.17.2
+            retry docker pull quay.io/cortexproject/cortex:v1.18.1
+            retry docker pull quay.io/cortexproject/cortex:v1.19.1
+            retry docker pull quay.io/cortexproject/cortex:v1.20.1
+            retry docker pull quay.io/cortexproject/cortex:v1.21.0
           elif [ "$TEST_TAGS" = "integration_query_fuzz" ]; then
-            docker pull quay.io/cortexproject/cortex:v1.20.1
-            docker pull quay.io/prometheus/prometheus:v3.8.1
+            retry docker pull quay.io/cortexproject/cortex:v1.20.1
+            retry docker pull quay.io/prometheus/prometheus:v3.8.1
           fi
-          docker pull memcached:1.6.1
-          docker pull redis:7.0.4-alpine
+          retry docker pull memcached:1.6.1
+          retry docker pull redis:7.0.4-alpine
         env:
           TEST_TAGS: ${{ matrix.tags }}
       - name: Integration Tests