From 61b582bf6dcf4ae6954ebf929827246c8a130bb6 Mon Sep 17 00:00:00 2001
From: Titus Fortner
Date: Sun, 24 May 2026 20:08:22 -0500
Subject: [PATCH 1/2] [ci] retry Bazel on transient GitHub CDN 5xx errors during repo fetch

---
 .github/workflows/bazel.yml                   |  6 +--
 .../github-actions/run-bazel-with-retry.sh | 44 +++++++++++++++++++
 2 files changed, 45 insertions(+), 5 deletions(-)
 create mode 100755 scripts/github-actions/run-bazel-with-retry.sh

diff --git a/.github/workflows/bazel.yml b/.github/workflows/bazel.yml
index 6f7a4fa33f820..b6e973ebe4c4e 100644
--- a/.github/workflows/bazel.yml
+++ b/.github/workflows/bazel.yml
@@ -298,11 +298,7 @@ jobs:
         env:
           MSYS_NO_PATHCONV: 1
           MSYS2_ARG_CONV_EXCL: "*"
-        run: |
-          mkdir -p build
-          {
-            ${{ inputs.run }}
-          } 2>&1 | tee build/bazel-console.log
+        run: ./scripts/github-actions/run-bazel-with-retry.sh "${{ inputs.run }}"
       - name: Rerun failures with debug
         if: failure() && steps.run-bazel.outcome == 'failure'
         shell: bash
diff --git a/scripts/github-actions/run-bazel-with-retry.sh b/scripts/github-actions/run-bazel-with-retry.sh
new file mode 100755
index 0000000000000..c0532ba09c153
--- /dev/null
+++ b/scripts/github-actions/run-bazel-with-retry.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Run the given Bazel command, retrying up to a few times on transient
+# GitHub CDN errors (HTTP 5xx during repo fetch). For any other failure,
+# exits with Bazel's actual exit code so downstream "rerun with debug"
+# behavior triggers normally.
+#
+# Usage: run-bazel-with-retry.sh ""
+
+set -uo pipefail
+
+CMD="${1:?usage: $0 \"\"}"
+LOG_FILE="${BAZEL_CONSOLE_LOG:-build/bazel-console.log}"
+BAZEL_MAX_ATTEMPTS=3
+mkdir -p "$(dirname "$LOG_FILE")"
+
+# Matches Bazel's HttpConnector error format for 502/503/504 responses
+BAZEL_ERROR_PATTERN='GET returned 50[234] '
+
+for i in $(seq 1 "$BAZEL_MAX_ATTEMPTS"); do
+  # shellcheck disable=SC2086 # CMD is intentionally evaluated as a shell command
+  bash -c "$CMD" 2>&1 | tee "$LOG_FILE"
+  BAZEL_EXIT_CODE=${PIPESTATUS[0]}
+
+  if [ "$BAZEL_EXIT_CODE" -eq 0 ]; then
+    exit 0
+  fi
+
+  if grep -qE "BAZEL_ERROR_PATTERN" "$LOG_FILE"; then
+    if [ "$i" -ge "$BAZEL_MAX_ATTEMPTS" ]; then
+      break
+    fi
+    SLEEP=$((15 * i))
+    {
+      echo "⚠️ Transient CDN error detected (5xx). Retrying in ${SLEEP}s... (attempt $i of $BAZEL_MAX_ATTEMPTS)"
+      grep -E "BAZEL_ERROR_PATTERN" "$LOG_FILE" | head -5
+    } >&2
+    sleep "$SLEEP"
+  else
+    exit "$BAZEL_EXIT_CODE"
+  fi
+done
+
+echo "❌ Exhausted retries for CDN errors after $BAZEL_MAX_ATTEMPTS attempts." >&2
+exit 1

From 2ba409c60eaa29260b0fd39a53bb4ab80966b6a9 Mon Sep 17 00:00:00 2001
From: Titus Fortner
Date: Sun, 24 May 2026 21:57:19 -0500
Subject: [PATCH 2/2] use ENV for the inputs.run rather than positional arg so quote syntax doesn't collide

---
 .github/workflows/bazel.yml                   |  3 ++-
 .../github-actions/run-bazel-with-retry.sh | 21 +++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/bazel.yml b/.github/workflows/bazel.yml
index b6e973ebe4c4e..99370f237e2b7 100644
--- a/.github/workflows/bazel.yml
+++ b/.github/workflows/bazel.yml
@@ -298,7 +298,8 @@ jobs:
         env:
           MSYS_NO_PATHCONV: 1
           MSYS2_ARG_CONV_EXCL: "*"
-        run: ./scripts/github-actions/run-bazel-with-retry.sh "${{ inputs.run }}"
+          BAZEL_RUN_CMD: ${{ inputs.run }}
+        run: ./scripts/github-actions/run-bazel-with-retry.sh
       - name: Rerun failures with debug
         if: failure() && steps.run-bazel.outcome == 'failure'
         shell: bash
diff --git a/scripts/github-actions/run-bazel-with-retry.sh b/scripts/github-actions/run-bazel-with-retry.sh
index c0532ba09c153..7fa56248beba0 100755
--- a/scripts/github-actions/run-bazel-with-retry.sh
+++ b/scripts/github-actions/run-bazel-with-retry.sh
@@ -1,14 +1,14 @@
 #!/usr/bin/env bash
-# Run the given Bazel command, retrying up to a few times on transient
-# GitHub CDN errors (HTTP 5xx during repo fetch). For any other failure,
-# exits with Bazel's actual exit code so downstream "rerun with debug"
-# behavior triggers normally.
+# Run the Bazel command supplied via the BAZEL_RUN_CMD environment variable,
+# retrying up to a few times on transient GitHub CDN errors (HTTP 5xx during
+# repo fetch). For any other failure, exits with Bazel's actual exit code so
+# downstream "rerun with debug" behavior triggers normally.
 #
-# Usage: run-bazel-with-retry.sh ""
+# Usage: BAZEL_RUN_CMD="" run-bazel-with-retry.sh
 
 set -uo pipefail
 
-CMD="${1:?usage: $0 \"\"}"
+: "${BAZEL_RUN_CMD:?usage: BAZEL_RUN_CMD=\"\" $0}"
 LOG_FILE="${BAZEL_CONSOLE_LOG:-build/bazel-console.log}"
 BAZEL_MAX_ATTEMPTS=3
 mkdir -p "$(dirname "$LOG_FILE")"
@@ -17,22 +17,21 @@ mkdir -p "$(dirname "$LOG_FILE")"
 BAZEL_ERROR_PATTERN='GET returned 50[234] '
 
 for i in $(seq 1 "$BAZEL_MAX_ATTEMPTS"); do
-  # shellcheck disable=SC2086 # CMD is intentionally evaluated as a shell command
-  bash -c "$CMD" 2>&1 | tee "$LOG_FILE"
+  bash -c "$BAZEL_RUN_CMD" 2>&1 | tee "$LOG_FILE"
   BAZEL_EXIT_CODE=${PIPESTATUS[0]}
 
   if [ "$BAZEL_EXIT_CODE" -eq 0 ]; then
     exit 0
   fi
 
-  if grep -qE "BAZEL_ERROR_PATTERN" "$LOG_FILE"; then
+  if grep -qE "$BAZEL_ERROR_PATTERN" "$LOG_FILE"; then
     if [ "$i" -ge "$BAZEL_MAX_ATTEMPTS" ]; then
       break
     fi
     SLEEP=$((15 * i))
     {
       echo "⚠️ Transient CDN error detected (5xx). Retrying in ${SLEEP}s... (attempt $i of $BAZEL_MAX_ATTEMPTS)"
-      grep -E "BAZEL_ERROR_PATTERN" "$LOG_FILE" | head -5
+      grep -E "$BAZEL_ERROR_PATTERN" "$LOG_FILE" | head -5
     } >&2
     sleep "$SLEEP"
   else
@@ -41,4 +40,4 @@ for i in $(seq 1 "$BAZEL_MAX_ATTEMPTS"); do
 done
 
 echo "❌ Exhausted retries for CDN errors after $BAZEL_MAX_ATTEMPTS attempts." >&2
-exit 1
+exit "$BAZEL_EXIT_CODE"