NVIDIA · jtoelke2 · May 2, 2026
@@ -1,21 +1,45 @@
 name: Branch Checks
 
 on:
-  pull_request:
+  push:
+    branches:
+      - "pull-request/[0-9]+"
+  workflow_dispatch:
 
 env:
   CARGO_TERM_COLOR: always
   CARGO_INCREMENTAL: "0"
   MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  SCCACHE_GHA_ENABLED: "true"
 
 permissions:
   contents: read
   packages: read
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
+  pr_metadata:  # gate job: the check jobs below run only when should_run == 'true'
+    name: Resolve PR metadata
+    runs-on: ubuntu-latest
+    permissions:  # job-level grant; adds pull-requests: read for the gate
+      contents: read
+      pull-requests: read
+    outputs:
+      should_run: ${{ steps.gate.outputs.should_run }}  # re-exported from the `gate` step
+    steps:
+      - uses: actions/checkout@v6  # checkout first so the repo-local ./.github/actions/pr-gate resolves
+
+      - id: gate
+        uses: ./.github/actions/pr-gate  # local action; no inputs are passed here
+
   mise-lockfile:
     name: mise Lockfile
-    runs-on: build-amd64
+    needs: pr_metadata
+    if: needs.pr_metadata.outputs.should_run == 'true'
+    runs-on: linux-amd64-cpu8
     container:
       image: ghcr.io/nvidia/openshell/ci:latest
       credentials:
@@ -27,16 +51,7 @@ jobs:
       - name: Mark workspace as safe for git
         run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
 
-      - name: Detect mise config changes
-        id: changed
-        uses: tj-actions/changed-files@aa08304bd477b800d468db44fe10f6c61f7f7b11 # v42.1.0
-        with:
-          files: |
-            mise.toml
-            mise.lock
-
       - name: Verify mise.lock is in sync with mise.toml
-        if: steps.changed.outputs.any_changed == 'true'
         run: |
           mise lock
           if ! git diff --exit-code mise.lock; then
@@ -46,7 +61,9 @@ jobs:
 
   license-headers:
     name: License Headers
-    runs-on: build-amd64
+    needs: pr_metadata
+    if: needs.pr_metadata.outputs.should_run == 'true'
+    runs-on: linux-amd64-cpu8
     container:
       image: ghcr.io/nvidia/openshell/ci:latest
       credentials:
@@ -63,11 +80,15 @@ jobs:
 
   rust:
     name: Rust (${{ matrix.runner }})
+    needs: pr_metadata
+    if: needs.pr_metadata.outputs.should_run == 'true'
     strategy:
       fail-fast: false
       matrix:
-        runner: [build-amd64, build-arm64]
+        runner: [linux-amd64-cpu8, linux-arm64-cpu8]
     runs-on: ${{ matrix.runner }}
+    env:
+      SCCACHE_GHA_VERSION: branch-checks-rust-${{ matrix.runner }}
     container:
       image: ghcr.io/nvidia/openshell/ci:latest
       credentials:
@@ -79,9 +100,8 @@ jobs:
       - name: Install tools
         run: mise install --locked
 
-      - name: Configure sccache remote cache
-        if: vars.SCCACHE_MEMCACHED_ENDPOINT != ''
-        run: echo "SCCACHE_MEMCACHED_ENDPOINT=${{ vars.SCCACHE_MEMCACHED_ENDPOINT }}" >> "$GITHUB_ENV"
+      - name: Configure GHA sccache backend
+        uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9
 
       - name: Cache Rust target and registry
         uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2
@@ -103,14 +123,24 @@ jobs:
 
       - name: sccache stats
         if: always()
-        run: mise x -- sccache --show-stats
+        run: |
+          set +e  # best-effort: a stats failure must never fail the job
+          stats_bin="${SCCACHE_PATH:-sccache}"  # fall back to `sccache` on PATH when SCCACHE_PATH is unset
+          "$stats_bin" --show-stats
+          status=$?
+          if [ "$status" -ne 0 ]; then  # POSIX test: no `shell:` is set and this job runs in a container, so the step may run under sh where `[[` is unavailable
+            echo "::warning::sccache stats unavailable (exit $status)"
+          fi
+          exit 0
 
   python:
     name: Python (${{ matrix.runner }})
+    needs: pr_metadata
+    if: needs.pr_metadata.outputs.should_run == 'true'
     strategy:
       fail-fast: false
       matrix:
-        runner: [build-amd64, build-arm64]
+        runner: [linux-amd64-cpu8, linux-arm64-cpu8]
     runs-on: ${{ matrix.runner }}
     container:
       image: ghcr.io/nvidia/openshell/ci:latest
@@ -134,7 +164,9 @@ jobs:
 
   markdown:
     name: Markdown
-    runs-on: build-amd64
+    needs: pr_metadata
+    if: needs.pr_metadata.outputs.should_run == 'true'
+    runs-on: linux-amd64-cpu8
     container:
       image: ghcr.io/nvidia/openshell/ci:latest
       credentials:
@@ -144,7 +176,7 @@ jobs:
       - uses: actions/checkout@v6
 
       - name: Install tools
-        run: mise install
+        run: mise install --locked
 
       - name: Lint
         run: mise run markdown:lint
@@ -34,7 +34,6 @@ jobs:
     with:
       component: gateway
       platform: linux/arm64
-      runner: build-arm64
 
   build-cluster:
     needs: [pr_metadata]
@@ -46,7 +45,6 @@ jobs:
     with:
       component: cluster
       platform: linux/arm64
-      runner: build-arm64
 
   e2e:
     needs: [pr_metadata, build-gateway, build-cluster]
@@ -57,4 +55,4 @@ jobs:
     uses: ./.github/workflows/e2e-test.yml
     with:
       image-tag: ${{ github.sha }}
-      runner: build-arm64
+      runner: linux-arm64-cpu8
@@ -21,8 +21,19 @@ permissions:
 
 jobs:
   build-ci-image:
-    name: Build
-    runs-on: build-amd64
+    name: Build (${{ matrix.arch }})
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - arch: amd64
+            platform: linux/amd64
+            runner: linux-amd64-cpu8
+          - arch: arm64
+            platform: linux/arm64
+            runner: linux-arm64-cpu8
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 60
     steps:
       - uses: actions/checkout@v6
 
@@ -33,18 +44,66 @@ jobs:
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
 
+      - name: Resolve BuildKit config  # outputs `config`: /etc/buildkit/buildkitd.toml when readable, else empty
+        id: buildkit
+        run: |
+          if [[ -r /etc/buildkit/buildkitd.toml ]]; then
+            echo "config=/etc/buildkit/buildkitd.toml" >> "$GITHUB_OUTPUT"
+          else
+            echo "config=" >> "$GITHUB_OUTPUT"  # empty value is passed as buildkitd-config to the setup-buildx step
+          fi
+
       - name: Set up Docker Buildx
         uses: ./.github/actions/setup-buildx
+        with:
+          driver: local
+          buildkitd-config: ${{ steps.buildkit.outputs.config }}
 
       - name: Build and push CI image
         env:
           MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          ARCH_IMAGE: ${{ env.CI_IMAGE }}:${{ github.sha }}-${{ matrix.arch }}  # per-arch tag (<sha>-amd64 / <sha>-arm64) later combined by merge-ci-image
         run: |
+          set -euo pipefail  # NOTE(review): the type=gha cache flags below need the Actions cache URL and runtime token in this step's environment - confirm ./.github/actions/setup-buildx exports them
           docker buildx build \
-            --platform linux/amd64,linux/arm64 \
+            --builder openshell \
+            --platform "${{ matrix.platform }}" \
             --secret id=MISE_GITHUB_TOKEN,env=MISE_GITHUB_TOKEN \
+            --cache-from "type=gha,scope=ci-image-${{ matrix.arch }}" \
+            --cache-to "type=gha,mode=max,scope=ci-image-${{ matrix.arch }}" \
             --push \
-            -t ${{ env.CI_IMAGE }}:${{ github.sha }} \
-            -t ${{ env.CI_IMAGE }}:latest \
+            -t "$ARCH_IMAGE" \
             -f deploy/docker/Dockerfile.ci \
             .
+
+      - name: Smoke check CI image  # runs three tool version commands inside the per-arch image
+        env:
+          ARCH_IMAGE: ${{ env.CI_IMAGE }}:${{ github.sha }}-${{ matrix.arch }}  # same per-arch tag the build step pushed
+        run: |
+          set -euo pipefail  # stop at the first check that fails
+          docker run --rm --platform "${{ matrix.platform }}" "$ARCH_IMAGE" mise --version
+          docker run --rm --platform "${{ matrix.platform }}" "$ARCH_IMAGE" gh --version
+          docker run --rm --platform "${{ matrix.platform }}" "$ARCH_IMAGE" docker buildx version
+
+  merge-ci-image:  # combines the <sha>-amd64 and <sha>-arm64 tags into the multi-arch <sha> and latest tags
+    name: Merge manifest
+    needs: build-ci-image  # waits for every matrix leg of the build job
+    runs-on: linux-amd64-cpu8
+    timeout-minutes: 10
+    steps:
+      - name: Log in to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}  # assumes REGISTRY is set in workflow-level env outside this hunk - TODO confirm
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Create multi-arch manifests
+        run: |
+          set -euo pipefail  # -u aborts if CI_IMAGE or GITHUB_SHA is unset; CI_IMAGE presumably comes from workflow-level env - confirm
+          docker buildx imagetools create \
+            --prefer-index=false \
+            -t "${CI_IMAGE}:${GITHUB_SHA}" \
+            -t "${CI_IMAGE}:latest" \
+            "${CI_IMAGE}:${GITHUB_SHA}-amd64" \
+            "${CI_IMAGE}:${GITHUB_SHA}-arm64"
@@ -26,7 +26,7 @@ on:
         description: "Deprecated; per-arch native runners are selected automatically"
         required: false
         type: string
-        default: "build-amd64"
+        default: "linux-amd64-cpu8"
       cargo-version:
         description: "Pre-computed cargo version (skips internal git-based computation)"
         required: false

@@ -11,7 +11,7 @@ on:
         description: "GitHub Actions runner label"
         required: false
         type: string
-        default: "build-amd64"
+        default: "linux-amd64-cpu8"
 
 permissions:
   contents: read

@@ -6,7 +6,9 @@ For local test commands see [TESTING.md](TESTING.md). For PR conventions see [CO
 
 ## Overview
 
-E2E tests run on self-hosted runners (`build-arm64`, GPU runners). To keep untrusted PR code off those runners we use NVIDIA's copy-pr-bot, which mirrors trusted PR commits to internal `pull-request/<N>` branches in this repository. The gated workflows trigger on pushes to those branches, not on the original PR.
+PR CI that runs on NVIDIA self-hosted runners uses NVIDIA's copy-pr-bot. The bot mirrors trusted PR commits to internal `pull-request/<N>` branches in this repository. The gated workflows trigger on pushes to those branches, not on the original PR.
+
+`Branch Checks` runs automatically after copy-pr-bot mirrors the PR. E2E suites are opt-in because they are more expensive and publish temporary images.
 
 Two opt-in labels enable the suites:
 
@@ -63,11 +65,11 @@ Prerequisites:
 Flow:
 
 1. Open the PR. copy-pr-bot mirrors it to `pull-request/<N>` automatically.
-2. The first push of `pull-request/<N>` runs `Branch E2E Checks`, but it skips the build/E2E jobs because no label is set yet. The PR's `E2E Gate` check passes as a no-op (no label, no requirement).
-3. A maintainer applies `test:e2e` and/or `test:e2e-gpu`. `E2E Label Help` posts a comment with a link to the existing `Branch E2E Checks` run.
+2. The mirror push runs `Branch Checks` automatically. The first `Branch E2E Checks` / `GPU Test` run only resolves metadata and skips expensive jobs unless the matching label is already set.
+3. A maintainer applies `test:e2e` and/or `test:e2e-gpu`. `E2E Label Help` posts a comment with a link to the existing gated workflow run.
 4. The maintainer opens that link and clicks **Re-run all jobs**. This time `pr_metadata` sees the label and the build/E2E jobs run.
 5. When the run finishes, the `E2E Gate` check on the PR flips to green automatically.
-6. New commits push to the mirror automatically and re-trigger `Branch E2E Checks`. Because the label is still set, those runs execute the build/E2E jobs without manual re-run.
+6. New commits push to the mirror automatically and re-trigger `Branch Checks` plus any labeled E2E/GPU workflows.
 
 ### Forked PR
 
@@ -102,6 +104,7 @@ The bot's full administrator documentation is internal to NVIDIA. The only comma
 
 | File | Role |
 |---|---|
+| `.github/workflows/branch-checks.yml` | Required non-E2E PR checks. Triggers on `push: pull-request/[0-9]+`. |
 | `.github/workflows/branch-e2e.yml` | Non-GPU E2E. Triggers on `push: pull-request/[0-9]+`. |
 | `.github/workflows/test-gpu.yml` | GPU E2E. Triggers on `push: pull-request/[0-9]+`. |
 | `.github/actions/pr-gate/action.yml` | Composite action that resolves PR metadata and verifies the required label is set. |

@@ -6,7 +6,7 @@ This document describes the architecture of the E2E CI flow: every workflow invo
 
 Three independent goals shape the design:
 
-1. **Self-hosted runner safety.** E2E runs on `build-arm64` and on GPU runners. GitHub's [security hardening guide](https://docs.github.com/en/actions/security-for-github-actions/security-guides/security-hardening-for-github-actions#hardening-for-self-hosted-runners) states bluntly: "Self-hosted runners should almost never be used for public repositories on GitHub, because any user can open pull requests against the repository and compromise the environment." Our workaround is the same one used elsewhere in NVIDIA's GHA infrastructure: copy-pr-bot mirrors trusted PRs into `pull-request/<N>` branches inside this repository, and the self-hosted workflows trigger on `push` to those mirror branches rather than on `pull_request`.
+1. **Self-hosted runner safety.** Required PR checks, E2E, and GPU tests run on NVIDIA self-hosted runners. GitHub's [security hardening guide](https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#hardening-for-self-hosted-runners) states bluntly: "Self-hosted runners should almost never be used for public repositories on GitHub, because any user can open pull requests against the repository and compromise the environment." Our workaround is the same one used elsewhere in NVIDIA's GHA infrastructure: copy-pr-bot mirrors trusted PRs into `pull-request/<N>` branches inside this repository, and the self-hosted workflows trigger on `push` to those mirror branches rather than on `pull_request`.
 2. **Label as a hard merge gate.** When a PR carries `test:e2e` (or `test:e2e-gpu`), the corresponding suite *must* have actually executed and passed for the PR head SHA. The label has to be enforcing, not advisory: it blocks merge unless the suite ran with the label set.
 3. **Per-job least privilege on the GitHub token.** Each workflow declares `permissions: {}` at the top, and each job declares only what it needs. This follows the hardening pattern described at <https://astral.sh/blog/open-source-security-at-astral>.
 
@@ -17,20 +17,21 @@ These three goals do not compose cleanly: the safety goal forces `push: pull-req
 | File | Trigger | Role |
 |---|---|---|
 | `.github/copy-pr-bot.yaml` | (config) | Tells copy-pr-bot to mirror trusted PRs into `pull-request/<N>` branches. Pre-existed. |
-| `.github/workflows/branch-e2e.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Runs non-GPU E2E on `build-arm64`. |
+| `.github/workflows/branch-checks.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Runs required branch checks on `linux-amd64-cpu8` and `linux-arm64-cpu8`. |
+| `.github/workflows/branch-e2e.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Runs non-GPU E2E on `linux-arm64-cpu8`. |
 | `.github/workflows/test-gpu.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Runs GPU E2E on self-hosted GPU runners. |
-| `.github/workflows/shadow-branch-e2e.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Non-required shared-runner E2E shadow coverage for OS-49 Phase 5. |
+| `.github/workflows/shadow-branch-e2e.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Historical non-required shared-runner E2E shadow coverage for OS-49 Phase 5. |
 | `.github/actions/pr-gate/action.yml` | (composite) | Resolves PR metadata for a `pull-request/<N>` push and decides whether the run should proceed. Label enforcement is optional, so non-required shadows can validate mirror metadata without introducing another PR label. |
 | `.github/workflows/e2e-gate.yml` | `pull_request` + `workflow_run` | Posts the required `E2E Gate` check on the PR. Re-evaluates after the gated workflow completes. |
 | `.github/workflows/e2e-gate-check.yml` | `workflow_call` | Reusable gate logic shared by E2E and GPU E2E. |
 | `.github/workflows/e2e-label-help.yml` | `pull_request_target: [labeled]` | Posts a PR comment when a `test:e2e*` label is applied, telling the maintainer the next manual step (re-run an existing run, or `/ok to test <SHA>` to refresh the mirror). Does *not* dispatch the workflow itself - see "Why we don't auto-dispatch" below. |
 | `.github/workflows/e2e-test.yml`, `e2e-gpu-test.yaml`, `docker-build.yml` | `workflow_call` | Reusable worker workflows. Unchanged by this design - called from the gated workflows and from release workflows. |
 
-## OS-49 shadow runner coverage
+## OS-49 runner migration
 
-OS-49 Phase 5 adds non-required shadow workflows for the non-release workflows being prepared for shared-runner cutover. They all use `workflow_dispatch` for manual bake runs and `push: pull-request/[0-9]+` for copy-pr-bot mirrored PRs.
+OS-49 Phase 5 added non-required shadow workflows for the non-release workflows being prepared for shared-runner cutover. Phase 6 promotes the validated shared-runner path into the real non-release workflows.
 
-`shadow-branch-checks.yml` and `shadow-ci-image.yml` use `pr-gate` without a required label. That still verifies the mirror SHA matches the source PR head SHA, but does not require a new GitHub label for every ordinary CI shadow run. `shadow-branch-e2e.yml` keeps the existing `test:e2e` gate because it publishes temporary images and runs the expensive E2E suite. It shadows the top-level `branch-e2e.yml` workflow, which already exercises the reusable `e2e-test.yml` worker path, so Phase 5 does not keep a second direct `e2e-test.yml` shadow workflow.
+`branch-checks.yml` uses `pr-gate` without a required label. That still verifies the mirror SHA matches the source PR head SHA, but does not require a new GitHub label for ordinary required checks. `branch-e2e.yml` keeps the existing `test:e2e` gate because it publishes temporary images and runs the expensive E2E suite. `ci-image.yml` now builds amd64 and arm64 CI images natively on shared CPU runners and merges the multi-arch manifest after both per-arch images are pushed.
 
 ## Trigger taxonomy
 
@@ -147,10 +148,13 @@ Labels persist as PR metadata and survive re-runs and force-pushes. Comment-base
 
 ## Permission posture
 
-Every workflow declares `permissions: {}` at the top. Per-job grants are the minimum needed:
+The gated E2E workflows declare `permissions: {}` at the top. Branch checks and CI image publishing use the minimum workflow/job grants needed for checkout, package pulls, and package pushes.
 
 | Workflow | Job | Grants |
 |---|---|---|
+| `branch-checks.yml` | workflow default | `contents: read`, `packages: read` |
+| | `pr_metadata` | `contents: read`, `pull-requests: read` |
+| `ci-image.yml` | workflow default | `contents: read`, `packages: write` |
 | `branch-e2e.yml`, `test-gpu.yml` | `pr_metadata` | `contents: read`, `pull-requests: read` |
 | | `build-*` | `contents: read`, `packages: write` |
 | | `e2e*` | `contents: read`, `packages: read` |