From 5d87042225ca5cc314a4613918d0332b007b399c Mon Sep 17 00:00:00 2001 From: Jonas Toelke Date: Fri, 1 May 2026 22:10:08 -0500 Subject: [PATCH] ci: cut over non-release workflows to shared runners Signed-off-by: Jonas Toelke --- .github/workflows/branch-checks.yml | 72 +++++++++++++++++++++-------- .github/workflows/branch-e2e.yml | 4 +- .github/workflows/ci-image.yml | 69 +++++++++++++++++++++++++-- .github/workflows/docker-build.yml | 2 +- .github/workflows/e2e-test.yml | 2 +- CI.md | 11 +++-- architecture/ci-e2e.md | 18 +++++--- 7 files changed, 137 insertions(+), 41 deletions(-) diff --git a/.github/workflows/branch-checks.yml b/.github/workflows/branch-checks.yml index 15bac0ac2..115e0b5c1 100644 --- a/.github/workflows/branch-checks.yml +++ b/.github/workflows/branch-checks.yml @@ -1,21 +1,45 @@ name: Branch Checks on: - pull_request: + push: + branches: + - "pull-request/[0-9]+" + workflow_dispatch: env: CARGO_TERM_COLOR: always CARGO_INCREMENTAL: "0" MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + SCCACHE_GHA_ENABLED: "true" permissions: contents: read packages: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: + pr_metadata: + name: Resolve PR metadata + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + outputs: + should_run: ${{ steps.gate.outputs.should_run }} + steps: + - uses: actions/checkout@v6 + + - id: gate + uses: ./.github/actions/pr-gate + mise-lockfile: name: mise Lockfile - runs-on: build-amd64 + needs: pr_metadata + if: needs.pr_metadata.outputs.should_run == 'true' + runs-on: linux-amd64-cpu8 container: image: ghcr.io/nvidia/openshell/ci:latest credentials: @@ -27,16 +51,7 @@ jobs: - name: Mark workspace as safe for git run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - - name: Detect mise config changes - id: changed - uses: tj-actions/changed-files@aa08304bd477b800d468db44fe10f6c61f7f7b11 # v42.1.0 - with: - files: | - mise.toml - mise.lock - - name: Verify mise.lock is in sync with mise.toml - if: steps.changed.outputs.any_changed == 'true' run: | mise lock if ! git diff --exit-code mise.lock; then @@ -46,7 +61,9 @@ jobs: license-headers: name: License Headers - runs-on: build-amd64 + needs: pr_metadata + if: needs.pr_metadata.outputs.should_run == 'true' + runs-on: linux-amd64-cpu8 container: image: ghcr.io/nvidia/openshell/ci:latest credentials: @@ -63,11 +80,15 @@ jobs: rust: name: Rust (${{ matrix.runner }}) + needs: pr_metadata + if: needs.pr_metadata.outputs.should_run == 'true' strategy: fail-fast: false matrix: - runner: [build-amd64, build-arm64] + runner: [linux-amd64-cpu8, linux-arm64-cpu8] runs-on: ${{ matrix.runner }} + env: + SCCACHE_GHA_VERSION: branch-checks-rust-${{ matrix.runner }} container: image: ghcr.io/nvidia/openshell/ci:latest credentials: @@ -79,9 +100,8 @@ jobs: - name: Install tools run: mise install --locked - - name: Configure sccache remote cache - if: vars.SCCACHE_MEMCACHED_ENDPOINT != '' - run: echo "SCCACHE_MEMCACHED_ENDPOINT=${{ vars.SCCACHE_MEMCACHED_ENDPOINT }}" >> "$GITHUB_ENV" + - name: Configure GHA sccache backend + uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad # v0.0.9 - name: Cache Rust target and registry uses: Swatinem/rust-cache@779680da715d629ac1d338a641029a2f4372abb5 # v2 @@ -103,14 +123,24 @@ jobs: - name: sccache stats if: always() - run: mise x -- sccache --show-stats + run: | + set +e + stats_bin="${SCCACHE_PATH:-sccache}" + "$stats_bin" --show-stats + status=$? + if [[ $status -ne 0 ]]; then + echo "::warning::sccache stats unavailable (exit $status)" + fi + exit 0 python: name: Python (${{ matrix.runner }}) + needs: pr_metadata + if: needs.pr_metadata.outputs.should_run == 'true' strategy: fail-fast: false matrix: - runner: [build-amd64, build-arm64] + runner: [linux-amd64-cpu8, linux-arm64-cpu8] runs-on: ${{ matrix.runner }} container: image: ghcr.io/nvidia/openshell/ci:latest @@ -134,7 +164,9 @@ jobs: markdown: name: Markdown - runs-on: build-amd64 + needs: pr_metadata + if: needs.pr_metadata.outputs.should_run == 'true' + runs-on: linux-amd64-cpu8 container: image: ghcr.io/nvidia/openshell/ci:latest credentials: @@ -144,7 +176,7 @@ jobs: - uses: actions/checkout@v6 - name: Install tools - run: mise install + run: mise install --locked - name: Lint run: mise run markdown:lint diff --git a/.github/workflows/branch-e2e.yml b/.github/workflows/branch-e2e.yml index a680bca4a..4519ccd36 100644 --- a/.github/workflows/branch-e2e.yml +++ b/.github/workflows/branch-e2e.yml @@ -34,7 +34,6 @@ jobs: with: component: gateway platform: linux/arm64 - runner: build-arm64 build-cluster: needs: [pr_metadata] @@ -46,7 +45,6 @@ jobs: with: component: cluster platform: linux/arm64 - runner: build-arm64 e2e: needs: [pr_metadata, build-gateway, build-cluster] @@ -57,4 +55,4 @@ jobs: uses: ./.github/workflows/e2e-test.yml with: image-tag: ${{ github.sha }} - runner: build-arm64 + runner: linux-arm64-cpu8 diff --git a/.github/workflows/ci-image.yml b/.github/workflows/ci-image.yml index 1f59438cc..8b0509169 100644 --- a/.github/workflows/ci-image.yml +++ b/.github/workflows/ci-image.yml @@ -21,8 +21,19 @@ permissions: jobs: build-ci-image: - name: Build - runs-on: build-amd64 + name: Build (${{ matrix.arch }}) + strategy: + fail-fast: false + matrix: + include: + - arch: amd64 + platform: linux/amd64 + runner: linux-amd64-cpu8 + - arch: arm64 + platform: linux/arm64 + runner: linux-arm64-cpu8 + runs-on: ${{ matrix.runner }} + timeout-minutes: 60 steps: - uses: actions/checkout@v6 @@ -33,18 +44,66 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Resolve BuildKit config + id: buildkit + run: | + if [[ -r /etc/buildkit/buildkitd.toml ]]; then + echo "config=/etc/buildkit/buildkitd.toml" >> "$GITHUB_OUTPUT" + else + echo "config=" >> "$GITHUB_OUTPUT" + fi + - name: Set up Docker Buildx uses: ./.github/actions/setup-buildx + with: + driver: local + buildkitd-config: ${{ steps.buildkit.outputs.config }} - name: Build and push CI image env: MISE_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ARCH_IMAGE: ${{ env.CI_IMAGE }}:${{ github.sha }}-${{ matrix.arch }} run: | + set -euo pipefail docker buildx build \ - --platform linux/amd64,linux/arm64 \ + --builder openshell \ + --platform "${{ matrix.platform }}" \ --secret id=MISE_GITHUB_TOKEN,env=MISE_GITHUB_TOKEN \ + --cache-from "type=gha,scope=ci-image-${{ matrix.arch }}" \ + --cache-to "type=gha,mode=max,scope=ci-image-${{ matrix.arch }}" \ --push \ - -t ${{ env.CI_IMAGE }}:${{ github.sha }} \ - -t ${{ env.CI_IMAGE }}:latest \ + -t "$ARCH_IMAGE" \ -f deploy/docker/Dockerfile.ci \ . + + - name: Smoke check CI image + env: + ARCH_IMAGE: ${{ env.CI_IMAGE }}:${{ github.sha }}-${{ matrix.arch }} + run: | + set -euo pipefail + docker run --rm --platform "${{ matrix.platform }}" "$ARCH_IMAGE" mise --version + docker run --rm --platform "${{ matrix.platform }}" "$ARCH_IMAGE" gh --version + docker run --rm --platform "${{ matrix.platform }}" "$ARCH_IMAGE" docker buildx version + + merge-ci-image: + name: Merge manifest + needs: build-ci-image + runs-on: linux-amd64-cpu8 + timeout-minutes: 10 + steps: + - name: Log in to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Create multi-arch manifests + run: | + set -euo pipefail + docker buildx imagetools create \ + --prefer-index=false \ + -t "${CI_IMAGE}:${GITHUB_SHA}" \ + -t "${CI_IMAGE}:latest" \ + "${CI_IMAGE}:${GITHUB_SHA}-amd64" \ + "${CI_IMAGE}:${GITHUB_SHA}-arm64" diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 6da7370e5..3b3aa1cb8 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -26,7 +26,7 @@ on: description: "Deprecated; per-arch native runners are selected automatically" required: false type: string - default: "build-amd64" + default: "linux-amd64-cpu8" cargo-version: description: "Pre-computed cargo version (skips internal git-based computation)" required: false diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 992065b25..d34576863 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -11,7 +11,7 @@ on: description: "GitHub Actions runner label" required: false type: string - default: "build-amd64" + default: "linux-amd64-cpu8" permissions: contents: read diff --git a/CI.md b/CI.md index dc5ecf9fa..6b0d9bcfb 100644 --- a/CI.md +++ b/CI.md @@ -6,7 +6,9 @@ For local test commands see [TESTING.md](TESTING.md). For PR conventions see [CO ## Overview -E2E tests run on self-hosted runners (`build-arm64`, GPU runners). To keep untrusted PR code off those runners we use NVIDIA's copy-pr-bot, which mirrors trusted PR commits to internal `pull-request/` branches in this repository. The gated workflows trigger on pushes to those branches, not on the original PR. +PR CI that runs on NVIDIA self-hosted runners uses NVIDIA's copy-pr-bot. The bot mirrors trusted PR commits to internal `pull-request/` branches in this repository. The gated workflows trigger on pushes to those branches, not on the original PR. + +`Branch Checks` run automatically after copy-pr-bot mirrors the PR. E2E suites are opt-in because they are more expensive and publish temporary images. Two opt-in labels enable the suites: @@ -63,11 +65,11 @@ Prerequisites: Flow: 1. Open the PR. copy-pr-bot mirrors it to `pull-request/` automatically. -2. The first push of `pull-request/` runs `Branch E2E Checks`, but it skips the build/E2E jobs because no label is set yet. The PR's `E2E Gate` check passes as a no-op (no label, no requirement). -3. A maintainer applies `test:e2e` and/or `test:e2e-gpu`. `E2E Label Help` posts a comment with a link to the existing `Branch E2E Checks` run. +2. The mirror push runs `Branch Checks` automatically. The first `Branch E2E Checks` / `GPU Test` run only resolves metadata and skips expensive jobs unless the matching label is already set. +3. A maintainer applies `test:e2e` and/or `test:e2e-gpu`. `E2E Label Help` posts a comment with a link to the existing gated workflow run. 4. The maintainer opens that link and clicks **Re-run all jobs**. This time `pr_metadata` sees the label and the build/E2E jobs run. 5. When the run finishes, the `E2E Gate` check on the PR flips to green automatically. -6. New commits push to the mirror automatically and re-trigger `Branch E2E Checks`. Because the label is still set, those runs execute the build/E2E jobs without manual re-run. +6. New commits push to the mirror automatically and re-trigger `Branch Checks` plus any labeled E2E/GPU workflows. ### Forked PR @@ -102,6 +104,7 @@ The bot's full administrator documentation is internal to NVIDIA. The only comma | File | Role | |---|---| +| `.github/workflows/branch-checks.yml` | Required non-E2E PR checks. Triggers on `push: pull-request/[0-9]+`. | | `.github/workflows/branch-e2e.yml` | Non-GPU E2E. Triggers on `push: pull-request/[0-9]+`. | | `.github/workflows/test-gpu.yml` | GPU E2E. Triggers on `push: pull-request/[0-9]+`. | | `.github/actions/pr-gate/action.yml` | Composite action that resolves PR metadata and verifies the required label is set. | diff --git a/architecture/ci-e2e.md b/architecture/ci-e2e.md index d15971e92..45b8e1891 100644 --- a/architecture/ci-e2e.md +++ b/architecture/ci-e2e.md @@ -6,7 +6,7 @@ This document describes the architecture of the E2E CI flow: every workflow invo Three independent goals shape the design: -1. **Self-hosted runner safety.** E2E runs on `build-arm64` and on GPU runners. GitHub's [security hardening guide](https://docs.github.com/en/actions/security-for-github-actions/security-guides/security-hardening-for-github-actions#hardening-for-self-hosted-runners) states bluntly: "Self-hosted runners should almost never be used for public repositories on GitHub, because any user can open pull requests against the repository and compromise the environment." Our workaround is the same one used elsewhere in NVIDIA's GHA infrastructure: copy-pr-bot mirrors trusted PRs into `pull-request/` branches inside this repository, and the self-hosted workflows trigger on `push` to those mirror branches rather than on `pull_request`. +1. **Self-hosted runner safety.** Required PR checks, E2E, and GPU tests run on NVIDIA self-hosted runners. GitHub's [security hardening guide](https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#hardening-for-self-hosted-runners) states bluntly: "Self-hosted runners should almost never be used for public repositories on GitHub, because any user can open pull requests against the repository and compromise the environment." Our workaround is the same one used elsewhere in NVIDIA's GHA infrastructure: copy-pr-bot mirrors trusted PRs into `pull-request/` branches inside this repository, and the self-hosted workflows trigger on `push` to those mirror branches rather than on `pull_request`. 2. **Label as a hard merge gate.** When a PR carries `test:e2e` (or `test:e2e-gpu`), the corresponding suite *must* have actually executed and passed for the PR head SHA. The label has to be enforcing, not advisory: it blocks merge unless the suite ran with the label set. 3. **Per-job least privilege on the GitHub token.** Each workflow declares `permissions: {}` at the top, and each job declares only what it needs. This follows the hardening pattern described at . @@ -17,20 +17,21 @@ These three goals do not compose cleanly: the safety goal forces `push: pull-req | File | Trigger | Role | |---|---|---| | `.github/copy-pr-bot.yaml` | (config) | Tells copy-pr-bot to mirror trusted PRs into `pull-request/` branches. Pre-existed. | -| `.github/workflows/branch-e2e.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Runs non-GPU E2E on `build-arm64`. | +| `.github/workflows/branch-checks.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Runs required branch checks on `linux-amd64-cpu8` and `linux-arm64-cpu8`. | +| `.github/workflows/branch-e2e.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Runs non-GPU E2E on `linux-arm64-cpu8`. | | `.github/workflows/test-gpu.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Runs GPU E2E on self-hosted GPU runners. | -| `.github/workflows/shadow-branch-e2e.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Non-required shared-runner E2E shadow coverage for OS-49 Phase 5. | +| `.github/workflows/shadow-branch-e2e.yml` | `push: pull-request/[0-9]+` + `workflow_dispatch` | Historical non-required shared-runner E2E shadow coverage for OS-49 Phase 5. | | `.github/actions/pr-gate/action.yml` | (composite) | Resolves PR metadata for a `pull-request/` push and decides whether the run should proceed. Label enforcement is optional, so non-required shadows can validate mirror metadata without introducing another PR label. | | `.github/workflows/e2e-gate.yml` | `pull_request` + `workflow_run` | Posts the required `E2E Gate` check on the PR. Re-evaluates after the gated workflow completes. | | `.github/workflows/e2e-gate-check.yml` | `workflow_call` | Reusable gate logic shared by E2E and GPU E2E. | | `.github/workflows/e2e-label-help.yml` | `pull_request_target: [labeled]` | Posts a PR comment when a `test:e2e*` label is applied, telling the maintainer the next manual step (re-run an existing run, or `/ok to test ` to refresh the mirror). Does *not* dispatch the workflow itself - see "Why we don't auto-dispatch" below. | | `.github/workflows/e2e-test.yml`, `e2e-gpu-test.yaml`, `docker-build.yml` | `workflow_call` | Reusable worker workflows. Unchanged by this design - called from the gated workflows and from release workflows. | -## OS-49 shadow runner coverage +## OS-49 runner migration -OS-49 Phase 5 adds non-required shadow workflows for the non-release workflows being prepared for shared-runner cutover. They all use `workflow_dispatch` for manual bake runs and `push: pull-request/[0-9]+` for copy-pr-bot mirrored PRs. +OS-49 Phase 5 added non-required shadow workflows for the non-release workflows being prepared for shared-runner cutover. Phase 6 promotes the validated shared-runner path into the real non-release workflows. -`shadow-branch-checks.yml` and `shadow-ci-image.yml` use `pr-gate` without a required label. That still verifies the mirror SHA matches the source PR head SHA, but does not require a new GitHub label for every ordinary CI shadow run. `shadow-branch-e2e.yml` keeps the existing `test:e2e` gate because it publishes temporary images and runs the expensive E2E suite. It shadows the top-level `branch-e2e.yml` workflow, which already exercises the reusable `e2e-test.yml` worker path, so Phase 5 does not keep a second direct `e2e-test.yml` shadow workflow. +`branch-checks.yml` uses `pr-gate` without a required label. That still verifies the mirror SHA matches the source PR head SHA, but does not require a new GitHub label for ordinary required checks. `branch-e2e.yml` keeps the existing `test:e2e` gate because it publishes temporary images and runs the expensive E2E suite. `ci-image.yml` now builds amd64 and arm64 CI images natively on shared CPU runners and merges the multi-arch manifest after both per-arch images are pushed. ## Trigger taxonomy @@ -147,10 +148,13 @@ Labels persist as PR metadata and survive re-runs and force-pushes. Comment-base ## Permission posture -Every workflow declares `permissions: {}` at the top. Per-job grants are the minimum needed: +The gated E2E workflows declare `permissions: {}` at the top. Branch checks and CI image publishing use the minimum workflow/job grants needed for checkout, package pulls, and package pushes. | Workflow | Job | Grants | |---|---|---| +| `branch-checks.yml` | workflow default | `contents: read`, `packages: read` | +| | `pr_metadata` | `contents: read`, `pull-requests: read` | +| `ci-image.yml` | workflow default | `contents: read`, `packages: write` | | `branch-e2e.yml`, `test-gpu.yml` | `pr_metadata` | `contents: read`, `pull-requests: read` | | | `build-*` | `contents: read`, `packages: write` | | | `e2e*` | `contents: read`, `packages: read` |