diff --git a/.github/workflows/crate-bloat.yml b/.github/workflows/crate-bloat.yml new file mode 100644 index 00000000000..91c2e8333d1 --- /dev/null +++ b/.github/workflows/crate-bloat.yml @@ -0,0 +1,65 @@ +# Reports compiled machine-code (.text) size per Vortex crate as a sticky PR comment. +# Builds the datafusion-bench binary on stable and attributes its .text back to +# each first-party crate with cargo-bloat. + +name: Crate Binary Size + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +on: [pull_request] + +permissions: + contents: read + pull-requests: write # for commenting on PRs + +jobs: + bloat: + name: Per-crate .text size + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - uses: ./.github/actions/setup-rust + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install cargo-bloat + uses: taiki-e/cache-cargo-install-action@66c9585ef5ca780ee69399975a5e911f47905995 + with: + tool: cargo-bloat + + - name: Measure per-crate binary size vs develop + shell: bash + run: | + set -Eeu -o pipefail -x + + bloat() { + cargo bloat --package datafusion-bench --bin datafusion-bench \ + --release --crates -n 0 --message-format json + } + + head_sha=$(git rev-parse HEAD) + + # Measure the PR head. + bloat > bloat-head.json + + # Measure develop on the same machine, reusing the target directory so + # only changed crates are rebuilt. + git fetch --no-tags --depth=1 origin develop + git checkout -f FETCH_HEAD + bloat > bloat-develop.json + + # Restore the PR head and render with its copy of the script. + git checkout -f "$head_sha" + python3 scripts/crate-bloat.py bloat-head.json --base-file bloat-develop.json > comment.md + cat comment.md >> "$GITHUB_STEP_SUMMARY" + + - name: Comment PR + if: github.event.pull_request.head.repo.fork == false + uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3 + with: + file-path: comment.md + comment-tag: crate-bloat-comment diff --git a/scripts/crate-bloat.py b/scripts/crate-bloat.py new file mode 100644 index 00000000000..da764f78019 --- /dev/null +++ b/scripts/crate-bloat.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.11" +# dependencies = [] +# /// + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +"""Render per-crate compiled (.text) size as a collapsible markdown comment. + +Consumes the JSON produced by ``cargo bloat --crates --message-format json`` +for a linked binary, keeps only first-party workspace crates, and prints a +single ``
`` block: the ```` is a one-line Vortex total so the +comment stays compact, and expanding it reveals the full per-crate breakdown of +machine code attributed to each Vortex crate, with deltas against ``develop`` +when a base measurement is provided. +""" + +import argparse +import json +import subprocess + + +def fmt_size(size_bytes: int) -> str: + """Format a byte count using binary units.""" + if abs(size_bytes) >= 1024**2: + return f"{size_bytes / 1024**2:.2f} MiB" + if abs(size_bytes) >= 1024: + return f"{size_bytes / 1024:.1f} KiB" + return f"{size_bytes} B" + + +def fmt_delta(delta: int) -> str: + """Format a signed size delta, or an em dash when unchanged.""" + if delta == 0: + return "—" + return f"{'+' if delta > 0 else '−'}{fmt_size(abs(delta))}" + + +def fmt_pct(base: int, head: int) -> str: + """Format a percentage change, handling newly introduced crates.""" + if base == 0: + return "new" if head > 0 else "—" + if head == base: + return "—" + pct = (head / base - 1) * 100 + return f"{'+' if pct > 0 else '−'}{abs(pct):.1f}%" + + +def workspace_crate_names(manifest_path: str) -> set[str]: + """Return the set of first-party crate names (as cargo-bloat reports them).""" + out = subprocess.run( + ["cargo", "metadata", "--no-deps", "--format-version", "1", "--manifest-path", manifest_path], + capture_output=True, + text=True, + check=True, + ) + metadata = json.loads(out.stdout) + names: set[str] = set() + for pkg in metadata["packages"]: + names.add(pkg["name"].replace("-", "_")) + for target in pkg["targets"]: + names.add(target["name"].replace("-", "_")) + return names + + +def crate_sizes(bloat_file: str, workspace: set[str]) -> dict[str, int]: + """Load a cargo-bloat JSON file and keep only first-party crate sizes.""" + with open(bloat_file) as f: + bloat = json.load(f) + sizes = {c["name"]: c["size"] for c in bloat.get("crates", []) if c["name"] in workspace} + sizes["__text_section_size__"] = bloat.get("text-section-size", 0) + return sizes + + +def main() -> None: + parser = argparse.ArgumentParser(description="Render per-crate .text size as a markdown comment") + parser.add_argument("head_file", help="cargo-bloat --crates JSON for HEAD") + parser.add_argument("--base-file", default=None, help="cargo-bloat --crates JSON for develop") + parser.add_argument("--manifest-path", default="Cargo.toml", help="Workspace Cargo.toml") + parser.add_argument("--target-name", default="datafusion-bench", help="Binary the sizes are measured from") + args = parser.parse_args() + + workspace = workspace_crate_names(args.manifest_path) + head = crate_sizes(args.head_file, workspace) + base = crate_sizes(args.base_file, workspace) if args.base_file else {} + have_base = bool(base) + + total_text = head.pop("__text_section_size__", 0) + base.pop("__text_section_size__", 0) + + crates = sorted(set(head) | set(base)) + rows = [(c, base.get(c, 0), head.get(c, 0), head.get(c, 0) - base.get(c, 0)) for c in crates] + + vortex_head = sum(h for _, _, h, _ in rows) + vortex_base = sum(b for _, b, _, _ in rows) + vortex_delta = vortex_head - vortex_base + + # Largest movers first, then largest crates. + rows.sort(key=lambda r: (abs(r[3]), r[2]), reverse=True) + + if have_base: + if vortex_delta == 0: + summary = f"no code size change ({args.target_name})" + else: + summary = f"code size change {fmt_pct(vortex_base, vortex_head)} ({args.target_name})" + else: + summary = f"code size {fmt_size(vortex_head)} ({args.target_name})" + + # Nothing changed against develop: keep the comment to a single line. + if have_base and vortex_delta == 0: + print(summary) + return + + print("
") + print(f"{summary}") + print("") + print("
") + print("") + if have_base: + print("| Crate | .text | Δ vs develop | % |") + print("|-------|------:|-------------:|--:|") + for name, b, h, d in rows: + print(f"| `{name}` | {fmt_size(h)} | {fmt_delta(d)} | {fmt_pct(b, h)} |") + print("") + print(f"**Vortex total:** {fmt_size(vortex_base)} → {fmt_size(vortex_head)} ({fmt_delta(vortex_delta)})") + else: + print("| Crate | .text | % of Vortex |") + print("|-------|------:|------------:|") + for name, _, h, _ in rows: + pct = f"{h / vortex_head * 100:.1f}%" if vortex_head else "—" + print(f"| `{name}` | {fmt_size(h)} | {pct} |") + print("") + print(f"**Vortex total:** {fmt_size(vortex_head)} of the {fmt_size(total_text)} binary `.text`") + print("") + print("
") + + +if __name__ == "__main__": + main()