From f6b147b66a01c68dcc829550e4e7b177c4abda4a Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 17:52:11 +0000 Subject: [PATCH 1/5] Add per-crate code-size report as a sticky PR comment Adds a Code Size CI job that runs tokei over the workspace and posts a single collapsible PR comment: a one-line total in the summary, with the full per-crate line-count breakdown (and deltas against the base) on expand. Nested crates are attributed to their longest path prefix so they are not double counted. Signed-off-by: Joe Isaacs --- .github/workflows/code-size.yml | 56 +++++++++++++++++ scripts/compare-loc.py | 106 ++++++++++++++++++++++++++++++++ scripts/crate-loc.py | 96 +++++++++++++++++++++++++++++ 3 files changed, 258 insertions(+) create mode 100644 .github/workflows/code-size.yml create mode 100644 scripts/compare-loc.py create mode 100644 scripts/crate-loc.py diff --git a/.github/workflows/code-size.yml b/.github/workflows/code-size.yml new file mode 100644 index 00000000000..7061dd68b63 --- /dev/null +++ b/.github/workflows/code-size.yml @@ -0,0 +1,56 @@ +# Reports Rust lines of code per crate as a sticky PR comment. +# Runs on every push to a pull request and updates a single collapsible comment. + +name: Code Size + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +on: [pull_request] + +permissions: + contents: read + pull-requests: write # for commenting on PRs + +jobs: + code-size: + name: Per-crate LOC + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout HEAD + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + path: head + ref: ${{ github.event.pull_request.head.sha }} + + - name: Checkout base + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + with: + path: base + ref: ${{ github.event.pull_request.base.sha }} + + - name: Install tokei + uses: taiki-e/install-action@7be9fd86bd1707236395105d6e9329dd1511a7e1 # v2 + with: + tool: tokei + + - name: Compute and compare LOC + shell: bash + run: | + set -Eeu -o pipefail -x + + # Always use HEAD's scripts so the report format is consistent. + python3 head/scripts/crate-loc.py head > head-loc.json + python3 head/scripts/crate-loc.py base > base-loc.json + + python3 head/scripts/compare-loc.py head-loc.json --base-file base-loc.json > comment.md + cat comment.md >> "$GITHUB_STEP_SUMMARY" + + - name: Comment PR + if: github.event.pull_request.head.repo.fork == false + uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3 + with: + file-path: comment.md + comment-tag: code-size-comment diff --git a/scripts/compare-loc.py b/scripts/compare-loc.py new file mode 100644 index 00000000000..7c319816e34 --- /dev/null +++ b/scripts/compare-loc.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.11" +# dependencies = [] +# /// + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +"""Render per-crate lines-of-code as a collapsible markdown comment. + +Takes the JSON produced by ``crate-loc.py`` for HEAD and (optionally) a base +revision, and prints a single ``
`` block: the ```` is a +one-line total so the comment stays compact, and expanding it reveals the full +per-crate table with deltas against the base. +""" + +import argparse +import json + + +def fmt_delta(delta: int) -> str: + """Format a signed line-count delta, or an em dash when unchanged.""" + if delta == 0: + return "—" + return f"{'+' if delta > 0 else '−'}{abs(delta):,}" + + +def fmt_pct(base: int, head: int) -> str: + """Format a percentage change, handling newly added crates.""" + if base == 0: + return "new" if head > 0 else "—" + if head == base: + return "—" + pct = (head / base - 1) * 100 + return f"{'+' if pct > 0 else '−'}{abs(pct):.1f}%" + + +def main() -> None: + parser = argparse.ArgumentParser(description="Render per-crate LOC as a markdown comment") + parser.add_argument("head_file", help="LOC JSON for HEAD") + parser.add_argument("--base-file", help="LOC JSON for the base revision", default=None) + args = parser.parse_args() + + with open(args.head_file) as f: + head = json.load(f) + + base = {} + if args.base_file: + try: + with open(args.base_file) as f: + base = json.load(f) + except FileNotFoundError: + base = {} + have_base = bool(base) + + crates = sorted(set(head) | set(base)) + rows = [] + for crate in crates: + h = head.get(crate, 0) + b = base.get(crate, 0) + rows.append((crate, b, h, h - b)) + + total_head = sum(h for _, _, h, _ in rows) + total_base = sum(b for _, b, _, _ in rows) + total_delta = total_head - total_base + n_crates = sum(1 for _, _, h, _ in rows if h > 0) + + # Largest movers first, then largest crates. + rows.sort(key=lambda r: (abs(r[3]), r[2]), reverse=True) + + if have_base and total_delta != 0: + summary = ( + f"Code size: {total_head:,} lines of Rust across {n_crates} crates " + f"({fmt_delta(total_delta)}, {fmt_pct(total_base, total_head)})" + ) + else: + summary = f"Code size: {total_head:,} lines of Rust across {n_crates} crates" + + print("
") + print(f"{summary}") + print("") + print("
") + print("") + + if have_base: + print("| Crate | Lines | Δ | % |") + print("|-------|------:|--:|--:|") + for crate, b, h, d in rows: + print(f"| `{crate}` | {h:,} | {fmt_delta(d)} | {fmt_pct(b, h)} |") + print("") + print(f"**Total:** {total_base:,} → {total_head:,} ({fmt_delta(total_delta)})") + else: + print("| Crate | Lines |") + print("|-------|------:|") + for crate, _, h, _ in rows: + print(f"| `{crate}` | {h:,} |") + print("") + print(f"**Total:** {total_head:,} lines") + + print("") + print("
") + + +if __name__ == "__main__": + main() diff --git a/scripts/crate-loc.py b/scripts/crate-loc.py new file mode 100644 index 00000000000..aa422d454ba --- /dev/null +++ b/scripts/crate-loc.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.11" +# dependencies = [] +# /// + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +"""Compute Rust lines of code per workspace crate. + +Reads the workspace members from a repository's root ``Cargo.toml``, runs +``tokei`` once over the tree, and attributes each Rust source file to the +crate whose directory is its longest path prefix (so nested crates are not +double counted). Emits a JSON object mapping crate path to code-line count on +stdout. +""" + +import argparse +import json +import subprocess +import tomllib +from pathlib import Path + + +def workspace_crates(repo_root: Path) -> list[str]: + """Return workspace member directories relative to ``repo_root``.""" + with open(repo_root / "Cargo.toml", "rb") as f: + manifest = tomllib.load(f) + + members = manifest.get("workspace", {}).get("members", []) + crates: set[str] = set() + for member in members: + # Members may contain globs such as "encodings/*". + matches = [member] if "*" not in member else [ + str(p.relative_to(repo_root)) for p in sorted(repo_root.glob(member)) + ] + for candidate in matches: + if (repo_root / candidate / "Cargo.toml").is_file(): + crates.add(candidate.replace("\\", "/")) + return sorted(crates) + + +def run_tokei(repo_root: Path) -> dict: + """Run ``tokei`` over ``repo_root`` and return its parsed JSON output.""" + result = subprocess.run( + ["tokei", "--output", "json", "--files", str(repo_root)], + capture_output=True, + text=True, + check=True, + ) + return json.loads(result.stdout) + + +def crate_for(rel_path: str, crate_dirs: list[str]) -> str | None: + """Find the crate whose directory is the longest prefix of ``rel_path``.""" + parts = rel_path.split("/") + for crate in crate_dirs: + crate_parts = crate.split("/") + if parts[: len(crate_parts)] == crate_parts: + return crate + return None + + +def main() -> None: + parser = argparse.ArgumentParser(description="Compute Rust LOC per workspace crate") + parser.add_argument("repo_root", help="Path to the repository root") + args = parser.parse_args() + + repo_root = Path(args.repo_root).resolve() + crates = workspace_crates(repo_root) + # Match the most deeply nested crate first. + crate_dirs = sorted(crates, key=lambda c: c.count("/"), reverse=True) + + tokei = run_tokei(repo_root) + rust = tokei.get("Rust") + if rust is None: + print(json.dumps({crate: 0 for crate in crates})) + return + + loc = {crate: 0 for crate in crates} + for report in rust.get("reports", []): + rel = Path(report["name"]).resolve() + try: + rel_path = rel.relative_to(repo_root).as_posix() + except ValueError: + continue + crate = crate_for(rel_path, crate_dirs) + if crate is not None: + loc[crate] += report["stats"]["code"] + + print(json.dumps(loc, indent=2, sort_keys=True)) + + +if __name__ == "__main__": + main() From b4533d33b2324c67976a9e51acd403906b6f5d98 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 18:08:00 +0000 Subject: [PATCH 2/5] Add per-crate binary (.text) size report as a sticky PR comment Adds a Crate Binary Size CI job that builds the datafusion-bench binary on stable and runs cargo-bloat to attribute its machine code back to each first-party Vortex crate. Posts a single collapsible PR comment: a one-line Vortex total in the summary, with the full per-crate .text breakdown on expand. Third-party crates (datafusion, arrow, tokio, std) are filtered out using the workspace member set from cargo metadata. Signed-off-by: Joe Isaacs --- .github/workflows/crate-bloat.yml | 50 +++++++++++++++++ scripts/crate-bloat.py | 90 +++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 .github/workflows/crate-bloat.yml create mode 100644 scripts/crate-bloat.py diff --git a/.github/workflows/crate-bloat.yml b/.github/workflows/crate-bloat.yml new file mode 100644 index 00000000000..aad83c12236 --- /dev/null +++ b/.github/workflows/crate-bloat.yml @@ -0,0 +1,50 @@ +# Reports compiled machine-code (.text) size per Vortex crate as a sticky PR comment. +# Builds the datafusion-bench binary on stable and attributes its .text back to +# each first-party crate with cargo-bloat. + +name: Crate Binary Size + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +on: [pull_request] + +permissions: + contents: read + pull-requests: write # for commenting on PRs + +jobs: + bloat: + name: Per-crate .text size + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 + + - uses: ./.github/actions/setup-rust + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Install cargo-bloat + uses: taiki-e/cache-cargo-install-action@66c9585ef5ca780ee69399975a5e911f47905995 + with: + tool: cargo-bloat + + - name: Measure per-crate binary size + shell: bash + run: | + set -Eeu -o pipefail -x + + cargo bloat --package datafusion-bench --bin datafusion-bench \ + --release --crates -n 0 --message-format json > bloat.json + + python3 scripts/crate-bloat.py bloat.json > comment.md + cat comment.md >> "$GITHUB_STEP_SUMMARY" + + - name: Comment PR + if: github.event.pull_request.head.repo.fork == false + uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3 + with: + file-path: comment.md + comment-tag: crate-bloat-comment diff --git a/scripts/crate-bloat.py b/scripts/crate-bloat.py new file mode 100644 index 00000000000..a8298a930cb --- /dev/null +++ b/scripts/crate-bloat.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +# /// script +# requires-python = ">=3.11" +# dependencies = [] +# /// + +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +"""Render per-crate compiled (.text) size as a collapsible markdown comment. + +Consumes the JSON produced by ``cargo bloat --crates --message-format json`` +for a linked binary, keeps only first-party workspace crates, and prints a +single ``
`` block: the ```` is a one-line total so the +comment stays compact, and expanding it reveals the full per-crate breakdown +of machine code attributed to each Vortex crate. +""" + +import argparse +import json +import subprocess + + +def fmt_size(size_bytes: int) -> str: + """Format a byte count using binary units.""" + if size_bytes >= 1024**2: + return f"{size_bytes / 1024**2:.2f} MiB" + if size_bytes >= 1024: + return f"{size_bytes / 1024:.1f} KiB" + return f"{size_bytes} B" + + +def workspace_crate_names(manifest_path: str) -> set[str]: + """Return the set of first-party crate names (as cargo-bloat reports them).""" + out = subprocess.run( + ["cargo", "metadata", "--no-deps", "--format-version", "1", "--manifest-path", manifest_path], + capture_output=True, + text=True, + check=True, + ) + metadata = json.loads(out.stdout) + names: set[str] = set() + for pkg in metadata["packages"]: + names.add(pkg["name"].replace("-", "_")) + for target in pkg["targets"]: + names.add(target["name"].replace("-", "_")) + return names + + +def main() -> None: + parser = argparse.ArgumentParser(description="Render per-crate .text size as a markdown comment") + parser.add_argument("bloat_file", help="cargo-bloat --crates JSON output") + parser.add_argument("--manifest-path", default="Cargo.toml", help="Workspace Cargo.toml") + parser.add_argument("--target-name", default="datafusion-bench", help="Binary the sizes are measured from") + args = parser.parse_args() + + with open(args.bloat_file) as f: + bloat = json.load(f) + + workspace = workspace_crate_names(args.manifest_path) + rows = [(c["name"], c["size"]) for c in bloat.get("crates", []) if c["name"] in workspace] + rows.sort(key=lambda r: r[1], reverse=True) + + vortex_text = sum(size for _, size in rows) + total_text = bloat.get("text-section-size", 0) + share = f"{vortex_text / total_text * 100:.0f}%" if total_text else "?" + + summary = ( + f"Binary size ({args.target_name}, release): Vortex crates = {fmt_size(vortex_text)} " + f"of .text across {len(rows)} crates ({share} of the {fmt_size(total_text)} binary)" + ) + + print("
") + print(f"{summary}") + print("") + print("
") + print("") + print("| Crate | .text | % of Vortex |") + print("|-------|------:|------------:|") + for name, size in rows: + pct = f"{size / vortex_text * 100:.1f}%" if vortex_text else "—" + print(f"| `{name}` | {fmt_size(size)} | {pct} |") + print("") + print(f"**Vortex total:** {fmt_size(vortex_text)} of the {fmt_size(total_text)} binary `.text`") + print("") + print("
") + + +if __name__ == "__main__": + main() From f73d56bbd26b39fb76f5b95f75e2fe67822d7863 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 18:13:30 +0000 Subject: [PATCH 3/5] Compare crate binary size against develop; drop LOC report Builds datafusion-bench for both the PR head and develop on the same runner (reusing the target directory) and reports the per-crate .text delta against develop in the sticky PR comment. Removes the tokei lines-of-code report and its workflow, leaving binary size as the sole code-size metric. Signed-off-by: Joe Isaacs --- .github/workflows/code-size.yml | 56 ---------------- .github/workflows/crate-bloat.yml | 23 +++++-- scripts/compare-loc.py | 106 ----------------------------- scripts/crate-bloat.py | 107 ++++++++++++++++++++++-------- scripts/crate-loc.py | 96 --------------------------- 5 files changed, 99 insertions(+), 289 deletions(-) delete mode 100644 .github/workflows/code-size.yml delete mode 100644 scripts/compare-loc.py delete mode 100644 scripts/crate-loc.py diff --git a/.github/workflows/code-size.yml b/.github/workflows/code-size.yml deleted file mode 100644 index 7061dd68b63..00000000000 --- a/.github/workflows/code-size.yml +++ /dev/null @@ -1,56 +0,0 @@ -# Reports Rust lines of code per crate as a sticky PR comment. -# Runs on every push to a pull request and updates a single collapsible comment. - -name: Code Size - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -on: [pull_request] - -permissions: - contents: read - pull-requests: write # for commenting on PRs - -jobs: - code-size: - name: Per-crate LOC - runs-on: ubuntu-latest - timeout-minutes: 10 - steps: - - name: Checkout HEAD - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - with: - path: head - ref: ${{ github.event.pull_request.head.sha }} - - - name: Checkout base - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 - with: - path: base - ref: ${{ github.event.pull_request.base.sha }} - - - name: Install tokei - uses: taiki-e/install-action@7be9fd86bd1707236395105d6e9329dd1511a7e1 # v2 - with: - tool: tokei - - - name: Compute and compare LOC - shell: bash - run: | - set -Eeu -o pipefail -x - - # Always use HEAD's scripts so the report format is consistent. - python3 head/scripts/crate-loc.py head > head-loc.json - python3 head/scripts/crate-loc.py base > base-loc.json - - python3 head/scripts/compare-loc.py head-loc.json --base-file base-loc.json > comment.md - cat comment.md >> "$GITHUB_STEP_SUMMARY" - - - name: Comment PR - if: github.event.pull_request.head.repo.fork == false - uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3 - with: - file-path: comment.md - comment-tag: code-size-comment diff --git a/.github/workflows/crate-bloat.yml b/.github/workflows/crate-bloat.yml index aad83c12236..91c2e8333d1 100644 --- a/.github/workflows/crate-bloat.yml +++ b/.github/workflows/crate-bloat.yml @@ -31,15 +31,30 @@ jobs: with: tool: cargo-bloat - - name: Measure per-crate binary size + - name: Measure per-crate binary size vs develop shell: bash run: | set -Eeu -o pipefail -x - cargo bloat --package datafusion-bench --bin datafusion-bench \ - --release --crates -n 0 --message-format json > bloat.json + bloat() { + cargo bloat --package datafusion-bench --bin datafusion-bench \ + --release --crates -n 0 --message-format json + } - python3 scripts/crate-bloat.py bloat.json > comment.md + head_sha=$(git rev-parse HEAD) + + # Measure the PR head. + bloat > bloat-head.json + + # Measure develop on the same machine, reusing the target directory so + # only changed crates are rebuilt. + git fetch --no-tags --depth=1 origin develop + git checkout -f FETCH_HEAD + bloat > bloat-develop.json + + # Restore the PR head and render with its copy of the script. + git checkout -f "$head_sha" + python3 scripts/crate-bloat.py bloat-head.json --base-file bloat-develop.json > comment.md cat comment.md >> "$GITHUB_STEP_SUMMARY" - name: Comment PR diff --git a/scripts/compare-loc.py b/scripts/compare-loc.py deleted file mode 100644 index 7c319816e34..00000000000 --- a/scripts/compare-loc.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 -# /// script -# requires-python = ">=3.11" -# dependencies = [] -# /// - -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors - -"""Render per-crate lines-of-code as a collapsible markdown comment. - -Takes the JSON produced by ``crate-loc.py`` for HEAD and (optionally) a base -revision, and prints a single ``
`` block: the ```` is a -one-line total so the comment stays compact, and expanding it reveals the full -per-crate table with deltas against the base. -""" - -import argparse -import json - - -def fmt_delta(delta: int) -> str: - """Format a signed line-count delta, or an em dash when unchanged.""" - if delta == 0: - return "—" - return f"{'+' if delta > 0 else '−'}{abs(delta):,}" - - -def fmt_pct(base: int, head: int) -> str: - """Format a percentage change, handling newly added crates.""" - if base == 0: - return "new" if head > 0 else "—" - if head == base: - return "—" - pct = (head / base - 1) * 100 - return f"{'+' if pct > 0 else '−'}{abs(pct):.1f}%" - - -def main() -> None: - parser = argparse.ArgumentParser(description="Render per-crate LOC as a markdown comment") - parser.add_argument("head_file", help="LOC JSON for HEAD") - parser.add_argument("--base-file", help="LOC JSON for the base revision", default=None) - args = parser.parse_args() - - with open(args.head_file) as f: - head = json.load(f) - - base = {} - if args.base_file: - try: - with open(args.base_file) as f: - base = json.load(f) - except FileNotFoundError: - base = {} - have_base = bool(base) - - crates = sorted(set(head) | set(base)) - rows = [] - for crate in crates: - h = head.get(crate, 0) - b = base.get(crate, 0) - rows.append((crate, b, h, h - b)) - - total_head = sum(h for _, _, h, _ in rows) - total_base = sum(b for _, b, _, _ in rows) - total_delta = total_head - total_base - n_crates = sum(1 for _, _, h, _ in rows if h > 0) - - # Largest movers first, then largest crates. - rows.sort(key=lambda r: (abs(r[3]), r[2]), reverse=True) - - if have_base and total_delta != 0: - summary = ( - f"Code size: {total_head:,} lines of Rust across {n_crates} crates " - f"({fmt_delta(total_delta)}, {fmt_pct(total_base, total_head)})" - ) - else: - summary = f"Code size: {total_head:,} lines of Rust across {n_crates} crates" - - print("
") - print(f"{summary}") - print("") - print("
") - print("") - - if have_base: - print("| Crate | Lines | Δ | % |") - print("|-------|------:|--:|--:|") - for crate, b, h, d in rows: - print(f"| `{crate}` | {h:,} | {fmt_delta(d)} | {fmt_pct(b, h)} |") - print("") - print(f"**Total:** {total_base:,} → {total_head:,} ({fmt_delta(total_delta)})") - else: - print("| Crate | Lines |") - print("|-------|------:|") - for crate, _, h, _ in rows: - print(f"| `{crate}` | {h:,} |") - print("") - print(f"**Total:** {total_head:,} lines") - - print("") - print("
") - - -if __name__ == "__main__": - main() diff --git a/scripts/crate-bloat.py b/scripts/crate-bloat.py index a8298a930cb..e9f82b5c5e2 100644 --- a/scripts/crate-bloat.py +++ b/scripts/crate-bloat.py @@ -11,9 +11,10 @@ Consumes the JSON produced by ``cargo bloat --crates --message-format json`` for a linked binary, keeps only first-party workspace crates, and prints a -single ``
`` block: the ```` is a one-line total so the -comment stays compact, and expanding it reveals the full per-crate breakdown -of machine code attributed to each Vortex crate. +single ``
`` block: the ```` is a one-line Vortex total so the +comment stays compact, and expanding it reveals the full per-crate breakdown of +machine code attributed to each Vortex crate, with deltas against ``develop`` +when a base measurement is provided. """ import argparse @@ -23,13 +24,30 @@ def fmt_size(size_bytes: int) -> str: """Format a byte count using binary units.""" - if size_bytes >= 1024**2: + if abs(size_bytes) >= 1024**2: return f"{size_bytes / 1024**2:.2f} MiB" - if size_bytes >= 1024: + if abs(size_bytes) >= 1024: return f"{size_bytes / 1024:.1f} KiB" return f"{size_bytes} B" +def fmt_delta(delta: int) -> str: + """Format a signed size delta, or an em dash when unchanged.""" + if delta == 0: + return "—" + return f"{'+' if delta > 0 else '−'}{fmt_size(abs(delta))}" + + +def fmt_pct(base: int, head: int) -> str: + """Format a percentage change, handling newly introduced crates.""" + if base == 0: + return "new" if head > 0 else "—" + if head == base: + return "—" + pct = (head / base - 1) * 100 + return f"{'+' if pct > 0 else '−'}{abs(pct):.1f}%" + + def workspace_crate_names(manifest_path: str) -> set[str]: """Return the set of first-party crate names (as cargo-bloat reports them).""" out = subprocess.run( @@ -47,41 +65,76 @@ def workspace_crate_names(manifest_path: str) -> set[str]: return names +def crate_sizes(bloat_file: str, workspace: set[str]) -> dict[str, int]: + """Load a cargo-bloat JSON file and keep only first-party crate sizes.""" + with open(bloat_file) as f: + bloat = json.load(f) + sizes = {c["name"]: c["size"] for c in bloat.get("crates", []) if c["name"] in workspace} + sizes["__text_section_size__"] = bloat.get("text-section-size", 0) + return sizes + + def main() -> None: parser = argparse.ArgumentParser(description="Render per-crate .text size as a markdown comment") - parser.add_argument("bloat_file", help="cargo-bloat --crates JSON output") + parser.add_argument("head_file", help="cargo-bloat --crates JSON for HEAD") + parser.add_argument("--base-file", default=None, help="cargo-bloat --crates JSON for develop") parser.add_argument("--manifest-path", default="Cargo.toml", help="Workspace Cargo.toml") parser.add_argument("--target-name", default="datafusion-bench", help="Binary the sizes are measured from") args = parser.parse_args() - with open(args.bloat_file) as f: - bloat = json.load(f) - workspace = workspace_crate_names(args.manifest_path) - rows = [(c["name"], c["size"]) for c in bloat.get("crates", []) if c["name"] in workspace] - rows.sort(key=lambda r: r[1], reverse=True) - - vortex_text = sum(size for _, size in rows) - total_text = bloat.get("text-section-size", 0) - share = f"{vortex_text / total_text * 100:.0f}%" if total_text else "?" - - summary = ( - f"Binary size ({args.target_name}, release): Vortex crates = {fmt_size(vortex_text)} " - f"of .text across {len(rows)} crates ({share} of the {fmt_size(total_text)} binary)" - ) + head = crate_sizes(args.head_file, workspace) + base = crate_sizes(args.base_file, workspace) if args.base_file else {} + have_base = bool(base) + + total_text = head.pop("__text_section_size__", 0) + base.pop("__text_section_size__", 0) + + crates = sorted(set(head) | set(base)) + rows = [(c, base.get(c, 0), head.get(c, 0), head.get(c, 0) - base.get(c, 0)) for c in crates] + + vortex_head = sum(h for _, _, h, _ in rows) + vortex_base = sum(b for _, b, _, _ in rows) + vortex_delta = vortex_head - vortex_base + n_crates = sum(1 for _, _, h, _ in rows if h > 0) + share = f"{vortex_head / total_text * 100:.0f}%" if total_text else "?" + + # Largest movers first, then largest crates. + rows.sort(key=lambda r: (abs(r[3]), r[2]), reverse=True) + + if have_base and vortex_delta != 0: + summary = ( + f"Binary size ({args.target_name}, release vs develop): Vortex crates = " + f"{fmt_size(vortex_head)} of .text across {n_crates} crates " + f"({fmt_delta(vortex_delta)}, {fmt_pct(vortex_base, vortex_head)}, {share} of binary)" + ) + else: + suffix = " vs develop: no change" if have_base else "" + summary = ( + f"Binary size ({args.target_name}, release): Vortex crates = {fmt_size(vortex_head)} " + f"of .text across {n_crates} crates ({share} of binary){suffix}" + ) print("
") print(f"{summary}") print("") print("
") print("") - print("| Crate | .text | % of Vortex |") - print("|-------|------:|------------:|") - for name, size in rows: - pct = f"{size / vortex_text * 100:.1f}%" if vortex_text else "—" - print(f"| `{name}` | {fmt_size(size)} | {pct} |") - print("") - print(f"**Vortex total:** {fmt_size(vortex_text)} of the {fmt_size(total_text)} binary `.text`") + if have_base: + print("| Crate | .text | Δ vs develop | % |") + print("|-------|------:|-------------:|--:|") + for name, b, h, d in rows: + print(f"| `{name}` | {fmt_size(h)} | {fmt_delta(d)} | {fmt_pct(b, h)} |") + print("") + print(f"**Vortex total:** {fmt_size(vortex_base)} → {fmt_size(vortex_head)} ({fmt_delta(vortex_delta)})") + else: + print("| Crate | .text | % of Vortex |") + print("|-------|------:|------------:|") + for name, _, h, _ in rows: + pct = f"{h / vortex_head * 100:.1f}%" if vortex_head else "—" + print(f"| `{name}` | {fmt_size(h)} | {pct} |") + print("") + print(f"**Vortex total:** {fmt_size(vortex_head)} of the {fmt_size(total_text)} binary `.text`") print("") print("
") diff --git a/scripts/crate-loc.py b/scripts/crate-loc.py deleted file mode 100644 index aa422d454ba..00000000000 --- a/scripts/crate-loc.py +++ /dev/null @@ -1,96 +0,0 @@ -#!/usr/bin/env python3 -# /// script -# requires-python = ">=3.11" -# dependencies = [] -# /// - -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors - -"""Compute Rust lines of code per workspace crate. - -Reads the workspace members from a repository's root ``Cargo.toml``, runs -``tokei`` once over the tree, and attributes each Rust source file to the -crate whose directory is its longest path prefix (so nested crates are not -double counted). Emits a JSON object mapping crate path to code-line count on -stdout. -""" - -import argparse -import json -import subprocess -import tomllib -from pathlib import Path - - -def workspace_crates(repo_root: Path) -> list[str]: - """Return workspace member directories relative to ``repo_root``.""" - with open(repo_root / "Cargo.toml", "rb") as f: - manifest = tomllib.load(f) - - members = manifest.get("workspace", {}).get("members", []) - crates: set[str] = set() - for member in members: - # Members may contain globs such as "encodings/*". - matches = [member] if "*" not in member else [ - str(p.relative_to(repo_root)) for p in sorted(repo_root.glob(member)) - ] - for candidate in matches: - if (repo_root / candidate / "Cargo.toml").is_file(): - crates.add(candidate.replace("\\", "/")) - return sorted(crates) - - -def run_tokei(repo_root: Path) -> dict: - """Run ``tokei`` over ``repo_root`` and return its parsed JSON output.""" - result = subprocess.run( - ["tokei", "--output", "json", "--files", str(repo_root)], - capture_output=True, - text=True, - check=True, - ) - return json.loads(result.stdout) - - -def crate_for(rel_path: str, crate_dirs: list[str]) -> str | None: - """Find the crate whose directory is the longest prefix of ``rel_path``.""" - parts = rel_path.split("/") - for crate in crate_dirs: - crate_parts = crate.split("/") - if parts[: len(crate_parts)] == crate_parts: - return crate - return None - - -def main() -> None: - parser = argparse.ArgumentParser(description="Compute Rust LOC per workspace crate") - parser.add_argument("repo_root", help="Path to the repository root") - args = parser.parse_args() - - repo_root = Path(args.repo_root).resolve() - crates = workspace_crates(repo_root) - # Match the most deeply nested crate first. - crate_dirs = sorted(crates, key=lambda c: c.count("/"), reverse=True) - - tokei = run_tokei(repo_root) - rust = tokei.get("Rust") - if rust is None: - print(json.dumps({crate: 0 for crate in crates})) - return - - loc = {crate: 0 for crate in crates} - for report in rust.get("reports", []): - rel = Path(report["name"]).resolve() - try: - rel_path = rel.relative_to(repo_root).as_posix() - except ValueError: - continue - crate = crate_for(rel_path, crate_dirs) - if crate is not None: - loc[crate] += report["stats"]["code"] - - print(json.dumps(loc, indent=2, sort_keys=True)) - - -if __name__ == "__main__": - main() From ff7a5fc93897ca8704c654e3c489b286437dda98 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 18:32:07 +0000 Subject: [PATCH 4/5] Collapse crate binary-size comment to one line when unchanged vs develop When the PR introduces no .text delta against develop, emit just the summary line instead of the full per-crate table to keep the comment quiet. Signed-off-by: Joe Isaacs --- scripts/crate-bloat.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/crate-bloat.py b/scripts/crate-bloat.py index e9f82b5c5e2..6dd14e81f40 100644 --- a/scripts/crate-bloat.py +++ b/scripts/crate-bloat.py @@ -115,6 +115,11 @@ def main() -> None: f"of .text across {n_crates} crates ({share} of binary){suffix}" ) + # Nothing changed against develop: keep the comment to a single line. + if have_base and vortex_delta == 0: + print(summary) + return + print("
") print(f"{summary}") print("") From 152f7d34fcb866c7b753b10beebcb2814144c5f0 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 18:34:29 +0000 Subject: [PATCH 5/5] Shorten crate binary-size summary line Collapsed line now reads "no code size change (datafusion-bench)" when unchanged, or "code size change +/-XX% (datafusion-bench)" with the per-crate details table on expand when it changed. Signed-off-by: Joe Isaacs --- scripts/crate-bloat.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/scripts/crate-bloat.py b/scripts/crate-bloat.py index 6dd14e81f40..da764f78019 100644 --- a/scripts/crate-bloat.py +++ b/scripts/crate-bloat.py @@ -96,24 +96,17 @@ def main() -> None: vortex_head = sum(h for _, _, h, _ in rows) vortex_base = sum(b for _, b, _, _ in rows) vortex_delta = vortex_head - vortex_base - n_crates = sum(1 for _, _, h, _ in rows if h > 0) - share = f"{vortex_head / total_text * 100:.0f}%" if total_text else "?" # Largest movers first, then largest crates. rows.sort(key=lambda r: (abs(r[3]), r[2]), reverse=True) - if have_base and vortex_delta != 0: - summary = ( - f"Binary size ({args.target_name}, release vs develop): Vortex crates = " - f"{fmt_size(vortex_head)} of .text across {n_crates} crates " - f"({fmt_delta(vortex_delta)}, {fmt_pct(vortex_base, vortex_head)}, {share} of binary)" - ) + if have_base: + if vortex_delta == 0: + summary = f"no code size change ({args.target_name})" + else: + summary = f"code size change {fmt_pct(vortex_base, vortex_head)} ({args.target_name})" else: - suffix = " vs develop: no change" if have_base else "" - summary = ( - f"Binary size ({args.target_name}, release): Vortex crates = {fmt_size(vortex_head)} " - f"of .text across {n_crates} crates ({share} of binary){suffix}" - ) + summary = f"code size {fmt_size(vortex_head)} ({args.target_name})" # Nothing changed against develop: keep the comment to a single line. if have_base and vortex_delta == 0: