Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
6de7c30
feat: add CTable, a columnar in-memory table built on top of blosc2
Jacc4224 Mar 26, 2026
01e47f4
Merge pull request #604 from Jacc4224/ctable-new
FrancescAlted Mar 26, 2026
c05c2ec
Add a plan for declaring a simple schema for CTable objects
FrancescAlted Mar 26, 2026
725c28b
Add pydantic as a new dependency
FrancescAlted Mar 26, 2026
0efd450
Fix small formatting issues
FrancescAlted Mar 26, 2026
f504ad0
Simplify the plan for ctable schema
FrancescAlted Mar 26, 2026
46bf2e3
Disable wheel generation for each commit in this branch
FrancescAlted Mar 26, 2026
43bf562
Add a new plan on CTable persistence
FrancescAlted Mar 26, 2026
e84f7ac
_
Jacc4224 Mar 26, 2026
8de1870
_
Jacc4224 Mar 26, 2026
a8db18d
Testing
FrancescAlted Mar 26, 2026
dd154b1
Merge branch 'ctable3' of github.com:Blosc/python-blosc2 into my_ctable3
Jacc4224 Mar 26, 2026
ce65607
Written test
Jacc4224 Mar 26, 2026
b623f0e
Remove testing file
FrancescAlted Mar 26, 2026
b9e8c35
Merge branch 'ctable3' of github.com:Blosc/python-blosc2 into my_ctable3
Jacc4224 Mar 26, 2026
4ce8296
Schema layer:
Jacc4224 Apr 4, 2026
ee1d0c4
Persistency halfway done
Jacc4224 Apr 4, 2026
a422d72
CTable: full feature build-out (persistency, aggregates, mutations, …
Jacc4224 Apr 6, 2026
0472b3f
CTable: full feature build-out (persistency, aggregates, mutations, …
Jacc4224 Apr 6, 2026
34f8219
CSV compatibility implementation
Jacc4224 Apr 7, 2026
6bf1ec8
Persistent ctables.
Jacc4224 Apr 7, 2026
34c2eee
Collision bug fixed 1
Jacc4224 Apr 7, 2026
a3852b6
Merge pull request #614 from Jacc4224/my_ctable3
FrancescAlted Apr 8, 2026
14853ac
Remove large data files from repo
FrancescAlted Apr 8, 2026
66e35a4
Restore CI files from main
FrancescAlted Apr 8, 2026
0dc8697
Restore compatibility with numpy < 2
FrancescAlted Apr 8, 2026
457b0ff
Back CTable persistence with TreeStore and materialize it via blosc2.…
FrancescAlted Apr 15, 2026
5fc16b7
Relax DictStore and TreeStore path suffix requirements
FrancescAlted Apr 15, 2026
f7cd02e
Update CTable docs, examples, and benchmarks for TreeStore-backed per…
FrancescAlted Apr 15, 2026
465e855
Move store extension doc to plans/
FrancescAlted Apr 15, 2026
71d3240
Merge branch 'main' into ctable4
FrancescAlted Apr 15, 2026
d3148a1
Accelerate blosc2.open by trying the standard open first
FrancescAlted Apr 15, 2026
41f7a14
Nullable attribute in schema.
Jacc4224 Apr 15, 2026
bc4d4ff
Fix issues when array is a numpy array, not blosc2
FrancescAlted Apr 15, 2026
fcb9efa
For large temp arange arrays, use blosc2.arange instead of np.arange
FrancescAlted Apr 15, 2026
eaccd53
Shaving test suite run time by a little bit
FrancescAlted Apr 15, 2026
04f2577
Merge branch 'ctable4' of github.com:Blosc/python-blosc2 into ctable4
FrancescAlted Apr 15, 2026
2ff3140
Add persistent index support to CTable
FrancescAlted Apr 15, 2026
c44031d
Implement CTable indexing follow-ups
FrancescAlted Apr 15, 2026
fb66107
New InfoReporter for CTable. Example on how to use a .b2z file.
FrancescAlted Apr 15, 2026
2cb4295
New Column.__repr__() for a nice overview of the column
FrancescAlted Apr 15, 2026
efbba8a
Warn on implicit blosc2.open append mode
FrancescAlted Apr 15, 2026
f8021ae
Fancier CTable.info printed representation
FrancescAlted Apr 15, 2026
17f12c6
Add a TODO for removing FutureWarning path once blosc2.open() default…
FrancescAlted Apr 15, 2026
8dddc8c
Fix a regression when reopening a persisted Proxy with mode='r'
FrancescAlted Apr 15, 2026
6e75a47
Merge pull request #620 from Blosc/ctable-indexing
FrancescAlted Apr 15, 2026
034c0bf
Fix .b2z double-open corruption caused by GC-triggered repacking
FrancescAlted Apr 16, 2026
e58b4c7
Temporarily unpacking a .b2z file defaults now to the same dir as the…
FrancescAlted Apr 16, 2026
2b3eeff
Fix nullable validation, chunk sizing, print alignment, numpy mask su…
Jacc4224 Apr 16, 2026
8496c11
Merge branch 'ctable4' into my_ctable3
Jacc4224 Apr 16, 2026
4f509cf
Merge pull request #619 from Jacc4224/my_ctable3
FrancescAlted Apr 16, 2026
8d4603b
Fix some issues in tests
FrancescAlted Apr 16, 2026
e5e75d9
Merge branch 'main' into ctable4
FrancescAlted Apr 16, 2026
3fafab1
Fix GC-induced thread hang on macOS with Python 3.14
FrancescAlted Apr 16, 2026
7defba4
Break reference cycles and harden async chunk reader
FrancescAlted Apr 16, 2026
115985e
Break reference cycles and purge stale caches in ctable layer
FrancescAlted Apr 16, 2026
468177b
Fix for a re-entrant cache-cleanup bug in indexing code
FrancescAlted Apr 17, 2026
2fbd41a
Make test deterministic and closer to what it’s actually trying to ve…
FrancescAlted Apr 17, 2026
485ac50
Fix ruff warnings
FrancescAlted Apr 17, 2026
b407f92
Merge branch 'main' into ctable4
FrancescAlted Apr 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions bench/ctable/bench_append_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark: append() overhead introduced by the new schema pipeline
#
# The new append() path routes every row through:
# _normalize_row_input → validate_row (Pydantic) → _coerce_row_to_storage
#
# This benchmark isolates how much each step costs, and shows the
# total overhead vs the raw NDArray write speed.

from dataclasses import dataclass
from time import perf_counter

import numpy as np

import blosc2
from blosc2.schema_compiler import compile_schema
from blosc2.schema_validation import build_validator_model, validate_row


@dataclass
class Row:
    """Benchmark schema: three constrained columns validated on append().

    NOTE(review): `blosc2.field`/`int64`/`float64`/`bool` are project schema
    constructors; the `ge`/`le` bounds presumably become Pydantic validators —
    confirm against blosc2.schema_compiler.
    """

    # Non-negative 64-bit integer key (no default: must be supplied).
    id: int = blosc2.field(blosc2.int64(ge=0))
    # Score constrained to the inclusive range [0, 100].
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    # Boolean flag; defaults to True when omitted.
    active: bool = blosc2.field(blosc2.bool(), default=True)


# Deterministic synthetic input: N rows of (id, score, active) tuples.
N = 5_000
rng = np.random.default_rng(42)
data = [(int(i), float(rng.uniform(0, 100)), bool(i % 2)) for i in range(N)]
schema = compile_schema(Row)
build_validator_model(schema)  # warm up the Pydantic model cache

print(f"append() pipeline cost breakdown | N = {N:,} rows")
print("=" * 60)

# ── 1. Raw NDArray writes (no CTable overhead at all) ────────────────────────
# Pre-allocated per-column buffers mimicking the columnar storage layout.
ids = np.zeros(N, dtype=np.int64)
scores = np.zeros(N, dtype=np.float64)
flags = np.zeros(N, dtype=np.bool_)
mask = np.zeros(N, dtype=np.bool_)

start = perf_counter()
for idx, row in enumerate(data):
    # Element-wise writes on purpose: this is the per-row baseline.
    ids[idx], scores[idx], flags[idx] = row
    mask[idx] = True
t_raw = perf_counter() - start
print(f"{'Raw NumPy writes (baseline)':<40} {t_raw:.4f} s")

# ── 2. _normalize_row_input only ─────────────────────────────────────────────
# Time just the input-normalization step; validation disabled on the table.
t_obj = blosc2.CTable(Row, expected_size=N, validate=False)
start = perf_counter()
for entry in data:
    t_obj._normalize_row_input(entry)
t_normalize = perf_counter() - start
print(f"{'_normalize_row_input only':<40} {t_normalize:.4f} s ({t_normalize/t_raw:.1f}x baseline)")

# ── 3. Pydantic validate_row only ────────────────────────────────────────────
# Normalize everything up front so only the Pydantic step is on the clock.
row_dicts = [t_obj._normalize_row_input(entry) for entry in data]
start = perf_counter()
for normalized in row_dicts:
    validate_row(schema, normalized)
t_validate = perf_counter() - start
print(f"{'validate_row (Pydantic) only':<40} {t_validate:.4f} s ({t_validate/t_raw:.1f}x baseline)")

# ── 4. _coerce_row_to_storage only ───────────────────────────────────────────
# Reuses the pre-normalized dicts from step 3.
start = perf_counter()
for normalized in row_dicts:
    t_obj._coerce_row_to_storage(normalized)
t_coerce = perf_counter() - start
print(f"{'_coerce_row_to_storage only':<40} {t_coerce:.4f} s ({t_coerce/t_raw:.1f}x baseline)")

# ── 5. Full append(), validate=False (3 runs, take minimum) ─────────────────
RUNS = 3


def _timed_append(validate):
    """One full append() pass over *data* on a fresh table; returns seconds."""
    table = blosc2.CTable(Row, expected_size=N, validate=validate)
    start = perf_counter()
    for entry in data:
        table.append(entry)
    return perf_counter() - start


t_append_off = min(_timed_append(False) for _ in range(RUNS))
print(f"{'Full append(), validate=False':<40} {t_append_off:.4f} s ({t_append_off/t_raw:.1f}x baseline)")

# ── 6. Full append(), validate=True (3 runs, take minimum) ──────────────────
# Same pass as step 5 but with the Pydantic pipeline enabled.
t_append_on = float("inf")
for _ in range(RUNS):
    table = blosc2.CTable(Row, expected_size=N, validate=True)
    start = perf_counter()
    for entry in data:
        table.append(entry)
    t_append_on = min(t_append_on, perf_counter() - start)
print(f"{'Full append(), validate=True':<40} {t_append_on:.4f} s ({t_append_on/t_raw:.1f}x baseline)")

# ── Summary ──────────────────────────────────────────────────────────────────
print()
print("=" * 60)
# Validated minus non-validated append time; clamp tiny negative timing noise.
overhead = max(t_append_on - t_append_off, 0.0)
print(f"{'Pydantic overhead in append()':<40} {overhead:.4f} s")
if t_append_on > 0:
    print(f"{'Validation fraction of total':<40} {overhead/t_append_on*100:.1f}%")
print(f"{'Per-row Pydantic cost (isolated)':<40} {(t_validate/N)*1e6:.2f} µs/row")
print()
print(f"Note: append() is dominated by blosc2 I/O ({t_append_off/t_raw:.0f}x raw numpy),")
print("      not by the validation pipeline.")
print("      The main bottleneck is the last_true_pos backward scan per row.")
209 changes: 209 additions & 0 deletions bench/ctable/bench_pandas_roundtrip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark: pandas ↔ CTable round-trip (with on-disk persistence)
#
# Pipeline measured in four isolated steps:
#
# 1. pandas → CTable : DataFrame.to_arrow() + CTable.from_arrow()
# 2. CTable.save() : write in-memory CTable to disk
# 3. CTable.load() : read disk table back into RAM
# 4. CTable → pandas : CTable.to_arrow().to_pandas()
#
# Plus the combined full round-trip (steps 1-4) is shown at the end.
#
# Each measurement is the minimum of NRUNS repetitions to reduce noise.
# Schema: id (int64), score (float64), active (bool), label (string ≤16).

import os
import shutil
from time import perf_counter

import numpy as np
import pandas as pd
import pyarrow as pa

from blosc2 import CTable

# Timing repetitions per measurement; the minimum of the runs is reported.
NRUNS = 3
# Directory under which every benchmark table is persisted.
TABLE_DIR = "saved_ctable/bench_pandas"
# Row counts exercised by each benchmark section.
SIZES = [1_000, 10_000, 100_000, 1_000_000]


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def sep(title: str) -> None:
    """Print *title* framed by horizontal rules (leading blank line)."""
    rule = "─" * 60
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)


def tmin(fn, n: int = NRUNS) -> float:
    """Minimum elapsed time (s) over *n* calls of *fn*.

    Returns ``inf`` when *n* is 0 (no call is made).
    """
    best = float("inf")
    for _ in range(n):
        start = perf_counter()
        fn()
        elapsed = perf_counter() - start
        if elapsed < best:
            best = elapsed
    return best


def clean(path: str = TABLE_DIR) -> None:
    """Reset *path* to an empty directory, removing any previous contents."""
    had_previous = os.path.exists(path)
    if had_previous:
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)


def make_dataframe(n: int) -> pd.DataFrame:
    """Build a deterministic *n*-row frame: id, score, active, label.

    Uses a fixed seed so every call with the same *n* yields identical data.
    """
    rng = np.random.default_rng(42)
    columns = {
        "id": np.arange(n, dtype=np.int64),
        "score": rng.uniform(0, 100, n).astype(np.float64),
        "active": rng.integers(0, 2, n, dtype=bool),
        "label": [f"r{i % 10000:05d}" for i in range(n)],
    }
    return pd.DataFrame(columns)


# ---------------------------------------------------------------------------
# Section 1: pandas → CTable (in-memory)
# ---------------------------------------------------------------------------

sep("1. pandas → CTable (from_arrow, in-memory)")
print(f"{'rows':>12} {'pandas→arrow (s)':>18} {'arrow→ctable (s)':>18} {'total (s)':>12}")
print(f"{'----':>12} {'----------------':>18} {'----------------':>18} {'---------':>12}")

# Converted tables are retained for the save (step 2) and to_pandas (step 4) sections.
ctables: dict[int, CTable] = {}

for N in SIZES:
    frame = make_dataframe(N)

    def to_arrow_only(frame=frame):
        return pa.Table.from_pandas(frame, preserve_index=False)

    def arrow_then_ctable(frame=frame):
        return CTable.from_arrow(pa.Table.from_pandas(frame, preserve_index=False))

    t_pa = tmin(to_arrow_only)
    # NOTE: difference of two independent minima — isolates from_arrow cost,
    # but can under/overshoot slightly with timing noise.
    t_ct = tmin(arrow_then_ctable) - t_pa
    t_tot = t_pa + t_ct

    # Stash one converted table for the later sections.
    ctables[N] = CTable.from_arrow(pa.Table.from_pandas(frame, preserve_index=False))

    print(f"{N:>12,} {t_pa:>18.4f} {t_ct:>18.4f} {t_tot:>12.4f}")


# ---------------------------------------------------------------------------
# Section 2: CTable.save() (in-memory → disk)
# ---------------------------------------------------------------------------


def _fmt_size(nbytes: int) -> str:
    """Human-readable byte count: KB below 1 MiB, MB otherwise."""
    if nbytes < 1024**2:
        return f"{nbytes / 1024:.1f} KB"
    return f"{nbytes / 1024**2:.1f} MB"


sep("2. CTable.save() (in-memory → disk)")
print(f"{'rows':>12} {'save (s)':>14} {'compressed':>12} {'ratio':>8}")
print(f"{'----':>12} {'--------':>14} {'----------':>12} {'-----':>8}")

for N in SIZES:
    t = ctables[N]
    path = os.path.join(TABLE_DIR, f"ct_{N}")

    def bench_save(t=t, path=path):
        # Remove any previous copy so every repetition measures a fresh write.
        if os.path.exists(path):
            shutil.rmtree(path)
        t.save(path, overwrite=True)

    elapsed = tmin(bench_save)
    # Save once more so sections 3 and 5 see a complete on-disk table.
    t.save(path, overwrite=True)
    cbytes = t.cbytes
    nbytes = t.nbytes
    ratio = nbytes / cbytes if cbytes > 0 else float("nan")

    print(f"{N:>12,} {elapsed:>14.4f} {_fmt_size(cbytes):>12} {ratio:>7.2f}x")


# ---------------------------------------------------------------------------
# Section 3: CTable.load() (disk → in-memory)
# ---------------------------------------------------------------------------

sep("3. CTable.load() (disk → in-memory)")
print(f"{'rows':>12} {'load (s)':>14}")
print(f"{'----':>12} {'--------':>14}")

for N in SIZES:
    table_path = os.path.join(TABLE_DIR, f"ct_{N}")
    # Bind the path as a default argument so each timed call loads this table.
    elapsed = tmin(lambda p=table_path: CTable.load(p))
    print(f"{N:>12,} {elapsed:>14.4f}")


# ---------------------------------------------------------------------------
# Section 4: CTable → pandas (to_arrow → to_pandas)
# ---------------------------------------------------------------------------

sep("4. CTable → pandas (to_arrow + to_pandas)")
print(f"{'rows':>12} {'ctable→arrow (s)':>18} {'arrow→pandas (s)':>18} {'total (s)':>12}")
print(f"{'----':>12} {'----------------':>18} {'----------------':>18} {'---------':>12}")

for N in SIZES:
    table = ctables[N]
    # Convert once up front so the two stages can be timed independently.
    arrow_table = table.to_arrow()

    t_arr = tmin(lambda t=table: t.to_arrow())
    t_pd = tmin(lambda at=arrow_table: at.to_pandas())
    t_tot = t_arr + t_pd

    print(f"{N:>12,} {t_arr:>18.4f} {t_pd:>18.4f} {t_tot:>12.4f}")


# ---------------------------------------------------------------------------
# Section 5: Full round-trip (pandas → CTable → disk → load → pandas)
# ---------------------------------------------------------------------------

sep("5. Full round-trip (pandas → CTable → save → load → pandas)")
print(f"{'rows':>12} {'round-trip (s)':>16}")
print(f"{'----':>12} {'---------------':>16}")

for N in SIZES:
    df = make_dataframe(N)
    path = os.path.join(TABLE_DIR, f"rt_{N}")

    def bench_roundtrip(df=df, path=path):
        """One complete pandas → CTable → save → load → pandas cycle."""
        at = pa.Table.from_pandas(df, preserve_index=False)
        t = CTable.from_arrow(at)
        t.save(path, overwrite=True)
        t2 = CTable.load(path)
        return t2.to_arrow().to_pandas()

    elapsed = tmin(bench_roundtrip)
    print(f"{N:>12,} {elapsed:>16.4f}")


# Cleanup: remove the benchmark directory entirely.  (clean() would recreate
# an empty directory after removal, leaving stray state behind.)
if os.path.exists(TABLE_DIR):
    shutil.rmtree(TABLE_DIR)
print()
Loading
Loading