Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
6de7c30
feat: add CTable, a columnar in-memory table built on top of blosc2
Jacc4224 Mar 26, 2026
01e47f4
Merge pull request #604 from Jacc4224/ctable-new
FrancescAlted Mar 26, 2026
c05c2ec
Add a plan for declaring a simple schema for CTable objects
FrancescAlted Mar 26, 2026
725c28b
Add pydantic as a new dependency
FrancescAlted Mar 26, 2026
0efd450
Fix small formatting issues
FrancescAlted Mar 26, 2026
f504ad0
Simplify the plan for ctable schema
FrancescAlted Mar 26, 2026
46bf2e3
Disable wheel generation for each commit in this branch
FrancescAlted Mar 26, 2026
43bf562
Add a new plan on CTable persistence
FrancescAlted Mar 26, 2026
e84f7ac
_
Jacc4224 Mar 26, 2026
8de1870
_
Jacc4224 Mar 26, 2026
a8db18d
Testing
FrancescAlted Mar 26, 2026
dd154b1
Merge branch 'ctable3' of github.com:Blosc/python-blosc2 into my_ctable3
Jacc4224 Mar 26, 2026
ce65607
Written test
Jacc4224 Mar 26, 2026
b623f0e
Remove testing file
FrancescAlted Mar 26, 2026
b9e8c35
Merge branch 'ctable3' of github.com:Blosc/python-blosc2 into my_ctable3
Jacc4224 Mar 26, 2026
4ce8296
Schema layer:
Jacc4224 Apr 4, 2026
ee1d0c4
Persistency halfway done
Jacc4224 Apr 4, 2026
a422d72
CTable: full feature build-out (persistency, aggregates, mutations, …
Jacc4224 Apr 6, 2026
0472b3f
CTable: full feature build-out (persistency, aggregates, mutations, …
Jacc4224 Apr 6, 2026
34f8219
CSV compatibility implementation
Jacc4224 Apr 7, 2026
6bf1ec8
Persistent ctables.
Jacc4224 Apr 7, 2026
34c2eee
Collision bug fixed 1
Jacc4224 Apr 7, 2026
a3852b6
Merge pull request #614 from Jacc4224/my_ctable3
FrancescAlted Apr 8, 2026
14853ac
Remove large data files from repo
FrancescAlted Apr 8, 2026
66e35a4
Restore CI files from main
FrancescAlted Apr 8, 2026
0dc8697
Restore compatibility with numpy < 2
FrancescAlted Apr 8, 2026
457b0ff
Back CTable persistence with TreeStore and materialize it via blosc2.…
FrancescAlted Apr 15, 2026
5fc16b7
Relax DictStore and TreeStore path suffix requirements
FrancescAlted Apr 15, 2026
f7cd02e
Update CTable docs, examples, and benchmarks for TreeStore-backed per…
FrancescAlted Apr 15, 2026
465e855
Move store extension doc to plans/
FrancescAlted Apr 15, 2026
71d3240
Merge branch 'main' into ctable4
FrancescAlted Apr 15, 2026
d3148a1
Accelerate blosc2.open by trying the standard open first
FrancescAlted Apr 15, 2026
41f7a14
Nullable attribute in schema.
Jacc4224 Apr 15, 2026
bc4d4ff
Fix issues when array is a numpy array, not blosc2
FrancescAlted Apr 15, 2026
fcb9efa
For large temp arange arrays, use blosc2.arange instead of np.arange
FrancescAlted Apr 15, 2026
eaccd53
Shaving test suite run time by a little bit
FrancescAlted Apr 15, 2026
04f2577
Merge branch 'ctable4' of github.com:Blosc/python-blosc2 into ctable4
FrancescAlted Apr 15, 2026
2ff3140
Add persistent index support to CTable
FrancescAlted Apr 15, 2026
c44031d
Implement CTable indexing follow-ups
FrancescAlted Apr 15, 2026
fb66107
New InfoReporter for CTable. Example on how to use a .b2z file.
FrancescAlted Apr 15, 2026
2cb4295
New Column.__repr__() for a nice overview of the column
FrancescAlted Apr 15, 2026
efbba8a
Warn on implicit blosc2.open append mode
FrancescAlted Apr 15, 2026
f8021ae
Fancier CTable.info printed representation
FrancescAlted Apr 15, 2026
17f12c6
Add a TODO for removing FutureWarning path once blosc2.open() default…
FrancescAlted Apr 15, 2026
8dddc8c
Fix a regression when reopening a persisted Proxy with mode='r'
FrancescAlted Apr 15, 2026
6e75a47
Merge pull request #620 from Blosc/ctable-indexing
FrancescAlted Apr 15, 2026
034c0bf
Fix .b2z double-open corruption caused by GC-triggered repacking
FrancescAlted Apr 16, 2026
e58b4c7
Temporarily unpacking a .b2z file defaults now to the same dir as the…
FrancescAlted Apr 16, 2026
2b3eeff
Fix nullable validation, chunk sizing, print alignment, numpy mask su…
Jacc4224 Apr 16, 2026
8496c11
Merge branch 'ctable4' into my_ctable3
Jacc4224 Apr 16, 2026
4f509cf
Merge pull request #619 from Jacc4224/my_ctable3
FrancescAlted Apr 16, 2026
8d4603b
Fix some issues in tests
FrancescAlted Apr 16, 2026
e5e75d9
Merge branch 'main' into ctable4
FrancescAlted Apr 16, 2026
3fafab1
Fix GC-induced thread hang on macOS with Python 3.14
FrancescAlted Apr 16, 2026
7defba4
Break reference cycles and harden async chunk reader
FrancescAlted Apr 16, 2026
115985e
Break reference cycles and purge stale caches in ctable layer
FrancescAlted Apr 16, 2026
468177b
Fix for a re-entrant cache-cleanup bug in indexing code
FrancescAlted Apr 17, 2026
2fbd41a
Make test deterministic and closer to what it’s actually trying to ve…
FrancescAlted Apr 17, 2026
485ac50
Fix ruff warnings
FrancescAlted Apr 17, 2026
b407f92
Merge branch 'main' into ctable4
FrancescAlted Apr 17, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 117 additions & 0 deletions bench/ctable/bench_append_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark: append() overhead introduced by the new schema pipeline
#
# The new append() path routes every row through:
# _normalize_row_input → validate_row (Pydantic) → _coerce_row_to_storage
#
# This benchmark isolates how much each step costs, and shows the
# total overhead vs the raw NDArray write speed.

from dataclasses import dataclass
from time import perf_counter

import numpy as np

import blosc2
from blosc2.schema_compiler import compile_schema
from blosc2.schema_validation import build_validator_model, validate_row


@dataclass
class Row:
    """Benchmark schema: three constrained columns validated on append().

    NOTE(review): `blosc2.field`/`int64`/`float64`/`bool` are project schema
    constructors; the `ge`/`le` bounds presumably become Pydantic validators —
    confirm against blosc2.schema_compiler.
    """

    # Non-negative 64-bit integer key (no default: must be supplied).
    id: int = blosc2.field(blosc2.int64(ge=0))
    # Score constrained to the inclusive range [0, 100].
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    # Boolean flag; defaults to True when omitted.
    active: bool = blosc2.field(blosc2.bool(), default=True)


# Deterministic synthetic input: N rows of (id, score, active) tuples.
N = 5_000
rng = np.random.default_rng(42)
data = [(int(i), float(rng.uniform(0, 100)), bool(i % 2)) for i in range(N)]
schema = compile_schema(Row)
build_validator_model(schema)  # warm up the Pydantic model cache

print(f"append() pipeline cost breakdown | N = {N:,} rows")
print("=" * 60)

# ── 1. Raw NDArray writes (no CTable overhead at all) ────────────────────────
# Pre-allocated per-column buffers mimicking the columnar storage layout.
ids = np.zeros(N, dtype=np.int64)
scores = np.zeros(N, dtype=np.float64)
flags = np.zeros(N, dtype=np.bool_)
mask = np.zeros(N, dtype=np.bool_)

start = perf_counter()
for idx, row in enumerate(data):
    # Element-wise writes on purpose: this is the per-row baseline.
    ids[idx], scores[idx], flags[idx] = row
    mask[idx] = True
t_raw = perf_counter() - start
print(f"{'Raw NumPy writes (baseline)':<40} {t_raw:.4f} s")

# ── 2. _normalize_row_input only ─────────────────────────────────────────────
# Time just the input-normalization step; validation disabled on the table.
t_obj = blosc2.CTable(Row, expected_size=N, validate=False)
start = perf_counter()
for entry in data:
    t_obj._normalize_row_input(entry)
t_normalize = perf_counter() - start
print(f"{'_normalize_row_input only':<40} {t_normalize:.4f} s ({t_normalize/t_raw:.1f}x baseline)")

# ── 3. Pydantic validate_row only ────────────────────────────────────────────
# Normalize everything up front so only the Pydantic step is on the clock.
row_dicts = [t_obj._normalize_row_input(entry) for entry in data]
start = perf_counter()
for normalized in row_dicts:
    validate_row(schema, normalized)
t_validate = perf_counter() - start
print(f"{'validate_row (Pydantic) only':<40} {t_validate:.4f} s ({t_validate/t_raw:.1f}x baseline)")

# ── 4. _coerce_row_to_storage only ───────────────────────────────────────────
# Reuses the pre-normalized dicts from step 3.
start = perf_counter()
for normalized in row_dicts:
    t_obj._coerce_row_to_storage(normalized)
t_coerce = perf_counter() - start
print(f"{'_coerce_row_to_storage only':<40} {t_coerce:.4f} s ({t_coerce/t_raw:.1f}x baseline)")

# ── 5. Full append(), validate=False (3 runs, take minimum) ─────────────────
RUNS = 3


def _timed_append(validate):
    """One full append() pass over *data* on a fresh table; returns seconds."""
    table = blosc2.CTable(Row, expected_size=N, validate=validate)
    start = perf_counter()
    for entry in data:
        table.append(entry)
    return perf_counter() - start


t_append_off = min(_timed_append(False) for _ in range(RUNS))
print(f"{'Full append(), validate=False':<40} {t_append_off:.4f} s ({t_append_off/t_raw:.1f}x baseline)")

# ── 6. Full append(), validate=True (3 runs, take minimum) ──────────────────
# Same pass as step 5 but with the Pydantic pipeline enabled.
t_append_on = float("inf")
for _ in range(RUNS):
    table = blosc2.CTable(Row, expected_size=N, validate=True)
    start = perf_counter()
    for entry in data:
        table.append(entry)
    t_append_on = min(t_append_on, perf_counter() - start)
print(f"{'Full append(), validate=True':<40} {t_append_on:.4f} s ({t_append_on/t_raw:.1f}x baseline)")

# ── Summary ──────────────────────────────────────────────────────────────────
print()
print("=" * 60)
# Validated minus non-validated append time; clamp tiny negative timing noise.
overhead = max(t_append_on - t_append_off, 0.0)
print(f"{'Pydantic overhead in append()':<40} {overhead:.4f} s")
if t_append_on > 0:
    print(f"{'Validation fraction of total':<40} {overhead/t_append_on*100:.1f}%")
print(f"{'Per-row Pydantic cost (isolated)':<40} {(t_validate/N)*1e6:.2f} µs/row")
print()
print(f"Note: append() is dominated by blosc2 I/O ({t_append_off/t_raw:.0f}x raw numpy),")
print("      not by the validation pipeline.")
print("      The main bottleneck is the last_true_pos backward scan per row.")
209 changes: 209 additions & 0 deletions bench/ctable/bench_pandas_roundtrip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#######################################################################
# Copyright (c) 2019-present, Blosc Development Team <blosc@blosc.org>
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark: pandas ↔ CTable round-trip (with on-disk persistence)
#
# Pipeline measured in four isolated steps:
#
# 1. pandas → CTable : DataFrame.to_arrow() + CTable.from_arrow()
# 2. CTable.save() : write in-memory CTable to disk
# 3. CTable.load() : read disk table back into RAM
# 4. CTable → pandas : CTable.to_arrow().to_pandas()
#
# Plus the combined full round-trip (steps 1-4) is shown at the end.
#
# Each measurement is the minimum of NRUNS repetitions to reduce noise.
# Schema: id (int64), score (float64), active (bool), label (string ≤16).

import os
import shutil
from time import perf_counter

import numpy as np
import pandas as pd
import pyarrow as pa

from blosc2 import CTable

# Timing repetitions per measurement; the minimum of the runs is reported.
NRUNS = 3
# Directory under which every benchmark table is persisted.
TABLE_DIR = "saved_ctable/bench_pandas"
# Row counts exercised by each benchmark section.
SIZES = [1_000, 10_000, 100_000, 1_000_000]


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def sep(title: str) -> None:
    """Print *title* framed by horizontal rules (leading blank line)."""
    rule = "─" * 60
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)


def tmin(fn, n: int = NRUNS) -> float:
    """Minimum elapsed time (s) over *n* calls of *fn*.

    Returns ``inf`` when *n* is 0 (no call is made).
    """
    best = float("inf")
    for _ in range(n):
        start = perf_counter()
        fn()
        elapsed = perf_counter() - start
        if elapsed < best:
            best = elapsed
    return best


def clean(path: str = TABLE_DIR) -> None:
    """Reset *path* to an empty directory, removing any previous contents."""
    had_previous = os.path.exists(path)
    if had_previous:
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)


def make_dataframe(n: int) -> pd.DataFrame:
    """Build a deterministic *n*-row frame: id, score, active, label.

    Uses a fixed seed so every call with the same *n* yields identical data.
    """
    rng = np.random.default_rng(42)
    columns = {
        "id": np.arange(n, dtype=np.int64),
        "score": rng.uniform(0, 100, n).astype(np.float64),
        "active": rng.integers(0, 2, n, dtype=bool),
        "label": [f"r{i % 10000:05d}" for i in range(n)],
    }
    return pd.DataFrame(columns)


# ---------------------------------------------------------------------------
# Section 1: pandas → CTable (in-memory)
# ---------------------------------------------------------------------------

sep("1. pandas → CTable (from_arrow, in-memory)")
print(f"{'rows':>12} {'pandas→arrow (s)':>18} {'arrow→ctable (s)':>18} {'total (s)':>12}")
print(f"{'----':>12} {'----------------':>18} {'----------------':>18} {'---------':>12}")

# Converted tables are retained for the save (step 2) and to_pandas (step 4) sections.
ctables: dict[int, CTable] = {}

for N in SIZES:
    frame = make_dataframe(N)

    def to_arrow_only(frame=frame):
        return pa.Table.from_pandas(frame, preserve_index=False)

    def arrow_then_ctable(frame=frame):
        return CTable.from_arrow(pa.Table.from_pandas(frame, preserve_index=False))

    t_pa = tmin(to_arrow_only)
    # NOTE: difference of two independent minima — isolates from_arrow cost,
    # but can under/overshoot slightly with timing noise.
    t_ct = tmin(arrow_then_ctable) - t_pa
    t_tot = t_pa + t_ct

    # Stash one converted table for the later sections.
    ctables[N] = CTable.from_arrow(pa.Table.from_pandas(frame, preserve_index=False))

    print(f"{N:>12,} {t_pa:>18.4f} {t_ct:>18.4f} {t_tot:>12.4f}")


# ---------------------------------------------------------------------------
# Section 2: CTable.save() (in-memory → disk)
# ---------------------------------------------------------------------------


def _fmt_size(nbytes: int) -> str:
    """Human-readable byte count: KB below 1 MiB, MB otherwise."""
    if nbytes < 1024**2:
        return f"{nbytes / 1024:.1f} KB"
    return f"{nbytes / 1024**2:.1f} MB"


sep("2. CTable.save() (in-memory → disk)")
print(f"{'rows':>12} {'save (s)':>14} {'compressed':>12} {'ratio':>8}")
print(f"{'----':>12} {'--------':>14} {'----------':>12} {'-----':>8}")

for N in SIZES:
    t = ctables[N]
    path = os.path.join(TABLE_DIR, f"ct_{N}")

    def bench_save(t=t, path=path):
        # Remove any previous copy so every repetition measures a fresh write.
        if os.path.exists(path):
            shutil.rmtree(path)
        t.save(path, overwrite=True)

    elapsed = tmin(bench_save)
    # Save once more so sections 3 and 5 see a complete on-disk table.
    t.save(path, overwrite=True)
    cbytes = t.cbytes
    nbytes = t.nbytes
    ratio = nbytes / cbytes if cbytes > 0 else float("nan")

    print(f"{N:>12,} {elapsed:>14.4f} {_fmt_size(cbytes):>12} {ratio:>7.2f}x")


# ---------------------------------------------------------------------------
# Section 3: CTable.load() (disk → in-memory)
# ---------------------------------------------------------------------------

sep("3. CTable.load() (disk → in-memory)")
print(f"{'rows':>12} {'load (s)':>14}")
print(f"{'----':>12} {'--------':>14}")

for N in SIZES:
    table_path = os.path.join(TABLE_DIR, f"ct_{N}")
    # Bind the path as a default argument so each timed call loads this table.
    elapsed = tmin(lambda p=table_path: CTable.load(p))
    print(f"{N:>12,} {elapsed:>14.4f}")


# ---------------------------------------------------------------------------
# Section 4: CTable → pandas (to_arrow → to_pandas)
# ---------------------------------------------------------------------------

sep("4. CTable → pandas (to_arrow + to_pandas)")
print(f"{'rows':>12} {'ctable→arrow (s)':>18} {'arrow→pandas (s)':>18} {'total (s)':>12}")
print(f"{'----':>12} {'----------------':>18} {'----------------':>18} {'---------':>12}")

for N in SIZES:
    table = ctables[N]
    # Convert once up front so the two stages can be timed independently.
    arrow_table = table.to_arrow()

    t_arr = tmin(lambda t=table: t.to_arrow())
    t_pd = tmin(lambda at=arrow_table: at.to_pandas())
    t_tot = t_arr + t_pd

    print(f"{N:>12,} {t_arr:>18.4f} {t_pd:>18.4f} {t_tot:>12.4f}")


# ---------------------------------------------------------------------------
# Section 5: Full round-trip (pandas → CTable → disk → load → pandas)
# ---------------------------------------------------------------------------

sep("5. Full round-trip (pandas → CTable → save → load → pandas)")
print(f"{'rows':>12} {'round-trip (s)':>16}")
print(f"{'----':>12} {'---------------':>16}")

for N in SIZES:
    df = make_dataframe(N)
    path = os.path.join(TABLE_DIR, f"rt_{N}")

    def bench_roundtrip(df=df, path=path):
        """One complete pandas → CTable → save → load → pandas cycle."""
        at = pa.Table.from_pandas(df, preserve_index=False)
        t = CTable.from_arrow(at)
        t.save(path, overwrite=True)
        t2 = CTable.load(path)
        return t2.to_arrow().to_pandas()

    elapsed = tmin(bench_roundtrip)
    print(f"{N:>12,} {elapsed:>16.4f}")


# Cleanup: remove the benchmark directory entirely.  (clean() would recreate
# an empty directory after removal, leaving stray state behind.)
if os.path.exists(TABLE_DIR):
    shutil.rmtree(TABLE_DIR)
print()
Loading
Loading